diff --git a/.github/workflows/fpm-deployment.yml b/.github/workflows/fpm-deployment.yml
index 137fdd960..de19693f2 100644
--- a/.github/workflows/fpm-deployment.yml
+++ b/.github/workflows/fpm-deployment.yml
@@ -31,9 +31,9 @@ jobs:
         --slave /usr/bin/gcov gcov /usr/bin/gcov-${{ env.GCC_V }}
 
     - name: Install fpm latest release
-      uses: fortran-lang/setup-fpm@v3
+      uses: fortran-lang/setup-fpm@v5
       with:
-        fpm-version: 'v0.4.0'
+        fpm-version: 'v0.10.0'
 
     - name: Run fpm test ⚙
       run: |
diff --git a/ci/fpm.toml b/ci/fpm.toml
index e85a54f69..6a0509007 100644
--- a/ci/fpm.toml
+++ b/ci/fpm.toml
@@ -8,3 +8,6 @@ copyright = "2019-2021 stdlib contributors"
 [dev-dependencies]
 test-drive.git = "https://github.com/fortran-lang/test-drive"
 test-drive.tag = "v0.4.0"
+
+[preprocess]
+[preprocess.cpp]
diff --git a/cmake/stdlib.cmake b/cmake/stdlib.cmake
index 9a1fec059..bdaf87d1f 100644
--- a/cmake/stdlib.cmake
+++ b/cmake/stdlib.cmake
@@ -40,3 +40,10 @@ function (fypp_f90 fyppopts fyppfiles f90files)
   preprocess("${FYPP}" "${fyppopts}" "fypp" "f90" "${fyppfiles}" _f90files)
   set(${f90files} ${_f90files} PARENT_SCOPE)
 endfunction()
+
+# For fortran sources that contain C preprocessor flags: create ".F90" files 
+function (fypp_f90pp fyppopts fyppfiles F90files)
+  preprocess("${FYPP}" "${fyppopts}" "fypp" "F90" "${fyppfiles}" _F90files)
+  set(${F90files} ${_F90files} PARENT_SCOPE)
+endfunction()
+
diff --git a/doc/specs/stdlib_linalg.md b/doc/specs/stdlib_linalg.md
index 671cfee2f..912cd4828 100644
--- a/doc/specs/stdlib_linalg.md
+++ b/doc/specs/stdlib_linalg.md
@@ -6,6 +6,176 @@ title: linalg
 
 [TOC]
 
+The `stdlib` linear algebra library provides high-level APIs for dealing with common linear algebra operations.
+
+## BLAS and LAPACK
+
+### Status
+
+Experimental
+
+### Description
+
+`BLAS` and `LAPACK` backends provide efficient low level implementations of many linear algebra algorithms, and are employed for non-trivial operators. 
+A Modern Fortran version of the [Reference-LAPACK 3.10.1](http://github.com/reference-LAPACK) implementation is provided as a backend. 
+Modern Fortran modules with full explicit typing features are provided after an 
+[automated conversion](https://github.com/perazz/fortran-lapack/blob/main/scripts/modularize_blas.py) 
+of the legacy codes: 
+- [stdlib_linalg_blas(module)], [stdlib_linalg_lapack(module)] provide kind-agnostic interfaces to all functions.
+- Both libraries are available for 32- (`sp`), 64- (`dp`) and 128-bit (`qp`) `real` and `complex` numbers (the latter if available in the current build)
+- Free format, lower-case style
+- `implicit none(type, external)` applied to all procedures and modules
+- `intent` added and all `pure` procedures where possible
+- `stdlib` provides all procedures in two different flavors: (a) original BLAS/LAPACK names with a prefix `stdlib_?<name>` (ex: `stdlib_dgemv`, `stdlib_sgemv`); (b) A generic, kind agnostic `<name>`, i.e. `gemv`. 
+- F77-style `parameter`s removed, and all numeric constants have been generalized with KIND-dependent Fortran intrinsics. 
+- preprocessor-based OpenMP directives retained.
+The single-source module structure hopefully allows for cross-procedural inlining which is otherwise impossible without link-time optimization.
+
+When available, highly optimized libraries that take advantage of specialized processor instructions should be preferred over the `stdlib` implementation. 
+Examples of such libraries are: OpenBLAS, MKL (TM), Accelerate, and ATLAS. In order to enable their usage, simply ensure that the following pre-processor macros are defined: 
+
+- `STDLIB_EXTERNAL_BLAS`   wraps all BLAS procedures (except for the 128-bit ones) to an external library
+- `STDLIB_EXTERNAL_LAPACK` wraps all LAPACK procedures (except for the 128-bit ones) to an external library
+
+These can be enabled during the build process. For example, with CMake, one can enable these preprocessor directives using `add_compile_definitions(STDLIB_EXTERNAL_BLAS STDLIB_EXTERNAL_LAPACK)`.
+The same is possible from the `fpm` branch, where the `cpp` preprocessor is enabled by default. For example, the macros can be added to the project's manifest:
+
+```toml
+[dependencies]
+stdlib="*"
+
+# Macros are only needed if using an external library
+[preprocess]
+[preprocess.cpp]
+macros = ["STDLIB_EXTERNAL_BLAS", "STDLIB_EXTERNAL_LAPACK"]
+```
+
+or directly via compiler flags: 
+
+`fpm build --flag "-DSTDLIB_EXTERNAL_BLAS -DSTDLIB_EXTERNAL_LAPACK -lblas -llapack"`.
+
+### Syntax 
+
+All procedures in the `BLAS` and `LAPACK` backends follow the standard interfaces from the 
+[Reference LAPACK](https://www.netlib.org/lapack/). So, the online [Users Guide](https://www.netlib.org/lapack/explore-html/)
+should be consulted for the full API and descriptions of procedure arguments and their usage. 
+
+The `stdlib` implementation makes both kind-agnostic and specific procedure interfaces available via modules
+[stdlib_linalg_blas(module)] and [stdlib_linalg_lapack(module)]. Because all procedures start with a letter 
+[that indicates the base datatype](https://www.netlib.org/lapack/lug/node24.html), the `stdlib` generic
+interface drops the heading letter and contains all kind-dependent implementations. For example, the generic 
+interface to the `axpy` function looks like: 
+
+```fortran  
+!> AXPY: constant times a vector plus a vector.
+interface axpy
+    module procedure stdlib_saxpy
+    module procedure stdlib_daxpy
+    module procedure stdlib_qaxpy
+    module procedure stdlib_caxpy
+    module procedure stdlib_zaxpy
+    module procedure stdlib_waxpy
+end interface axpy
+```
+
+The generic interface is the endpoint for using an external library. Whenever the latter is used, references
+to the internal `module procedure`s are replaced with interfaces to the external library, 
+for example: 
+
+```fortran  
+!> AXPY: constant times a vector plus a vector.
+interface axpy
+    pure subroutine caxpy(n,ca,cx,incx,cy,incy)
+        import sp,dp,qp,ilp,lk 
+        implicit none(type,external) 
+        complex(sp), intent(in) :: ca,cx(*)
+        integer(ilp), intent(in) :: incx,incy,n
+        complex(sp), intent(inout) :: cy(*)
+    end subroutine caxpy
+    ! [....]
+    module procedure stdlib_qaxpy
+end interface axpy
+```
+
+Note that the 128-bit functions are only provided by `stdlib` and always point to the internal implementation. 
+Because 128-bit precision is identified as [stdlib_kinds(module):qp], initials for 128-bit procedures were 
+labelled as `q` (quadruple-precision reals) and `w` ("wide" or quadruple-precision complex numbers). 
+Extended precision ([stdlib_kinds(module):xdp]) calculations are currently not supported.
+
+### Example
+
+```fortran
+{!example/linalg/example_blas_gemv.f90!}
+```
+
+```fortran
+{!example/linalg/example_lapack_getrf.f90!}
+```
+
+### Licensing
+
+The Fortran Standard Library is distributed under the MIT License. `LAPACK` and its contained `BLAS` are a 
+freely-available software package. They are available from [netlib](https://www.netlib.org/lapack/) via anonymous 
+ftp and the World Wide Web. Thus, they can be included in commercial software packages (and have been). 
+The license used for the `BLAS` and `LAPACK` backends is the [modified BSD license](https://www.netlib.org/lapack/LICENSE.txt).
+
+The header of the `LICENSE.txt` file has as its licensing requirements:
+
+    Copyright (c) 1992-2013 The University of Tennessee and The University
+                            of Tennessee Research Foundation.  All rights
+                            reserved.
+    Copyright (c) 2000-2013 The University of California Berkeley. All
+                            rights reserved.
+    Copyright (c) 2006-2013 The University of Colorado Denver.  All rights
+                            reserved.
+
+    $COPYRIGHT$
+
+    Additional copyrights may follow
+
+    $HEADER$
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are
+    met:
+
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer listed
+      in this license in the documentation and/or other materials
+      provided with the distribution.
+
+    - Neither the name of the copyright holders nor the names of its
+      contributors may be used to endorse or promote products derived from
+      this software without specific prior written permission.
+
+    The copyright holders provide no reassurances that the source code
+    provided does not infringe any patent, copyright, or any other
+    intellectual property rights of third parties.  The copyright holders
+    disclaim any liability to any recipient for claims brought against
+    recipient by any third party for infringement of that parties
+    intellectual property rights.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+So the license for the `LICENSE.txt` code is compatible with the use of
+modified versions of the code in the Fortran Standard Library under the MIT license.    
+Credit for the `BLAS`, `LAPACK` libraries should be given to the [LAPACK authors](https://www.netlib.org/lapack/contributor-list.html).
+According to the original license, we also changed the name of the routines and commented the changes made 
+to the original.
+    
 ## `diag` - Create a diagonal array or extract the diagonal elements of an array
 
 ### Status
diff --git a/example/linalg/example_blas_gemv.f90 b/example/linalg/example_blas_gemv.f90
new file mode 100644
index 000000000..e5d0e0799
--- /dev/null
+++ b/example/linalg/example_blas_gemv.f90
@@ -0,0 +1,14 @@
+program example_gemv
+  use stdlib_linalg, only: eye
+  use stdlib_linalg_blas, only: sp,gemv
+  implicit none(type,external)
+  real(sp) :: A(2, 2), B(2)
+  B = [1.0,2.0]
+  A = eye(2)
+  
+  ! Use legacy BLAS interface 
+  call gemv('No transpose',m=size(A,1),n=size(A,2),alpha=1.0,a=A,lda=size(A,1),x=B,incx=1,beta=0.0,y=B,incy=1)
+
+  print *, B ! returns 1.0 2.0
+
+end program example_gemv
diff --git a/example/linalg/example_lapack_getrf.f90 b/example/linalg/example_lapack_getrf.f90
new file mode 100644
index 000000000..221d83a71
--- /dev/null
+++ b/example/linalg/example_lapack_getrf.f90
@@ -0,0 +1,14 @@
+program example_getrf
+  use stdlib_linalg, only: eye
+  use stdlib_linalg_lapack, only: dp,ilp,getrf
+  implicit none(type,external)
+  real(dp) :: A(3, 3)
+  integer(ilp) :: ipiv(3),info
+  
+  A = eye(3)
+  
+  ! LAPACK matrix factorization interface (overwrite result)
+  call getrf(size(A,1),size(A,2),A,size(A,1),ipiv,info)
+  print *, info ! info==0: Success!
+
+end program example_getrf
diff --git a/include/common.fypp b/include/common.fypp
index 52ca02941..1683239e4 100644
--- a/include/common.fypp
+++ b/include/common.fypp
@@ -87,7 +87,6 @@
 #! Whether Fortran 90 compatible code should be generated
 #:set VERSION90 = defined('VERSION90')
 
-
 #! Ranks to be generated when templates are created
 #:if not defined('MAXRANK')
   #:if VERSION90
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 89890f4a0..6285ed0e5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -64,8 +64,29 @@ set(fppFiles
     stdlib_version.fypp
 )
 
+# Preprocessed files to contain preprocessor directives -> .F90 
+set(cppFiles
+    stdlib_linalg_constants.fypp
+    stdlib_linalg_blas.fypp
+    stdlib_linalg_blas_aux.fypp
+    stdlib_linalg_blas_s.fypp
+    stdlib_linalg_blas_d.fypp
+    stdlib_linalg_blas_q.fypp
+    stdlib_linalg_blas_c.fypp
+    stdlib_linalg_blas_z.fypp
+    stdlib_linalg_blas_w.fypp
+    stdlib_linalg_lapack.fypp
+    stdlib_linalg_lapack_aux.fypp
+    stdlib_linalg_lapack_s.fypp
+    stdlib_linalg_lapack_d.fypp
+    stdlib_linalg_lapack_q.fypp
+    stdlib_linalg_lapack_c.fypp
+    stdlib_linalg_lapack_z.fypp
+    stdlib_linalg_lapack_w.fypp
+)
 
 fypp_f90("${fyppFlags}" "${fppFiles}" outFiles)
+fypp_f90pp("${fyppFlags}" "${cppFiles}" outPreprocFiles)
 
 set(SRC
     stdlib_ansi.f90
@@ -85,6 +106,7 @@ set(SRC
     stdlib_quadrature_gauss.f90
     stdlib_stringlist_type.f90
     ${outFiles}
+    ${outPreprocFiles}
 )
 
 add_library(${PROJECT_NAME} ${SRC})
diff --git a/src/stdlib_linalg_blas.fypp b/src/stdlib_linalg_blas.fypp
new file mode 100644
index 000000000..8c5bdb60e
--- /dev/null
+++ b/src/stdlib_linalg_blas.fypp
@@ -0,0 +1,2242 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     use stdlib_linalg_blas_d
+#:if WITH_QP 
+     use stdlib_linalg_blas_q
+#:endif 
+     use stdlib_linalg_blas_c
+     use stdlib_linalg_blas_z
+#:if WITH_QP 
+     use stdlib_linalg_blas_w
+#:endif 
+     implicit none(type,external)
+     public
+
+          !> AXPY: constant times a vector plus a vector.
+          interface axpy
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine caxpy(n,ca,cx,incx,cy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: ca,cx(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(inout) :: cy(*)
+               end subroutine caxpy
+#else 
+               module procedure stdlib_caxpy
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine daxpy(n,da,dx,incx,dy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: da,dx(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(inout) :: dy(*)
+               end subroutine daxpy
+#else 
+               module procedure stdlib_daxpy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qaxpy
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine saxpy(n,sa,sx,incx,sy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: sa,sx(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(inout) :: sy(*)
+               end subroutine saxpy
+#else 
+               module procedure stdlib_saxpy
+#endif
+#:if WITH_QP
+               module procedure stdlib_waxpy
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zaxpy(n,za,zx,incx,zy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: za,zx(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(inout) :: zy(*)
+               end subroutine zaxpy
+#else 
+               module procedure stdlib_zaxpy
+#endif
+          end interface axpy
+
+
+
+          !> COPY: copies a vector x to a vector y.
+          interface copy
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ccopy(n,cx,incx,cy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(in) :: cx(*)
+                    complex(sp), intent(out) :: cy(*)
+               end subroutine ccopy
+#else 
+               module procedure stdlib_ccopy
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dcopy(n,dx,incx,dy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(in) :: dx(*)
+                    real(dp), intent(out) :: dy(*)
+               end subroutine dcopy
+#else 
+               module procedure stdlib_dcopy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qcopy
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine scopy(n,sx,incx,sy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: sx(*)
+                    real(sp), intent(out) :: sy(*)
+               end subroutine scopy
+#else 
+               module procedure stdlib_scopy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wcopy
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zcopy(n,zx,incx,zy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(in) :: zx(*)
+                    complex(dp), intent(out) :: zy(*)
+               end subroutine zcopy
+#else 
+               module procedure stdlib_zcopy
+#endif
+          end interface copy
+
+
+
+          !> DOT: forms the dot product of two vectors.
+          !> uses unrolled loops for increments equal to one.
+          interface dot
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure real(dp) function ddot(n,dx,incx,dy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(in) :: dx(*),dy(*)
+               end function ddot
+#else 
+               module procedure stdlib_ddot
+#endif
+#:if WITH_QP
+               module procedure stdlib_qdot
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure real(sp) function sdot(n,sx,incx,sy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: sx(*),sy(*)
+               end function sdot
+#else 
+               module procedure stdlib_sdot
+#endif
+          end interface dot
+
+
+
+          !> DOTC: forms the dot product of two complex vectors
+          !> DOTC = X^H * Y
+          interface dotc
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure complex(sp) function cdotc(n,cx,incx,cy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(in) :: cx(*),cy(*)
+               end function cdotc
+#else 
+               module procedure stdlib_cdotc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wdotc
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure complex(dp) function zdotc(n,zx,incx,zy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(in) :: zx(*),zy(*)
+               end function zdotc
+#else 
+               module procedure stdlib_zdotc
+#endif
+          end interface dotc
+
+
+
+          !> DOTU: forms the dot product of two complex vectors
+          !> DOTU = X^T * Y
+          interface dotu
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure complex(sp) function cdotu(n,cx,incx,cy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(in) :: cx(*),cy(*)
+               end function cdotu
+#else 
+               module procedure stdlib_cdotu
+#endif
+#:if WITH_QP
+               module procedure stdlib_wdotu
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure complex(dp) function zdotu(n,zx,incx,zy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(in) :: zx(*),zy(*)
+               end function zdotu
+#else 
+               module procedure stdlib_zdotu
+#endif
+          end interface dotu
+
+
+
+          !> GBMV:  performs one of the matrix-vector operations
+          !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+          !> y := alpha*A**H*x + beta*y,
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+          interface gbmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,kl,ku,lda,m,n
+                    character, intent(in) :: trans
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine cgbmv
+#else 
+               module procedure stdlib_cgbmv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,kl,ku,lda,m,n
+                    character, intent(in) :: trans
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dgbmv
+#else 
+               module procedure stdlib_dgbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,kl,ku,lda,m,n
+                    character, intent(in) :: trans
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sgbmv
+#else 
+               module procedure stdlib_sgbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,kl,ku,lda,m,n
+                    character, intent(in) :: trans
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zgbmv
+#else 
+               module procedure stdlib_zgbmv
+#endif
+          end interface gbmv
+
+
+
+          !> GEMM:  performs one of the matrix-matrix operations
+          !> C := alpha*op( A )*op( B ) + beta*C,
+          !> where  op( X ) is one of
+          !> op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+          !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+          !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+          interface gemm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,m,n
+                    character, intent(in) :: transa,transb
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine cgemm
+#else 
+               module procedure stdlib_cgemm
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,m,n
+                    character, intent(in) :: transa,transb
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dgemm
+#else 
+               module procedure stdlib_dgemm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,m,n
+                    character, intent(in) :: transa,transb
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine sgemm
+#else 
+               module procedure stdlib_sgemm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,m,n
+                    character, intent(in) :: transa,transb
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zgemm
+#else 
+               module procedure stdlib_zgemm
+#endif
+          end interface gemm
+
+
+
+          !> GEMV: performs one of the matrix-vector operations
+          !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+          !> y := alpha*A**H*x + beta*y,
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> m by n matrix.
+          interface gemv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    character, intent(in) :: trans
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine cgemv
+#else 
+               module procedure stdlib_cgemv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    character, intent(in) :: trans
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dgemv
+#else 
+               module procedure stdlib_dgemv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    character, intent(in) :: trans
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sgemv
+#else 
+               module procedure stdlib_sgemv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    character, intent(in) :: trans
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zgemv
+#else 
+               module procedure stdlib_zgemv
+#endif
+          end interface gemv
+
+
+
+          !> GER:   performs the rank 1 operation
+          !> A := alpha*x*y**T + A,
+          !> where alpha is a scalar, x is an m element vector, y is an n element
+          !> vector and A is an m by n matrix.
+          interface ger
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dger(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dger
+#else 
+               module procedure stdlib_dger
+#endif
+#:if WITH_QP
+               module procedure stdlib_qger
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sger(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine sger
+#else 
+               module procedure stdlib_sger
+#endif
+          end interface ger
+
+
+
+          !> GERC:  performs the rank 1 operation
+          !> A := alpha*x*y**H + A,
+          !> where alpha is a scalar, x is an m element vector, y is an n element
+          !> vector and A is an m by n matrix.
+          interface gerc
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cgerc(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cgerc
+#else 
+               module procedure stdlib_cgerc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgerc
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zgerc(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zgerc
+#else 
+               module procedure stdlib_zgerc
+#endif
+          end interface gerc
+
+
+
+          !> GERU:  performs the rank 1 operation
+          !> A := alpha*x*y**T + A,
+          !> where alpha is a scalar, x is an m element vector, y is an n element
+          !> vector and A is an m by n matrix.
+          interface geru
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cgeru(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cgeru
+#else 
+               module procedure stdlib_cgeru
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeru
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zgeru(m,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zgeru
+#else 
+               module procedure stdlib_zgeru
+#endif
+          end interface geru
+
+
+
+          !> HBMV:  performs the matrix-vector  operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n hermitian band matrix, with k super-diagonals.
+          interface hbmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,k,lda,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine chbmv
+#else 
+               module procedure stdlib_chbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,k,lda,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zhbmv
+#else 
+               module procedure stdlib_zhbmv
+#endif
+          end interface hbmv
+
+
+
+          !> HEMM:  performs one of the matrix-matrix operations
+          !> C := alpha*A*B + beta*C,
+          !> or
+          !> C := alpha*B*A + beta*C,
+          !> where alpha and beta are scalars, A is an hermitian matrix and  B and
+          !> C are m by n matrices.
+          interface hemm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chemm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine chemm
+#else 
+               module procedure stdlib_chemm
+#endif
+#:if WITH_QP
+               module procedure stdlib_whemm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhemm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zhemm
+#else 
+               module procedure stdlib_zhemm
+#endif
+          end interface hemm
+
+
+
+          !> HEMV:  performs the matrix-vector  operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n hermitian matrix.
+          interface hemv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chemv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine chemv
+#else 
+               module procedure stdlib_chemv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whemv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhemv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zhemv
+#else 
+               module procedure stdlib_zhemv
+#endif
+          end interface hemv
+
+
+
+          !> HER:   performs the hermitian rank 1 operation
+          !> A := alpha*x*x**H + A,
+          !> where alpha is a real scalar, x is an n element vector and A is an
+          !> n by n hermitian matrix.
+          interface her
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cher(uplo,n,alpha,x,incx,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: x(*)
+               end subroutine cher
+#else 
+               module procedure stdlib_cher
+#endif
+#:if WITH_QP
+               module procedure stdlib_wher
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zher(uplo,n,alpha,x,incx,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: x(*)
+               end subroutine zher
+#else 
+               module procedure stdlib_zher
+#endif
+          end interface her
+
+
+
+          !> HER2:  performs the hermitian rank 2 operation
+          !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+          !> where alpha is a scalar, x and y are n element vectors and A is an n
+          !> by n hermitian matrix.
+          interface her2
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cher2(uplo,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cher2
+#else 
+               module procedure stdlib_cher2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wher2
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zher2(uplo,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zher2
+#else 
+               module procedure stdlib_zher2
+#endif
+          end interface her2
+
+
+
+          !> HER2K:  performs one of the hermitian rank 2k operations
+          !> C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+          !> or
+          !> C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+          !> where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+          !> hermitian matrix and  A and B  are  n by k matrices in the first case
+          !> and  k by n  matrices in the second case.
+          interface her2k
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cher2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,a(lda,*),b(ldb,*)
+                    real(sp), intent(in) :: beta
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine cher2k
+#else 
+               module procedure stdlib_cher2k
+#endif
+#:if WITH_QP
+               module procedure stdlib_wher2k
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zher2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,a(lda,*),b(ldb,*)
+                    real(dp), intent(in) :: beta
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zher2k
+#else 
+               module procedure stdlib_zher2k
+#endif
+          end interface her2k
+
+
+
+          !> HERK:  performs one of the hermitian rank k operations
+          !> C := alpha*A*A**H + beta*C,
+          !> or
+          !> C := alpha*A**H*A + beta*C,
+          !> where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+          !> matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+          !> matrix in the second case.
+          interface herk
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cherk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine cherk
+#else 
+               module procedure stdlib_cherk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wherk
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zherk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zherk
+#else 
+               module procedure stdlib_zherk
+#endif
+          end interface herk
+
+
+
+          !> HPMV:  performs the matrix-vector operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n hermitian matrix, supplied in packed form.
+          interface hpmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chpmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,ap(*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine chpmv
+#else 
+               module procedure stdlib_chpmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhpmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,ap(*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zhpmv
+#else 
+               module procedure stdlib_zhpmv
+#endif
+          end interface hpmv
+
+
+
+          !> HPR:    performs the hermitian rank 1 operation
+          !> A := alpha*x*x**H + A,
+          !> where alpha is a real scalar, x is an n element vector and A is an
+          !> n by n hermitian matrix, supplied in packed form.
+          interface hpr
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chpr(uplo,n,alpha,x,incx,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(in) :: x(*)
+               end subroutine chpr
+#else 
+               module procedure stdlib_chpr
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpr
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhpr(uplo,n,alpha,x,incx,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(in) :: x(*)
+               end subroutine zhpr
+#else 
+               module procedure stdlib_zhpr
+#endif
+          end interface hpr
+
+
+
+          !> HPR2:  performs the hermitian rank 2 operation
+          !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+          !> where alpha is a scalar, x and y are n element vectors and A is an
+          !> n by n hermitian matrix, supplied in packed form.
+          interface hpr2
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine chpr2(uplo,n,alpha,x,incx,y,incy,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine chpr2
+#else 
+               module procedure stdlib_chpr2
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpr2
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zhpr2(uplo,n,alpha,x,incx,y,incy,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zhpr2
+#else 
+               module procedure stdlib_zhpr2
+#endif
+          end interface hpr2
+
+
+
+          !> !
+          !>
+          !> NRM2: returns the euclidean norm of a vector via the function
+          !> name, so that
+          !> NRM2 := sqrt( x'*x )
+          interface nrm2
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure real(dp) function dnrm2( n, x, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(in) :: x(*)
+               end function dnrm2
+#else 
+               module procedure stdlib_dnrm2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qnrm2
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure real(sp) function snrm2( n, x, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(in) :: x(*)
+               end function snrm2
+#else 
+               module procedure stdlib_snrm2
+#endif
+          end interface nrm2
+
+
+
+          !> ROT: applies a plane rotation.
+          interface rot
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine drot(n,dx,incx,dy,incy,c,s)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: c,s
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(inout) :: dx(*),dy(*)
+               end subroutine drot
+#else 
+               module procedure stdlib_drot
+#endif
+#:if WITH_QP
+               module procedure stdlib_qrot
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine srot(n,sx,incx,sy,incy,c,s)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: c,s
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(inout) :: sx(*),sy(*)
+               end subroutine srot
+#else 
+               module procedure stdlib_srot
+#endif
+          end interface rot
+
+
+
+          !> !
+          !>
+          !> The computation uses the formulas
+          !> |x| = sqrt( Re(x)**2 + Im(x)**2 )
+          !> sgn(x) = x / |x|  if x /= 0
+          !> = 1        if x  = 0
+          !> c = |a| / sqrt(|a|**2 + |b|**2)
+          !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
+          !> When a and b are real and r /= 0, the formulas simplify to
+          !> r = sgn(a)*sqrt(|a|**2 + |b|**2)
+          !> c = a / r
+          !> s = b / r
+          !> the same as in SROTG when |a| > |b|.  When |b| >= |a|, the
+          !> sign of c and s will be different from those computed by SROTG
+          !> if the signs of a and b are not the same.
+          interface rotg
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine crotg( a, b, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(out) :: c
+                    complex(sp), intent(inout) :: a
+                    complex(sp), intent(in) :: b
+                    complex(sp), intent(out) :: s
+               end subroutine crotg
+#else 
+               module procedure stdlib_crotg
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine drotg( a, b, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(inout) :: a,b
+                    real(dp), intent(out) :: c,s
+               end subroutine drotg
+#else 
+               module procedure stdlib_drotg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qrotg
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine srotg( a, b, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(inout) :: a,b
+                    real(sp), intent(out) :: c,s
+               end subroutine srotg
+#else 
+               module procedure stdlib_srotg
+#endif
+#:if WITH_QP
+               module procedure stdlib_wrotg
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zrotg( a, b, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(out) :: c
+                    complex(dp), intent(inout) :: a
+                    complex(dp), intent(in) :: b
+                    complex(dp), intent(out) :: s
+               end subroutine zrotg
+#else 
+               module procedure stdlib_zrotg
+#endif
+          end interface rotg
+
+
+
+          !> APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+          !> (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
+          !> (DY**T)
+          !> DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX >= 0, ELSE
+          !> LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
+          !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+          !> DFLAG=-1._dp     DFLAG=0._dp        DFLAG=1._dp     DFLAG=-2.D0
+          !> (DH11  DH12)    (1._dp  DH12)    (DH11  1._dp)    (1._dp  0._dp)
+          !> H=(          )    (          )    (          )    (          )
+          !> (DH21  DH22),   (DH21  1._dp),   (-1._dp DH22),   (0._dp  1._dp).
+          !> SEE ROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
+          interface rotm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine drotm(n,dx,incx,dy,incy,dparam)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(in) :: dparam(5)
+                    real(dp), intent(inout) :: dx(*),dy(*)
+               end subroutine drotm
+#else 
+               module procedure stdlib_drotm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qrotm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine srotm(n,sx,incx,sy,incy,sparam)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: sparam(5)
+                    real(sp), intent(inout) :: sx(*),sy(*)
+               end subroutine srotm
+#else 
+               module procedure stdlib_srotm
+#endif
+          end interface rotm
+
+
+
+          !> CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+          !> THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(DD1)*DX1,SQRT(DD2)    DY2)**T.
+          !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+          !> DFLAG=-1._dp     DFLAG=0._dp        DFLAG=1._dp     DFLAG=-2.D0
+          !> (DH11  DH12)    (1._dp  DH12)    (DH11  1._dp)    (1._dp  0._dp)
+          !> H=(          )    (          )    (          )    (          )
+          !> (DH21  DH22),   (DH21  1._dp),   (-1._dp DH22),   (0._dp  1._dp).
+          !> LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
+          !> RESPECTIVELY. (VALUES OF 1._dp, -1._dp, OR 0._dp IMPLIED BY THE
+          !> VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
+          !> THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+          !> INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+          !> OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+          interface rotmg
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine drotmg(dd1,dd2,dx1,dy1,dparam)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(inout) :: dd1,dd2,dx1
+                    real(dp), intent(in) :: dy1
+                    real(dp), intent(out) :: dparam(5)
+               end subroutine drotmg
+#else 
+               module procedure stdlib_drotmg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qrotmg
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine srotmg(sd1,sd2,sx1,sy1,sparam)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(inout) :: sd1,sd2,sx1
+                    real(sp), intent(in) :: sy1
+                    real(sp), intent(out) :: sparam(5)
+               end subroutine srotmg
+#else 
+               module procedure stdlib_srotmg
+#endif
+          end interface rotmg
+
+
+
+          !> SBMV:  performs the matrix-vector  operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n symmetric band matrix, with k super-diagonals.
+          interface sbmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,k,lda,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dsbmv
+#else 
+               module procedure stdlib_dsbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,k,lda,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: y(*)
+               end subroutine ssbmv
+#else 
+               module procedure stdlib_ssbmv
+#endif
+          end interface sbmv
+
+
+
+          !> SCAL: scales a vector by a constant.
+          interface scal
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cscal(n,ca,cx,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: ca
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(inout) :: cx(*)
+               end subroutine cscal
+#else 
+               module procedure stdlib_cscal
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dscal(n,da,dx,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: da
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(inout) :: dx(*)
+               end subroutine dscal
+#else 
+               module procedure stdlib_dscal
+#endif
+#:if WITH_QP
+               module procedure stdlib_qscal
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sscal(n,sa,sx,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: sa
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(inout) :: sx(*)
+               end subroutine sscal
+#else 
+               module procedure stdlib_sscal
+#endif
+#:if WITH_QP
+               module procedure stdlib_wscal
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zscal(n,za,zx,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: za
+                    integer(ilp), intent(in) :: incx,n
+                    complex(dp), intent(inout) :: zx(*)
+               end subroutine zscal
+#else 
+               module procedure stdlib_zscal
+#endif
+          end interface scal
+
+
+
+          !> Compute the inner product of two vectors with extended
+          !> precision accumulation and result.
+          !> Returns D.P. dot product accumulated in D.P., for S.P. SX and SY
+          !> SDOT: = sum for I = 0 to N-1 of  SX(LX+I*INCX) * SY(LY+I*INCY),
+          !> where LX = 1 if INCX >= 0, else LX = 1+(1-N)*INCX, and LY is
+          !> defined in a similar way using INCY.
+          interface sdot
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure real(dp) function dsdot(n,sx,incx,sy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: sx(*),sy(*)
+               end function dsdot
+#else 
+               module procedure stdlib_dsdot
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsdot
+#:endif
+          end interface sdot
+
+
+
+          !> SPMV:  performs the matrix-vector operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n symmetric matrix, supplied in packed form.
+          interface spmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dspmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,ap(*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dspmv
+#else 
+               module procedure stdlib_dspmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sspmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,ap(*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sspmv
+#else 
+               module procedure stdlib_sspmv
+#endif
+          end interface spmv
+
+
+
+          !> SPR:    performs the symmetric rank 1 operation
+          !> A := alpha*x*x**T + A,
+          !> where alpha is a real scalar, x is an n element vector and A is an
+          !> n by n symmetric matrix, supplied in packed form.
+          interface spr
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dspr(uplo,n,alpha,x,incx,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,x(*)
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dspr
+#else 
+               module procedure stdlib_dspr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspr
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sspr(uplo,n,alpha,x,incx,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,x(*)
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine sspr
+#else 
+               module procedure stdlib_sspr
+#endif
+          end interface spr
+
+
+
+          !> SPR2:  performs the symmetric rank 2 operation
+          !> A := alpha*x*y**T + alpha*y*x**T + A,
+          !> where alpha is a scalar, x and y are n element vectors and A is an
+          !> n by n symmetric matrix, supplied in packed form.
+          interface spr2
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dspr2(uplo,n,alpha,x,incx,y,incy,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dspr2
+#else 
+               module procedure stdlib_dspr2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspr2
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sspr2(uplo,n,alpha,x,incx,y,incy,ap)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine sspr2
+#else 
+               module procedure stdlib_sspr2
+#endif
+          end interface spr2
+
+
+
+          !> SROT: applies a plane rotation, where the cos and sin (c and s) are real
+          !> and the vectors cx and cy are complex.
+          !> jack dongarra, linpack, 3/11/78.
+          interface srot
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine csrot( n, cx, incx, cy, incy, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: c,s
+                    complex(sp), intent(inout) :: cx(*),cy(*)
+               end subroutine csrot
+#else 
+               module procedure stdlib_csrot
+#endif
+          end interface srot
+
+
+
+          !> SSCAL: scales a complex vector by a real constant.
+          interface sscal
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine csscal(n,sa,cx,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: sa
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(inout) :: cx(*)
+               end subroutine csscal
+#else 
+               module procedure stdlib_csscal
+#endif
+          end interface sscal
+
+
+
+          !> SWAP: interchanges two vectors.
+          interface swap
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine cswap(n,cx,incx,cy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(inout) :: cx(*),cy(*)
+               end subroutine cswap
+#else 
+               module procedure stdlib_cswap
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dswap(n,dx,incx,dy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(inout) :: dx(*),dy(*)
+               end subroutine dswap
+#else 
+               module procedure stdlib_dswap
+#endif
+#:if WITH_QP
+               module procedure stdlib_qswap
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine sswap(n,sx,incx,sy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(inout) :: sx(*),sy(*)
+               end subroutine sswap
+#else 
+               module procedure stdlib_sswap
+#endif
+#:if WITH_QP
+               module procedure stdlib_wswap
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zswap(n,zx,incx,zy,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(inout) :: zx(*),zy(*)
+               end subroutine zswap
+#else 
+               module procedure stdlib_zswap
+#endif
+          end interface swap
+
+
+
+          !> SYMM:  performs one of the matrix-matrix operations
+          !> C := alpha*A*B + beta*C,
+          !> or
+          !> C := alpha*B*A + beta*C,
+          !> where  alpha and beta are scalars, A is a symmetric matrix and  B and
+          !> C are m by n matrices.
+          interface symm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine csymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine csymm
+#else 
+               module procedure stdlib_csymm
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dsymm
+#else 
+               module procedure stdlib_dsymm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsymm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine ssymm
+#else 
+               module procedure stdlib_ssymm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsymm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    character, intent(in) :: side,uplo
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zsymm
+#else 
+               module procedure stdlib_zsymm
+#endif
+          end interface symm
+
+
+
+          !> SYMV:  performs the matrix-vector  operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n symmetric matrix.
+          interface symv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsymv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dsymv
+#else 
+               module procedure stdlib_dsymv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsymv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssymv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: y(*)
+               end subroutine ssymv
+#else 
+               module procedure stdlib_ssymv
+#endif
+          end interface symv
+
+
+
+          !> SYR:   performs the symmetric rank 1 operation
+          !> A := alpha*x*x**T + A,
+          !> where alpha is a real scalar, x is an n element vector and A is an
+          !> n by n symmetric matrix.
+          interface syr
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsyr(uplo,n,alpha,x,incx,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,x(*)
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dsyr
+#else 
+               module procedure stdlib_dsyr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyr
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssyr(uplo,n,alpha,x,incx,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,x(*)
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine ssyr
+#else 
+               module procedure stdlib_ssyr
+#endif
+          end interface syr
+
+
+
+          !> SYR2:  performs the symmetric rank 2 operation
+          !> A := alpha*x*y**T + alpha*y*x**T + A,
+          !> where alpha is a scalar, x and y are n element vectors and A is an n
+          !> by n symmetric matrix.
+          interface syr2
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsyr2(uplo,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dsyr2
+#else 
+               module procedure stdlib_dsyr2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyr2
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssyr2(uplo,n,alpha,x,incx,y,incy,a,lda)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,x(*),y(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine ssyr2
+#else 
+               module procedure stdlib_ssyr2
+#endif
+          end interface syr2
+
+
+
+          !> SYR2K:  performs one of the symmetric rank 2k operations
+          !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+          !> or
+          !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+          !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+          !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+          !> matrices in the second case.
+          interface syr2k
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine csyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine csyr2k
+#else 
+               module procedure stdlib_csyr2k
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dsyr2k
+#else 
+               module procedure stdlib_dsyr2k
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyr2k
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine ssyr2k
+#else 
+               module procedure stdlib_ssyr2k
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyr2k
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),b(ldb,*)
+                    integer(ilp), intent(in) :: k,lda,ldb,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zsyr2k
+#else 
+               module procedure stdlib_zsyr2k
+#endif
+          end interface syr2k
+
+
+
+          !> SYRK:  performs one of the symmetric rank k operations
+          !> C := alpha*A*A**T + beta*C,
+          !> or
+          !> C := alpha*A**T*A + beta*C,
+          !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+          !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+          !> in the second case.
+          interface syrk
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine csyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine csyrk
+#else 
+               module procedure stdlib_csyrk
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dsyrk
+#else 
+               module procedure stdlib_dsyrk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyrk
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ssyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine ssyrk
+#else 
+               module procedure stdlib_ssyrk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyrk
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine zsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,ldc,n
+                    character, intent(in) :: trans,uplo
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zsyrk
+#else 
+               module procedure stdlib_zsyrk
+#endif
+          end interface syrk
+
+
+
+          !> TBMV:  performs one of the matrix-vector operations
+          !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+          !> where x is an n element vector and  A is an n by n unit, or non-unit,
+          !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+          interface tbmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctbmv
+#else 
+               module procedure stdlib_ctbmv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtbmv
+#else 
+               module procedure stdlib_dtbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine stbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine stbmv
+#else 
+               module procedure stdlib_stbmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtbmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztbmv
+#else 
+               module procedure stdlib_ztbmv
+#endif
+          end interface tbmv
+
+
+
+          !> TBSV:  solves one of the systems of equations
+          !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+          !> where b and x are n element vectors and A is an n by n unit, or
+          !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+          !> diagonals.
+          !> No test for singularity or near-singularity is included in this
+          !> routine. Such tests must be performed before calling this routine.
+          interface tbsv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctbsv
+#else 
+               module procedure stdlib_ctbsv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtbsv
+#else 
+               module procedure stdlib_dtbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine stbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine stbsv
+#else 
+               module procedure stdlib_stbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztbsv
+#else 
+               module procedure stdlib_ztbsv
+#endif
+          end interface tbsv
+
+
+
+          !> TPMV:  performs one of the matrix-vector operations
+          !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+          !> where x is an n element vector and  A is an n by n unit, or non-unit,
+          !> upper or lower triangular matrix, supplied in packed form.
+          interface tpmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctpmv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctpmv
+#else 
+               module procedure stdlib_ctpmv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtpmv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtpmv
+#else 
+               module procedure stdlib_dtpmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine stpmv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine stpmv
+#else 
+               module procedure stdlib_stpmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztpmv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztpmv
+#else 
+               module procedure stdlib_ztpmv
+#endif
+          end interface tpmv
+
+
+
+          !> TPSV:  solves one of the systems of equations
+          !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+          !> where b and x are n element vectors and A is an n by n unit, or
+          !> non-unit, upper or lower triangular matrix, supplied in packed form.
+          !> No test for singularity or near-singularity is included in this
+          !> routine. Such tests must be performed before calling this routine.
+          interface tpsv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctpsv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctpsv
+#else 
+               module procedure stdlib_ctpsv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtpsv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtpsv
+#else 
+               module procedure stdlib_dtpsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine stpsv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine stpsv
+#else 
+               module procedure stdlib_stpsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztpsv(uplo,trans,diag,n,ap,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztpsv
+#else 
+               module procedure stdlib_ztpsv
+#endif
+          end interface tpsv
+
+
+
+          !> TRMM:  performs one of the matrix-matrix operations
+          !> B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+          !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+          !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+          !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+          interface trmm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine ctrmm
+#else 
+               module procedure stdlib_ctrmm
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dtrmm
+#else 
+               module procedure stdlib_dtrmm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrmm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine strmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine strmm
+#else 
+               module procedure stdlib_strmm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrmm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine ztrmm
+#else 
+               module procedure stdlib_ztrmm
+#endif
+          end interface trmm
+
+
+
+          !> TRMV:  performs one of the matrix-vector operations
+          !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+          !> where x is an n element vector and  A is an n by n unit, or non-unit,
+          !> upper or lower triangular matrix.
+          interface trmv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctrmv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctrmv
+#else 
+               module procedure stdlib_ctrmv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtrmv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtrmv
+#else 
+               module procedure stdlib_dtrmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine strmv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine strmv
+#else 
+               module procedure stdlib_strmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrmv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztrmv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztrmv
+#else 
+               module procedure stdlib_ztrmv
+#endif
+          end interface trmv
+
+
+
+          !> TRSM:  solves one of the matrix equations
+          !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+          !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+          !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+          !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+          !> The matrix X is overwritten on B.
+          interface trsm
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine ctrsm
+#else 
+               module procedure stdlib_ctrsm
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dtrsm
+#else 
+               module procedure stdlib_dtrsm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrsm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine strsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine strsm
+#else 
+               module procedure stdlib_strsm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrsm
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: alpha,a(lda,*)
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    character, intent(in) :: diag,side,transa,uplo
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine ztrsm
+#else 
+               module procedure stdlib_ztrsm
+#endif
+          end interface trsm
+
+
+
+          !> TRSV:  solves one of the systems of equations
+          !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+          !> where b and x are n element vectors and A is an n by n unit, or
+          !> non-unit, upper or lower triangular matrix.
+          !> No test for singularity or near-singularity is included in this
+          !> routine. Such tests must be performed before calling this routine.
+          interface trsv
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ctrsv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine ctrsv
+#else 
+               module procedure stdlib_ctrsv
+#endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine dtrsv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dtrsv
+#else 
+               module procedure stdlib_dtrsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine strsv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine strsv
+#else 
+               module procedure stdlib_strsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrsv
+#:endif
+#ifdef STDLIB_EXTERNAL_BLAS
+               pure subroutine ztrsv(uplo,trans,diag,n,a,lda,x,incx)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,lda,n
+                    character, intent(in) :: diag,trans,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine ztrsv
+#else 
+               module procedure stdlib_ztrsv
+#endif
+          end interface trsv
+
+
+
+
+
+end module stdlib_linalg_blas
diff --git a/src/stdlib_linalg_blas_aux.fypp b/src/stdlib_linalg_blas_aux.fypp
new file mode 100644
index 000000000..20e024487
--- /dev/null
+++ b/src/stdlib_linalg_blas_aux.fypp
@@ -0,0 +1,460 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas_aux
+     use stdlib_linalg_constants
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_dcabs1
+     public :: stdlib_icamax
+     public :: stdlib_idamax
+     public :: stdlib_isamax
+     public :: stdlib_izamax
+     public :: stdlib_lsame
+     public :: stdlib_scabs1
+     public :: stdlib_xerbla
+     public :: stdlib_xerbla_array
+#:if WITH_QP 
+     public :: stdlib_qcabs1
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_iqamax
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_iwamax
+#:endif 
+
+
+     contains
+
+     !> DCABS1: computes |Re(.)| + |Im(.)| of a double complex number
+
+     pure real(dp) function stdlib_dcabs1(z)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: z
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           stdlib_dcabs1 = abs(real(z,KIND=dp)) + abs(aimag(z))
+           return
+     end function stdlib_dcabs1
+
+     !> ISAMAX: finds the index of the first element having maximum absolute value.
+
+     pure integer(ilp) function stdlib_isamax(n,sx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: smax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           stdlib_isamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_isamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              smax = abs(sx(1))
+              do i = 2,n
+                 if (abs(sx(i))>smax) then
+                    stdlib_isamax = i
+                    smax = abs(sx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              smax = abs(sx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(sx(ix))>smax) then
+                    stdlib_isamax = i
+                    smax = abs(sx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_isamax
+
+     !> IZAMAX: finds the index of the first element having maximum |Re(.)| + |Im(.)|
+
+     pure integer(ilp) function stdlib_izamax(n,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dmax
+           integer(ilp) :: i, ix
+           stdlib_izamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_izamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = stdlib_dcabs1(zx(1))
+              do i = 2,n
+                 if (stdlib_dcabs1(zx(i))>dmax) then
+                    stdlib_izamax = i
+                    dmax = stdlib_dcabs1(zx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = stdlib_dcabs1(zx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (stdlib_dcabs1(zx(ix))>dmax) then
+                    stdlib_izamax = i
+                    dmax = stdlib_dcabs1(zx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_izamax
+
+     !> LSAME: returns .TRUE. if CA is the same letter as CB regardless of
+     !> case.
+
+     pure logical(lk) function stdlib_lsame(ca,cb)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ca, cb
+       ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: ichar
+           ! Local Scalars 
+           integer(ilp) :: inta, intb, zcode
+           ! test if the characters are equal
+           stdlib_lsame = ca == cb
+           if (stdlib_lsame) return
+           ! now test for equivalence if both characters are alphabetic.
+           zcode = ichar('Z')
+           ! use 'z' rather than 'a' so that ascii can be detected on prime
+           ! machines, on which ichar returns a value with bit 8 set.
+           ! ichar('a') on prime machines returns 193 which is the same as
+           ! ichar('a') on an ebcdic machine.
+           inta = ichar(ca)
+           intb = ichar(cb)
+           if (zcode==90 .or. zcode==122) then
+              ! ascii is assumed - zcode is the ascii code of either lower or
+              ! upper case 'z'.
+               if (inta>=97 .and. inta<=122) inta = inta - 32
+               if (intb>=97 .and. intb<=122) intb = intb - 32
+           else if (zcode==233 .or. zcode==169) then
+              ! ebcdic is assumed - zcode is the ebcdic code of either lower or
+              ! upper case 'z'.
+               if (inta>=129 .and. inta<=137 .or.inta>=145 .and. inta<=153 .or.inta>=162 .and. &
+                         inta<=169) inta = inta + 64
+               if (intb>=129 .and. intb<=137 .or.intb>=145 .and. intb<=153 .or.intb>=162 .and. &
+                         intb<=169) intb = intb + 64
+           else if (zcode==218 .or. zcode==250) then
+              ! ascii is assumed, on prime machines - zcode is the ascii code
+              ! plus 128 of either lower or upper case 'z'.
+               if (inta>=225 .and. inta<=250) inta = inta - 32
+               if (intb>=225 .and. intb<=250) intb = intb - 32
+           end if
+           stdlib_lsame = inta == intb
+           ! return
+     end function stdlib_lsame
+
+     !> SCABS1: computes |Re(.)| + |Im(.)| of a complex number
+
+     pure real(sp) function stdlib_scabs1(z)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: z
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           stdlib_scabs1 = abs(real(z,KIND=sp)) + abs(aimag(z))
+           return
+     end function stdlib_scabs1
+
+     !> XERBLA:  is an error handler for the LAPACK routines.
+     !> It is called by an LAPACK routine if an input parameter has an
+     !> invalid value.  A message is printed and execution stops.
+     !> Installers may consider modifying the STOP statement in order to
+     !> call system-specific exception-handling facilities.
+
+     pure subroutine stdlib_xerbla( srname, info )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character(len=*), intent(in) :: srname
+           integer(ilp), intent(in) :: info
+       ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: len_trim
+           ! Executable Statements 
+      9999 format( ' ** ON ENTRY TO ', a, ' PARAMETER NUMBER ', i2, ' HAD ','AN ILLEGAL VALUE' )
+                
+     end subroutine stdlib_xerbla
+
+     !> XERBLA_ARRAY: assists other languages in calling XERBLA, the LAPACK
+     !> and BLAS error handler.  Rather than taking a Fortran string argument
+     !> as the function's name, XERBLA_ARRAY takes an array of single
+     !> characters along with the array's length.  XERBLA_ARRAY then copies
+     !> up to 32 characters of that array into a Fortran string and passes
+     !> that to XERBLA.  If called with a non-positive SRNAME_LEN,
+     !> XERBLA_ARRAY will call XERBLA with a string of all blank characters.
+     !> Say some macro or other device makes XERBLA_ARRAY available to C99
+     !> by a name lapack_xerbla and with a common Fortran calling convention.
+     !> Then a C99 program could invoke XERBLA via:
+     !> {
+     !> int flen = strlen(__func__);
+     !> lapack_xerbla(__func__,
+     !> }
+     !> Providing XERBLA_ARRAY is not necessary for intercepting LAPACK
+     !> errors.  XERBLA_ARRAY calls XERBLA.
+
+     pure subroutine stdlib_xerbla_array(srname_array, srname_len, info)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: srname_len, info
+           ! Array Arguments 
+           character(1), intent(in) :: srname_array(srname_len)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Local Arrays 
+           character*32 srname
+           ! Intrinsic Functions 
+           intrinsic :: min,len
+           ! Executable Statements 
+           srname = ''
+           do i = 1, min( srname_len, len( srname ) )
+              srname( i:i ) = srname_array( i )
+           end do
+           call stdlib_xerbla( srname, info )
+           return
+     end subroutine stdlib_xerbla_array
+
+#:if WITH_QP 
+
+     !> DCABS1: computes |Re(.)| + |Im(.)| of a double complex number
+
+     pure real(qp) function stdlib_qcabs1(z)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: z
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           stdlib_qcabs1 = abs(real(z,KIND=qp)) + abs(aimag(z))
+           return
+     end function stdlib_qcabs1
+#:endif 
+
+#:if WITH_QP 
+
+     !> IDAMAX: finds the index of the first element having maximum absolute value.
+
+     pure integer(ilp) function stdlib_iqamax(n,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dmax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           stdlib_iqamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_iqamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = abs(dx(1))
+              do i = 2,n
+                 if (abs(dx(i))>dmax) then
+                    stdlib_iqamax = i
+                    dmax = abs(dx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = abs(dx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(dx(ix))>dmax) then
+                    stdlib_iqamax = i
+                    dmax = abs(dx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_iqamax
+#:endif 
+
+#:if WITH_QP 
+
+     !> IZAMAX: finds the index of the first element having maximum |Re(.)| + |Im(.)|
+
+     pure integer(ilp) function stdlib_iwamax(n,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dmax
+           integer(ilp) :: i, ix
+           stdlib_iwamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_iwamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = stdlib_qcabs1(zx(1))
+              do i = 2,n
+                 if (stdlib_qcabs1(zx(i))>dmax) then
+                    stdlib_iwamax = i
+                    dmax = stdlib_qcabs1(zx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = stdlib_qcabs1(zx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (stdlib_qcabs1(zx(ix))>dmax) then
+                    stdlib_iwamax = i
+                    dmax = stdlib_qcabs1(zx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_iwamax
+#:endif 
+
+     !> ICAMAX: finds the index of the first element having maximum |Re(.)| + |Im(.)|
+
+     pure integer(ilp) function stdlib_icamax(n,cx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: smax
+           integer(ilp) :: i, ix
+           stdlib_icamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_icamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              smax = stdlib_scabs1(cx(1))
+              do i = 2,n
+                 if (stdlib_scabs1(cx(i))>smax) then
+                    stdlib_icamax = i
+                    smax = stdlib_scabs1(cx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              smax = stdlib_scabs1(cx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (stdlib_scabs1(cx(ix))>smax) then
+                    stdlib_icamax = i
+                    smax = stdlib_scabs1(cx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_icamax
+
+     !> IDAMAX: finds the index of the first element having maximum absolute value.
+
+     pure integer(ilp) function stdlib_idamax(n,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dmax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           stdlib_idamax = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_idamax = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = abs(dx(1))
+              do i = 2,n
+                 if (abs(dx(i))>dmax) then
+                    stdlib_idamax = i
+                    dmax = abs(dx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = abs(dx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(dx(ix))>dmax) then
+                    stdlib_idamax = i
+                    dmax = abs(dx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_idamax
+
+
+
+end module stdlib_linalg_blas_aux
diff --git a/src/stdlib_linalg_blas_c.fypp b/src/stdlib_linalg_blas_c.fypp
new file mode 100644
index 000000000..68890f8d4
--- /dev/null
+++ b/src/stdlib_linalg_blas_c.fypp
@@ -0,0 +1,5061 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas_c
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_caxpy
+     public :: stdlib_ccopy
+     public :: stdlib_cdotc
+     public :: stdlib_cdotu
+     public :: stdlib_cgbmv
+     public :: stdlib_cgemm
+     public :: stdlib_cgemv
+     public :: stdlib_cgerc
+     public :: stdlib_cgeru
+     public :: stdlib_chbmv
+     public :: stdlib_chemm
+     public :: stdlib_chemv
+     public :: stdlib_cher
+     public :: stdlib_cher2
+     public :: stdlib_cher2k
+     public :: stdlib_cherk
+     public :: stdlib_chpmv
+     public :: stdlib_chpr
+     public :: stdlib_chpr2
+     public :: stdlib_crotg
+     public :: stdlib_cscal
+     public :: stdlib_csrot
+     public :: stdlib_csscal
+     public :: stdlib_cswap
+     public :: stdlib_csymm
+     public :: stdlib_csyr2k
+     public :: stdlib_csyrk
+     public :: stdlib_ctbmv
+     public :: stdlib_ctbsv
+     public :: stdlib_ctpmv
+     public :: stdlib_ctpsv
+     public :: stdlib_ctrmm
+     public :: stdlib_ctrmv
+     public :: stdlib_ctrsm
+     public :: stdlib_ctrsv
+
+     ! 32-bit real constants 
+     real(sp),    parameter, private ::     negone = -1.00_sp
+     real(sp),    parameter, private ::       zero = 0.00_sp
+     real(sp),    parameter, private ::       half = 0.50_sp
+     real(sp),    parameter, private ::        one = 1.00_sp
+     real(sp),    parameter, private ::        two = 2.00_sp
+     real(sp),    parameter, private ::      three = 3.00_sp
+     real(sp),    parameter, private ::       four = 4.00_sp
+     real(sp),    parameter, private ::      eight = 8.00_sp
+     real(sp),    parameter, private ::        ten = 10.00_sp
+
+     ! 32-bit complex constants 
+     complex(sp), parameter, private :: czero   = ( 0.0_sp,0.0_sp)
+     complex(sp), parameter, private :: chalf   = ( 0.5_sp,0.0_sp)
+     complex(sp), parameter, private :: cone    = ( 1.0_sp,0.0_sp)
+     complex(sp), parameter, private :: cnegone = (-1.0_sp,0.0_sp)
+
+     ! 32-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(sp),    parameter, private :: rradix = real(radix(zero),sp) 
+     real(sp),    parameter, private :: ulp    = epsilon(zero) 
+     real(sp),    parameter, private :: eps    = ulp*half 
+     real(sp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(sp),    parameter, private :: safmax = one/safmin 
+     real(sp),    parameter, private :: smlnum = safmin/ulp 
+     real(sp),    parameter, private :: bignum = safmax*ulp 
+     real(sp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(sp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 32-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(sp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(sp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(sp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(sp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> CAXPY: constant times a vector plus a vector.
+
+     pure subroutine stdlib_caxpy(n,ca,cx,incx,cy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: ca
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+           complex(sp), intent(inout) :: cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (stdlib_scabs1(ca)==0.0e+0_sp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 cy(i) = cy(i) + ca*cx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 cy(iy) = cy(iy) + ca*cx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_caxpy
+
+     !> CCOPY: copies a vector x to a vector y.
+
+     pure subroutine stdlib_ccopy(n,cx,incx,cy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+           complex(sp), intent(out) :: cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 cy(i) = cx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 cy(iy) = cx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_ccopy
+
+     !> CDOTC: forms the dot product of two complex vectors
+     !> CDOTC = X^H * Y
+
+     pure complex(sp) function stdlib_cdotc(n,cx,incx,cy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*), cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           integer(ilp) :: i, ix, iy
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ctemp = (0.0_sp,0.0_sp)
+           stdlib_cdotc = (0.0_sp,0.0_sp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ctemp = ctemp + conjg(cx(i))*cy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ctemp = ctemp + conjg(cx(ix))*cy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_cdotc = ctemp
+           return
+     end function stdlib_cdotc
+
+     !> CDOTU: forms the dot product of two complex vectors
+     !> CDOTU = X^T * Y
+
+     pure complex(sp) function stdlib_cdotu(n,cx,incx,cy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*), cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           integer(ilp) :: i, ix, iy
+           ctemp = (0.0_sp,0.0_sp)
+           stdlib_cdotu = (0.0_sp,0.0_sp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ctemp = ctemp + cx(i)*cy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ctemp = ctemp + cx(ix)*cy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_cdotu = ctemp
+           return
+     end function stdlib_cdotu
+
+     !> CGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_cgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           complex(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(i)
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cgbmv
+
+     !> CGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_cgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: conja, conjb, nota, notb
+           
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! conjugated or transposed, set  conja and conjb  as true if  a  and
+           ! b  respectively are to be  transposed but  not conjugated  and set
+           ! nrowa and  nrowb  as the number of rows of  a  and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           conja = stdlib_lsame(transa,'C')
+           conjb = stdlib_lsame(transb,'C')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.conja) .and.(.not.stdlib_lsame(transa,'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.conjb) .and.(.not.stdlib_lsame(transb,'T'))) &
+                     then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==czero).or. (k==0)).and. (beta==cone))) &
+                     return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else if (conja) then
+                 ! form  c := alpha*a**h*b + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else if (nota) then
+               if (conjb) then
+                 ! form  c := alpha*a*b**h + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*conjg(b(j,l))
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               end if
+           else if (conja) then
+               if (conjb) then
+                 ! form  c := alpha*a**h*b**h + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**h*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (conjb) then
+                 ! form  c := alpha*a**t*b**h + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cgemm
+
+     !> CGEMV: performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_cgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           complex(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cgemv
+
+     !> CGERC:  performs the rank 1 operation
+     !> A := alpha*x*y**H + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_cgerc(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CGERC ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_cgerc
+
+     !> CGERU:  performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_cgeru(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CGERU ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_cgeru
+
+     !> CHBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_chbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           complex(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(kplus1,j),KIND=sp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(kplus1,j),KIND=sp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(1,j),KIND=sp)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(1,j),KIND=sp)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_chbmv
+
+     !> CHEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars, A is an hermitian matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_chemm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=sp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=sp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=sp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=sp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*real(a(j,j),KIND=sp)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*conjg(a(j,k))
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*conjg(a(j,k))
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_chemm
+
+     !> CHEMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix.
+
+     pure subroutine stdlib_chemv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           complex(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=sp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=sp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=sp)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=sp)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_chemv
+
+     !> CHER:   performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix.
+
+     pure subroutine stdlib_cher(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=sp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                           a(j,j) = real(a(j,j),KIND=sp) + real(x(j)*temp,KIND=sp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           a(j,j) = real(a(j,j),KIND=sp) + real(x(jx)*temp,KIND=sp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           a(j,j) = real(a(j,j),KIND=sp) + real(temp*x(j),KIND=sp)
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           a(j,j) = real(a(j,j),KIND=sp) + real(temp*x(jx),KIND=sp)
+                           ix = jx
+                           do i = j + 1,n
+                               ix = ix + incx
+                               a(i,j) = a(i,j) + x(ix)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cher
+
+     !> CHER2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n hermitian matrix.
+
+     pure subroutine stdlib_cher2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHER2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                           a(j,j) = real(a(j,j),KIND=sp) +real(x(j)*temp1+y(j)*temp2,KIND=sp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           a(j,j) = real(a(j,j),KIND=sp) +real(x(jx)*temp1+y(jy)*temp2,KIND=sp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           a(j,j) = real(a(j,j),KIND=sp) +real(x(j)*temp1+y(j)*temp2,KIND=sp)
+                                     
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           a(j,j) = real(a(j,j),KIND=sp) +real(x(jx)*temp1+y(jy)*temp2,KIND=sp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do i = j + 1,n
+                               ix = ix + incx
+                               iy = iy + incy
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cher2
+
+     !> CHER2K:  performs one of the hermitian rank 2k operations
+     !> C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+     !> where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+     !> hermitian matrix and  A and B  are  n by k matrices in the first case
+     !> and  k by n  matrices in the second case.
+
+     pure subroutine stdlib_cher2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           real(sp), intent(in) :: beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHER2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==real(czero,KIND=sp)) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                       end do
+                   end if
+               else
+                   if (beta==real(czero,KIND=sp)) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**h + conjg( alpha )*b*a**h +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==real(czero,KIND=sp)) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=sp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=sp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=sp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==real(czero,KIND=sp)) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=sp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=sp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=sp)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*b + conjg( alpha )*b**h*a +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=sp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=sp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=sp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=sp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=sp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=sp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=sp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=sp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=sp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=sp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cher2k
+
+     !> CHERK:  performs one of the hermitian rank k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+     !> matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_cherk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,conjg,max,real
+           ! Local Scalars 
+           complex(sp) :: temp
+           real(sp) :: rtemp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHERK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**h + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=sp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=sp)) then
+                               temp = alpha*conjg(a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                               c(j,j) = real(c(j,j),KIND=sp) + real(temp*a(i,l),KIND=sp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           c(j,j) = beta*real(c(j,j),KIND=sp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       else
+                           c(j,j) = real(c(j,j),KIND=sp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=sp)) then
+                               temp = alpha*conjg(a(j,l))
+                               c(j,j) = real(c(j,j),KIND=sp) + real(temp*a(j,l),KIND=sp)
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j - 1
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=sp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=sp)
+                       end if
+                       do i = j + 1,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_cherk
+
+     !> CHPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_chpmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*), x(*)
+           complex(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: conjg,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*real(ap(kk+j-1),KIND=sp) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(ap(kk+j-1),KIND=sp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(ap(kk),KIND=sp)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(ap(kk),KIND=sp)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_chpmv
+
+     !> CHPR:    performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_chpr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! Intrinsic Functions 
+           intrinsic :: conjg,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=sp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp) + real(x(j)*temp,KIND=sp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp) + real(x(jx)*temp,KIND=sp)
+                                     
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           ap(kk) = real(ap(kk),KIND=sp) + real(temp*x(j),KIND=sp)
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=sp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ap(kk) = real(ap(kk),KIND=sp) + real(temp*x(jx),KIND=sp)
+                           ix = jx
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               ap(k) = ap(k) + x(ix)*temp
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_chpr
+
+     !> CHPR2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_chpr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: conjg,real
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CHPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp) +real(x(j)*temp1+y(j)*temp2,&
+                                     KIND=sp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp) +real(x(jx)*temp1+y(jy)*temp2,&
+                                     KIND=sp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           ap(kk) = real(ap(kk),KIND=sp) +real(x(j)*temp1+y(j)*temp2,KIND=sp)
+                                     
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=sp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ap(kk) = real(ap(kk),KIND=sp) +real(x(jx)*temp1+y(jy)*temp2,KIND=sp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               iy = iy + incy
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=sp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_chpr2
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> |x| = sqrt( Re(x)**2 + Im(x)**2 )
+     !> sgn(x) = x / |x|  if x /= 0
+     !> = 1        if x  = 0
+     !> c = |a| / sqrt(|a|**2 + |b|**2)
+     !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
+     !> When a and b are real and r /= 0, the formulas simplify to
+     !> r = sgn(a)*sqrt(|a|**2 + |b|**2)
+     !> c = a / r
+     !> s = b / r
+     !> the same as in SROTG when |a| > |b|.  When |b| >= |a|, the
+     !> sign of c and s will be different from those computed by SROTG
+     !> if the signs of a and b are not the same.
+
+     pure subroutine stdlib_crotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._sp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(sp), intent(out) :: c
+        complex(sp), intent(inout) :: a
+        complex(sp), intent(in) :: b
+        complex(sp), intent(out) :: s
+        ! Local Scalars 
+        real(sp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(sp) :: f, fs, g, gs, r, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(sp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=sp)**2 + aimag( t )**2
+        ! Executable Statements 
+        f = a
+        g = b
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=sp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=sp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=sp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        a = r
+        return
+     end subroutine stdlib_crotg
+
+     !> CSCAL: scales a vector by a constant.
+
+     pure subroutine stdlib_cscal(n,ca,cx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: ca
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 cx(i) = ca*cx(i)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 cx(i) = ca*cx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_cscal
+
+     !> CSROT: applies a plane rotation, where the cos and sin (c and s) are real
+     !> and the vectors cx and cy are complex.
+     !> jack dongarra, linpack, 3/11/78.
+
+     pure subroutine stdlib_csrot( n, cx, incx, cy, incy, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(sp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*), cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(sp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 ) then
+              ! code for both increments equal to 1
+              do i = 1, n
+                 ctemp = c*cx( i ) + s*cy( i )
+                 cy( i ) = c*cy( i ) - s*cx( i )
+                 cx( i ) = ctemp
+              end do
+           else
+              ! code for unequal increments or equal increments not equal
+                ! to 1
+              ix = 1
+              iy = 1
+              if( incx<0 )ix = ( -n+1 )*incx + 1
+              if( incy<0 )iy = ( -n+1 )*incy + 1
+              do i = 1, n
+                 ctemp = c*cx( ix ) + s*cy( iy )
+                 cy( iy ) = c*cy( iy ) - s*cx( ix )
+                 cx( ix ) = ctemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_csrot
+
+     !> CSSCAL: scales a complex vector by a real constant.
+
+     pure subroutine stdlib_csscal(n,sa,cx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sa
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,real
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 cx(i) = cmplx(sa*real(cx(i),KIND=sp),sa*aimag(cx(i)),KIND=sp)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 cx(i) = cmplx(sa*real(cx(i),KIND=sp),sa*aimag(cx(i)),KIND=sp)
+              end do
+           end if
+           return
+     end subroutine stdlib_csscal
+
+     !> CSWAP: interchanges two vectors.
+
+     pure subroutine stdlib_cswap(n,cx,incx,cy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*), cy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 ctemp = cx(i)
+                 cx(i) = cy(i)
+                 cy(i) = ctemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ctemp = cx(ix)
+                 cx(ix) = cy(iy)
+                 cy(iy) = ctemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_cswap
+
+     !> CSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where  alpha and beta are scalars, A is a symmetric matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_csymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_csymm
+
+     !> CSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_csyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_csyr2k
+
+     !> CSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_csyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_csyrk
+
+     !> CTBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_ctbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctbmv
+
+     !> CTBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ctbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctbsv
+
+     !> CTPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_ctpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk - 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j - 1,1,-1
+                                   temp = temp + ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk + 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j + 1,n
+                                   temp = temp + ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j + 1,n
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctpmv
+
+     !> CTPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ctpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk + j - 2
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do k = kk,kk + j - 2
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctpsv
+
+     !> CTRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+     pure subroutine stdlib_ctrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b   or   b := alpha*a**h*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = 1,i - 1
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = 1,i - 1
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = i + 1,m
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = i + 1,m
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t   or   b := alpha*b*a**h.
+                   if (upper) then
+                       loop_280: do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_280
+                   else
+                       loop_320: do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_320
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctrmm
+
+     !> CTRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_ctrmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctrmv
+
+     !> CTRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_ctrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b
+                 ! or    b := alpha*inv( a**h )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = 1,i - 1
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = 1,i - 1
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = i + 1,m
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = i + 1,m
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t )
+                 ! or    b := alpha*b*inv( a**h ).
+                   if (upper) then
+                       loop_330: do k = n,1,-1
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_330
+                   else
+                       loop_380: do k = 1,n
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_380
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctrsm
+
+     !> CTRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ctrsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('CTRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ctrsv
+
+
+
+end module stdlib_linalg_blas_c
diff --git a/src/stdlib_linalg_blas_d.fypp b/src/stdlib_linalg_blas_d.fypp
new file mode 100644
index 000000000..125ad603c
--- /dev/null
+++ b/src/stdlib_linalg_blas_d.fypp
@@ -0,0 +1,4520 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas_d
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     use stdlib_linalg_blas_c
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_dasum
+     public :: stdlib_daxpy
+     public :: stdlib_dcopy
+     public :: stdlib_ddot
+     public :: stdlib_dgbmv
+     public :: stdlib_dgemm
+     public :: stdlib_dgemv
+     public :: stdlib_dger
+     public :: stdlib_dnrm2
+     public :: stdlib_drot
+     public :: stdlib_drotg
+     public :: stdlib_drotm
+     public :: stdlib_drotmg
+     public :: stdlib_dsbmv
+     public :: stdlib_dscal
+     public :: stdlib_dsdot
+     public :: stdlib_dspmv
+     public :: stdlib_dspr
+     public :: stdlib_dspr2
+     public :: stdlib_dswap
+     public :: stdlib_dsymm
+     public :: stdlib_dsymv
+     public :: stdlib_dsyr
+     public :: stdlib_dsyr2
+     public :: stdlib_dsyr2k
+     public :: stdlib_dsyrk
+     public :: stdlib_dtbmv
+     public :: stdlib_dtbsv
+     public :: stdlib_dtpmv
+     public :: stdlib_dtpsv
+     public :: stdlib_dtrmm
+     public :: stdlib_dtrmv
+     public :: stdlib_dtrsm
+     public :: stdlib_dtrsv
+     public :: stdlib_dzasum
+     public :: stdlib_dznrm2
+
+     ! 64-bit real constants 
+     real(dp),    parameter, private ::     negone = -1.00_dp
+     real(dp),    parameter, private ::       zero = 0.00_dp
+     real(dp),    parameter, private ::       half = 0.50_dp
+     real(dp),    parameter, private ::        one = 1.00_dp
+     real(dp),    parameter, private ::        two = 2.00_dp
+     real(dp),    parameter, private ::      three = 3.00_dp
+     real(dp),    parameter, private ::       four = 4.00_dp
+     real(dp),    parameter, private ::      eight = 8.00_dp
+     real(dp),    parameter, private ::        ten = 10.00_dp
+
+     ! 64-bit complex constants 
+     complex(dp), parameter, private :: czero   = ( 0.0_dp,0.0_dp)
+     complex(dp), parameter, private :: chalf   = ( 0.5_dp,0.0_dp)
+     complex(dp), parameter, private :: cone    = ( 1.0_dp,0.0_dp)
+     complex(dp), parameter, private :: cnegone = (-1.0_dp,0.0_dp)
+
+     ! 64-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(dp),    parameter, private :: rradix = real(radix(zero),dp) 
+     real(dp),    parameter, private :: ulp    = epsilon(zero) 
+     real(dp),    parameter, private :: eps    = ulp*half 
+     real(dp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(dp),    parameter, private :: safmax = one/safmin 
+     real(dp),    parameter, private :: smlnum = safmin/ulp 
+     real(dp),    parameter, private :: bignum = safmax*ulp 
+     real(dp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(dp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 64-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(dp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(dp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(dp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(dp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> DASUM: takes the sum of the absolute values.
+
+     pure real(dp) function stdlib_dasum(n,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dtemp
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: abs,mod
+           stdlib_dasum = zero
+           dtemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,6)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dtemp + abs(dx(i))
+                 end do
+                 if (n<6) then
+                    stdlib_dasum = dtemp
+                    return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,6
+                 dtemp = dtemp + abs(dx(i)) + abs(dx(i+1)) +abs(dx(i+2)) + abs(dx(i+3)) +abs(dx(i+&
+                           4)) + abs(dx(i+5))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 dtemp = dtemp + abs(dx(i))
+              end do
+           end if
+           stdlib_dasum = dtemp
+           return
+     end function stdlib_dasum
+
+     !> DAXPY: constant times a vector plus a vector.
+     !> uses unrolled loops for increments equal to one.
+
+     pure subroutine stdlib_daxpy(n,da,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dx(*)
+           real(dp), intent(inout) :: dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (da==0.0_dp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,4)
+              if (m/=0) then
+                 do i = 1,m
+                    dy(i) = dy(i) + da*dx(i)
+                 end do
+              end if
+              if (n<4) return
+              mp1 = m + 1
+              do i = mp1,n,4
+                 dy(i) = dy(i) + da*dx(i)
+                 dy(i+1) = dy(i+1) + da*dx(i+1)
+                 dy(i+2) = dy(i+2) + da*dx(i+2)
+                 dy(i+3) = dy(i+3) + da*dx(i+3)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+               dy(iy) = dy(iy) + da*dx(ix)
+               ix = ix + incx
+               iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_daxpy
+
+     !> DCOPY: copies a vector, x, to a vector, y.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_dcopy(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dx(*)
+           real(dp), intent(out) :: dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,7)
+              if (m/=0) then
+                 do i = 1,m
+                    dy(i) = dx(i)
+                 end do
+                 if (n<7) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,7
+                 dy(i) = dx(i)
+                 dy(i+1) = dx(i+1)
+                 dy(i+2) = dx(i+2)
+                 dy(i+3) = dx(i+3)
+                 dy(i+4) = dx(i+4)
+                 dy(i+5) = dx(i+5)
+                 dy(i+6) = dx(i+6)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dy(iy) = dx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_dcopy
+
+     !> DDOT: forms the dot product of two vectors.
+     !> uses unrolled loops for increments equal to one.
+
+     pure real(dp) function stdlib_ddot(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dtemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           stdlib_ddot = zero
+           dtemp = zero
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dtemp + dx(i)*dy(i)
+                 end do
+                 if (n<5) then
+                    stdlib_ddot=dtemp
+                 return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+               dtemp = dtemp + dx(i)*dy(i) + dx(i+1)*dy(i+1) +dx(i+2)*dy(i+2) + dx(i+3)*dy(i+3) + &
+                         dx(i+4)*dy(i+4)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = dtemp + dx(ix)*dy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_ddot = dtemp
+           return
+     end function stdlib_ddot
+
+     !> DGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_dgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dgbmv
+
+     !> DGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_dgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: nota, notb
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! transposed and set  nrowa and nrowb  as the number of rows of  a
+           ! and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.stdlib_lsame(transa,'C')) .and.(.not.stdlib_lsame(transa,&
+                     'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.stdlib_lsame(transb,'C')) .and.(.not.stdlib_lsame(&
+                     transb,'T'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==zero).or. (k==0)).and. (beta==one))) &
+                     return
+           ! and if  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (nota) then
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dgemm
+
+     !> DGEMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_dgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dgemv
+
+     !> DGER:   performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_dger(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==zero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_dger
+
+     !> !
+     !>
+     !> DNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> DNRM2 := sqrt( x'*x )
+
+     pure function stdlib_dnrm2( n, x, incx )
+        real(dp) :: stdlib_dnrm2
+        ! -- reference blas level1 routine (version 3.9.1_dp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._dp)
+        real(dp), parameter :: maxn = huge(0.0_dp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        real(dp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(dp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_dnrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_dnrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_dnrm2
+
+     !> DROT: applies a plane rotation.
+
+     pure subroutine stdlib_drot(n,dx,incx,dy,incy,c,s)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: c, s
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dtemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 dtemp = c*dx(i) + s*dy(i)
+                 dy(i) = c*dy(i) - s*dx(i)
+                 dx(i) = dtemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = c*dx(ix) + s*dy(iy)
+                 dy(iy) = c*dy(iy) - s*dx(ix)
+                 dx(ix) = dtemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_drot
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> sigma = sgn(a)    if |a| >  |b|
+     !> = sgn(b)    if |b| >= |a|
+     !> r = sigma*sqrt( a**2 + b**2 )
+     !> c = 1; s = 0      if r = 0
+     !> c = a/r; s = b/r  if r != 0
+     !> The subroutine also computes
+     !> z = s    if |a| > |b|,
+     !> = 1/c  if |b| >= |a| and c != 0
+     !> = 1    if c = 0
+     !> This allows c and s to be reconstructed from z as follows:
+     !> If z = 1, set c = 0, s = 1.
+     !> If |z| < 1, set c = sqrt(1 - z**2) and s = z.
+     !> If |z| > 1, set c = 1/z and s = sqrt( 1 - c**2).
+
+     pure subroutine stdlib_drotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._dp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(dp), intent(inout) :: a, b
+        real(dp), intent(out) :: c, s
+        ! Local Scalars 
+        real(dp) :: anorm, bnorm, scl, sigma, r, z
+        anorm = abs(a)
+        bnorm = abs(b)
+        if( bnorm == zero ) then
+           c = one
+           s = zero
+           b = zero
+        else if( anorm == zero ) then
+           c = zero
+           s = one
+           a = b
+           b = one
+        else
+           scl = min( safmax, max( safmin, anorm, bnorm ) )
+           if( anorm > bnorm ) then
+              sigma = sign(one,a)
+           else
+              sigma = sign(one,b)
+           end if
+           r = sigma*( scl*sqrt((a/scl)**2 + (b/scl)**2) )
+           c = a/r
+           s = b/r
+           if( anorm > bnorm ) then
+              z = s
+           else if( c /= zero ) then
+              z = one/c
+           else
+              z = one
+           end if
+           a = r
+           b = z
+        end if
+        return
+     end subroutine stdlib_drotg
+
+     !> APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+     !> (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
+     !> (DY**T)
+     !> DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX >= 0, ELSE
+     !> LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
+     !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> DFLAG=-1._dp     DFLAG=0._dp        DFLAG=1._dp     DFLAG=-2.D0
+     !> (DH11  DH12)    (1._dp  DH12)    (DH11  1._dp)    (1._dp  0._dp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (DH21  DH22),   (DH21  1._dp),   (-1._dp DH22),   (0._dp  1._dp).
+     !> SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
+
+     pure subroutine stdlib_drotm(n,dx,incx,dy,incy,dparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: dparam(5)
+           real(dp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dflag, dh11, dh12, dh21, dh22, two, w, z, zero
+           integer(ilp) :: i, kx, ky, nsteps
+           ! Data Statements 
+           zero = 0.0_dp
+           two = 2.0_dp
+           dflag = dparam(1)
+           if (n<=0 .or. (dflag+two==zero)) return
+           if (incx==incy.and.incx>0) then
+              nsteps = n*incx
+              if (dflag<zero) then
+                 dh11 = dparam(2)
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 dh22 = dparam(5)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w*dh11 + z*dh12
+                    dy(i) = w*dh21 + z*dh22
+                 end do
+              else if (dflag==zero) then
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w + z*dh12
+                    dy(i) = w*dh21 + z
+                 end do
+              else
+                 dh11 = dparam(2)
+                 dh22 = dparam(5)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w*dh11 + z
+                    dy(i) = -w + dh22*z
+                 end do
+              end if
+           else
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              if (dflag<zero) then
+                 dh11 = dparam(2)
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 dh22 = dparam(5)
+                 do i = 1,n
+                    w = dx(kx)
+                    z = dy(ky)
+                    dx(kx) = w*dh11 + z*dh12
+                    dy(ky) = w*dh21 + z*dh22
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else if (dflag==zero) then
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 do i = 1,n
+                    w = dx(kx)
+                    z = dy(ky)
+                    dx(kx) = w + z*dh12
+                    dy(ky) = w*dh21 + z
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else
+                  dh11 = dparam(2)
+                  dh22 = dparam(5)
+                  do i = 1,n
+                     w = dx(kx)
+                     z = dy(ky)
+                     dx(kx) = w*dh11 + z
+                     dy(ky) = -w + dh22*z
+                     kx = kx + incx
+                     ky = ky + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_drotm
+
+     !> CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+     !> THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(DD1)*DX1,SQRT(DD2)    DY2)**T.
+     !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> DFLAG=-1._dp     DFLAG=0._dp        DFLAG=1._dp     DFLAG=-2.D0
+     !> (DH11  DH12)    (1._dp  DH12)    (DH11  1._dp)    (1._dp  0._dp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (DH21  DH22),   (DH21  1._dp),   (-1._dp DH22),   (0._dp  1._dp).
+     !> LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
+     !> RESPECTIVELY. (VALUES OF 1._dp, -1._dp, OR 0._dp IMPLIED BY THE
+     !> VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
+     !> THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+     !> INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+     !> OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+
+     pure subroutine stdlib_drotmg(dd1,dd2,dx1,dy1,dparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(inout) :: dd1, dd2, dx1
+           real(dp), intent(in) :: dy1
+           ! Array Arguments 
+           real(dp), intent(out) :: dparam(5)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dflag, dh11, dh12, dh21, dh22, dp1, dp2, dq1, dq2, dtemp, du, gam, gamsq, &
+                     one, rgamsq, two, zero
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Data Statements 
+           zero = 0.0_dp
+           one = 1.0_dp
+           two = 2.0_dp
+           gam = 4096.0_dp
+           gamsq = 16777216.0_dp
+           rgamsq = 5.9604645e-8_dp
+           if (dd1<zero) then
+              ! go zero-h-d-and-dx1..
+              dflag = -one
+              dh11 = zero
+              dh12 = zero
+              dh21 = zero
+              dh22 = zero
+              dd1 = zero
+              dd2 = zero
+              dx1 = zero
+           else
+              ! case-dd1-nonnegative
+              dp2 = dd2*dy1
+              if (dp2==zero) then
+                 dflag = -two
+                 dparam(1) = dflag
+                 return
+              end if
+              ! regular-case..
+              dp1 = dd1*dx1
+              dq2 = dp2*dy1
+              dq1 = dp1*dx1
+              if (abs(dq1)>abs(dq2)) then
+                 dh21 = -dy1/dx1
+                 dh12 = dp2/dp1
+                 du = one - dh12*dh21
+                if (du>zero) then
+                  dflag = zero
+                  dd1 = dd1/du
+                  dd2 = dd2/du
+                  dx1 = dx1*du
+                else
+                  ! this code path if here for safety. we do not expect this
+                  ! condition to ever hold except in edge cases with rounding
+                  ! errors. see doi: 10.1145/355841.355847
+                  dflag = -one
+                  dh11 = zero
+                  dh12 = zero
+                  dh21 = zero
+                  dh22 = zero
+                  dd1 = zero
+                  dd2 = zero
+                  dx1 = zero
+                end if
+              else
+                 if (dq2<zero) then
+                    ! go zero-h-d-and-dx1..
+                    dflag = -one
+                    dh11 = zero
+                    dh12 = zero
+                    dh21 = zero
+                    dh22 = zero
+                    dd1 = zero
+                    dd2 = zero
+                    dx1 = zero
+                 else
+                    dflag = one
+                    dh11 = dp1/dp2
+                    dh22 = dx1/dy1
+                    du = one + dh11*dh22
+                    dtemp = dd2/du
+                    dd2 = dd1/du
+                    dd1 = dtemp
+                    dx1 = dy1*du
+                 end if
+              end if
+           ! procedure..scale-check
+              if (dd1/=zero) then
+                 do while ((dd1<=rgamsq) .or. (dd1>=gamsq))
+                    if (dflag==zero) then
+                       dh11 = one
+                       dh22 = one
+                       dflag = -one
+                    else
+                       dh21 = -one
+                       dh12 = one
+                       dflag = -one
+                    end if
+                    if (dd1<=rgamsq) then
+                       dd1 = dd1*gam**2
+                       dx1 = dx1/gam
+                       dh11 = dh11/gam
+                       dh12 = dh12/gam
+                    else
+                       dd1 = dd1/gam**2
+                       dx1 = dx1*gam
+                       dh11 = dh11*gam
+                       dh12 = dh12*gam
+                    end if
+                 enddo
+              end if
+              if (dd2/=zero) then
+                 do while ( (abs(dd2)<=rgamsq) .or. (abs(dd2)>=gamsq) )
+                    if (dflag==zero) then
+                       dh11 = one
+                       dh22 = one
+                       dflag = -one
+                    else
+                       dh21 = -one
+                       dh12 = one
+                       dflag = -one
+                    end if
+                    if (abs(dd2)<=rgamsq) then
+                       dd2 = dd2*gam**2
+                       dh21 = dh21/gam
+                       dh22 = dh22/gam
+                    else
+                       dd2 = dd2/gam**2
+                       dh21 = dh21*gam
+                       dh22 = dh22*gam
+                    end if
+                 end do
+              end if
+           end if
+           if (dflag<zero) then
+              dparam(2) = dh11
+              dparam(3) = dh21
+              dparam(4) = dh12
+              dparam(5) = dh22
+           else if (dflag==zero) then
+              dparam(3) = dh21
+              dparam(4) = dh12
+           else
+              dparam(2) = dh11
+              dparam(5) = dh22
+           end if
+           dparam(1) = dflag
+           return
+     end subroutine stdlib_drotmg
+
+     !> DSBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_dsbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(kplus1,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(kplus1,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(1,j)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(1,j)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsbmv
+
+     !> DSCAL: scales a vector by a constant.
+     !> uses unrolled loops for increment equal to 1.
+
+     pure subroutine stdlib_dscal(n,da,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    dx(i) = da*dx(i)
+                 end do
+                 if (n<5) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+                 dx(i) = da*dx(i)
+                 dx(i+1) = da*dx(i+1)
+                 dx(i+2) = da*dx(i+2)
+                 dx(i+3) = da*dx(i+3)
+                 dx(i+4) = da*dx(i+4)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 dx(i) = da*dx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_dscal
+
+     !> Compute the inner product of two vectors with extended
+     !> precision accumulation and result.
+     !> Returns D.P. dot product accumulated in D.P., for S.P. SX and SY
+     !> DSDOT: = sum for I = 0 to N-1 of  SX(LX+I*INCX) * SY(LY+I*INCY),
+     !> where LX = 1 if INCX >= 0, else LX = 1+(1-N)*INCX, and LY is
+     !> defined in a similar way using INCY.
+
+     pure real(dp) function stdlib_dsdot(n,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*), sy(*)
+        ! authors:
+        ! ========
+        ! lawson, c. l., (jpl), hanson, r. j., (snla),
+        ! kincaid, d. r., (u. of texas), krogh, f. t., (jpl)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, kx, ky, ns
+           ! Intrinsic Functions 
+           intrinsic :: real
+           stdlib_dsdot = zero
+           if (n<=0) return
+           if (incx==incy .and. incx>0) then
+           ! code for equal, positive, non-unit increments.
+              ns = n*incx
+              do i = 1,ns,incx
+                 stdlib_dsdot = stdlib_dsdot + real(sx(i),KIND=dp)*real(sy(i),KIND=dp)
+              end do
+           else
+           ! code for unequal or nonpositive increments.
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              do i = 1,n
+                 stdlib_dsdot = stdlib_dsdot + real(sx(kx),KIND=dp)*real(sy(ky),KIND=dp)
+                 kx = kx + incx
+                 ky = ky + incy
+              end do
+           end if
+           return
+     end function stdlib_dsdot
+
+     !> DSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_dspmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*ap(kk+j-1) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*ap(kk+j-1) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*ap(kk)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*ap(kk)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dspmv
+
+     !> DSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_dspr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dspr
+
+     !> DSPR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_dspr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dspr2
+
+     !> DSWAP: interchanges two vectors.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_dswap(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dtemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+             ! clean-up loop
+              m = mod(n,3)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dx(i)
+                    dx(i) = dy(i)
+                    dy(i) = dtemp
+                 end do
+                 if (n<3) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,3
+                 dtemp = dx(i)
+                 dx(i) = dy(i)
+                 dy(i) = dtemp
+                 dtemp = dx(i+1)
+                 dx(i+1) = dy(i+1)
+                 dy(i+1) = dtemp
+                 dtemp = dx(i+2)
+                 dx(i+2) = dy(i+2)
+                 dy(i+2) = dtemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = dx(ix)
+                 dx(ix) = dy(iy)
+                 dy(iy) = dtemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_dswap
+
+     !> DSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars,  A is a symmetric matrix and  B and
+     !> C are  m by n matrices.
+
+     pure subroutine stdlib_dsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==zero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_dsymm
+
+     !> DSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_dsymv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(j,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(j,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(j,j)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(j,j)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsymv
+
+     !> DSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_dsyr(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsyr
+
+     !> DSYR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n symmetric matrix.
+
+     pure subroutine stdlib_dsyr2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsyr2
+
+     !> DSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_dsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsyr2k
+
+     !> DSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_dsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_dsyrk
+
+     !> DTBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_dtbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtbmv
+
+     !> DTBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_dtbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t)*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtbsv
+
+     !> DTPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_dtpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk - 1
+                           do i = j - 1,1,-1
+                               temp = temp + ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk - 1,kk - j + 1,-1
+                               ix = ix - incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk + 1
+                           do i = j + 1,n
+                               temp = temp + ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtpmv
+
+     !> DTPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_dtpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           do i = 1,j - 1
+                               temp = temp - ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           do i = n,j + 1,-1
+                               temp = temp - ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk - (n- (j+1)),-1
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtpsv
+
+     !> DTRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+
+     pure subroutine stdlib_dtrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = 1,i - 1
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = i + 1,m
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t.
+                   if (upper) then
+                       do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtrmm
+
+     !> DTRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_dtrmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               ix = ix - incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               ix = ix + incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtrmv
+
+     !> DTRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_dtrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               do k = 1,i - 1
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               do k = i + 1,m
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t ).
+                   if (upper) then
+                       do k = n,1,-1
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = 1,n
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtrsm
+
+     !> DTRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_dtrsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_dtrsv
+
+     !> DZASUM: takes the sum of the (|Re(.)| + |Im(.)|)'s of a complex vector and
+     !> returns a double precision result.
+
+     pure real(dp) function stdlib_dzasum(n,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: stemp
+           integer(ilp) :: i, nincx
+           stdlib_dzasum = zero
+           stemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 stemp = stemp + stdlib_dcabs1(zx(i))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 stemp = stemp + stdlib_dcabs1(zx(i))
+              end do
+           end if
+           stdlib_dzasum = stemp
+           return
+     end function stdlib_dzasum
+
+     !> !
+     !>
+     !> DZNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> DZNRM2 := sqrt( x**H*x )
+
+     pure function stdlib_dznrm2( n, x, incx )
+        real(dp) :: stdlib_dznrm2
+        ! -- reference blas level1 routine (version 3.9.1_dp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._dp)
+        real(dp), parameter :: maxn = huge(0.0_dp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        complex(dp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(dp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_dznrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=dp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_dznrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_dznrm2
+
+
+
+end module stdlib_linalg_blas_d
diff --git a/src/stdlib_linalg_blas_q.fypp b/src/stdlib_linalg_blas_q.fypp
new file mode 100644
index 000000000..b7209fd35
--- /dev/null
+++ b/src/stdlib_linalg_blas_q.fypp
@@ -0,0 +1,4524 @@
+#:include "common.fypp" 
+#:if WITH_QP 
+module stdlib_linalg_blas_q
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     use stdlib_linalg_blas_c
+     use stdlib_linalg_blas_d
+     use stdlib_linalg_blas_z
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_qasum
+     public :: stdlib_qaxpy
+     public :: stdlib_qcopy
+     public :: stdlib_qdot
+     public :: stdlib_qgbmv
+     public :: stdlib_qgemm
+     public :: stdlib_qgemv
+     public :: stdlib_qger
+     public :: stdlib_qnrm2
+     public :: stdlib_qrot
+     public :: stdlib_qrotg
+     public :: stdlib_qrotm
+     public :: stdlib_qrotmg
+     public :: stdlib_qsbmv
+     public :: stdlib_qscal
+     public :: stdlib_qsdot
+     public :: stdlib_qspmv
+     public :: stdlib_qspr
+     public :: stdlib_qspr2
+     public :: stdlib_qswap
+     public :: stdlib_qsymm
+     public :: stdlib_qsymv
+     public :: stdlib_qsyr
+     public :: stdlib_qsyr2
+     public :: stdlib_qsyr2k
+     public :: stdlib_qsyrk
+     public :: stdlib_qtbmv
+     public :: stdlib_qtbsv
+     public :: stdlib_qtpmv
+     public :: stdlib_qtpsv
+     public :: stdlib_qtrmm
+     public :: stdlib_qtrmv
+     public :: stdlib_qtrsm
+     public :: stdlib_qtrsv
+     public :: stdlib_qzasum
+     public :: stdlib_qznrm2
+
+     ! 128-bit real constants 
+     real(qp),    parameter, private ::     negone = -1.00_qp
+     real(qp),    parameter, private ::       zero = 0.00_qp
+     real(qp),    parameter, private ::       half = 0.50_qp
+     real(qp),    parameter, private ::        one = 1.00_qp
+     real(qp),    parameter, private ::        two = 2.00_qp
+     real(qp),    parameter, private ::      three = 3.00_qp
+     real(qp),    parameter, private ::       four = 4.00_qp
+     real(qp),    parameter, private ::      eight = 8.00_qp
+     real(qp),    parameter, private ::        ten = 10.00_qp
+
+     ! 128-bit complex constants 
+     complex(qp), parameter, private :: czero   = ( 0.0_qp,0.0_qp)
+     complex(qp), parameter, private :: chalf   = ( 0.5_qp,0.0_qp)
+     complex(qp), parameter, private :: cone    = ( 1.0_qp,0.0_qp)
+     complex(qp), parameter, private :: cnegone = (-1.0_qp,0.0_qp)
+
+     ! 128-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(qp),    parameter, private :: rradix = real(radix(zero),qp) 
+     real(qp),    parameter, private :: ulp    = epsilon(zero) 
+     real(qp),    parameter, private :: eps    = ulp*half 
+     real(qp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(qp),    parameter, private :: safmax = one/safmin 
+     real(qp),    parameter, private :: smlnum = safmin/ulp 
+     real(qp),    parameter, private :: bignum = safmax*ulp 
+     real(qp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(qp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 128-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(qp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(qp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(qp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(qp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> DASUM: takes the sum of the absolute values.
+
+     pure real(qp) function stdlib_qasum(n,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dtemp
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: abs,mod
+           stdlib_qasum = zero
+           dtemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,6)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dtemp + abs(dx(i))
+                 end do
+                 if (n<6) then
+                    stdlib_qasum = dtemp
+                    return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,6
+                 dtemp = dtemp + abs(dx(i)) + abs(dx(i+1)) +abs(dx(i+2)) + abs(dx(i+3)) +abs(dx(i+&
+                           4)) + abs(dx(i+5))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 dtemp = dtemp + abs(dx(i))
+              end do
+           end if
+           stdlib_qasum = dtemp
+           return
+     end function stdlib_qasum
+
+     !> DAXPY: constant times a vector plus a vector.
+     !> uses unrolled loops for increments equal to one.
+
+     pure subroutine stdlib_qaxpy(n,da,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dx(*)
+           real(qp), intent(inout) :: dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (da==0.0_qp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,4)
+              if (m/=0) then
+                 do i = 1,m
+                    dy(i) = dy(i) + da*dx(i)
+                 end do
+              end if
+              if (n<4) return
+              mp1 = m + 1
+              do i = mp1,n,4
+                 dy(i) = dy(i) + da*dx(i)
+                 dy(i+1) = dy(i+1) + da*dx(i+1)
+                 dy(i+2) = dy(i+2) + da*dx(i+2)
+                 dy(i+3) = dy(i+3) + da*dx(i+3)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+               dy(iy) = dy(iy) + da*dx(ix)
+               ix = ix + incx
+               iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_qaxpy
+
+     !> DCOPY: copies a vector, x, to a vector, y.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_qcopy(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dx(*)
+           real(qp), intent(out) :: dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,7)
+              if (m/=0) then
+                 do i = 1,m
+                    dy(i) = dx(i)
+                 end do
+                 if (n<7) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,7
+                 dy(i) = dx(i)
+                 dy(i+1) = dx(i+1)
+                 dy(i+2) = dx(i+2)
+                 dy(i+3) = dx(i+3)
+                 dy(i+4) = dx(i+4)
+                 dy(i+5) = dx(i+5)
+                 dy(i+6) = dx(i+6)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dy(iy) = dx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_qcopy
+
+     !> DDOT: forms the dot product of two vectors.
+     !> uses unrolled loops for increments equal to one.
+
+     pure real(qp) function stdlib_qdot(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dtemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           stdlib_qdot = zero
+           dtemp = zero
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dtemp + dx(i)*dy(i)
+                 end do
+                 if (n<5) then
+                    stdlib_qdot=dtemp
+                 return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+               dtemp = dtemp + dx(i)*dy(i) + dx(i+1)*dy(i+1) +dx(i+2)*dy(i+2) + dx(i+3)*dy(i+3) + &
+                         dx(i+4)*dy(i+4)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = dtemp + dx(ix)*dy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_qdot = dtemp
+           return
+     end function stdlib_qdot
+
+     !> DGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_qgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qgbmv
+
+     !> DGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_qgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: nota, notb
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! transposed and set  nrowa and nrowb  as the number of rows of  a
+           ! and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.stdlib_lsame(transa,'C')) .and.(.not.stdlib_lsame(transa,&
+                     'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.stdlib_lsame(transb,'C')) .and.(.not.stdlib_lsame(&
+                     transb,'T'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==zero).or. (k==0)).and. (beta==one))) &
+                     return
+           ! and if  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (nota) then
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qgemm
+
+     !> DGEMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_qgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qgemv
+
+     !> DGER:   performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_qger(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DGER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==zero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_qger
+
+     !> !
+     !>
+     !> DNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> DNRM2 := sqrt( x'*x )
+
+     pure function stdlib_qnrm2( n, x, incx )
+        real(qp) :: stdlib_qnrm2
+        ! -- reference blas level1 routine (version 3.9.1_qp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._qp)
+        real(qp), parameter :: maxn = huge(0.0_qp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        real(qp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(qp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_qnrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_qnrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_qnrm2
+
+     !> DROT: applies a plane rotation.
+
+     pure subroutine stdlib_qrot(n,dx,incx,dy,incy,c,s)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: c, s
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dtemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 dtemp = c*dx(i) + s*dy(i)
+                 dy(i) = c*dy(i) - s*dx(i)
+                 dx(i) = dtemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = c*dx(ix) + s*dy(iy)
+                 dy(iy) = c*dy(iy) - s*dx(ix)
+                 dx(ix) = dtemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_qrot
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> sigma = sgn(a)    if |a| >  |b|
+     !> = sgn(b)    if |b| >= |a|
+     !> r = sigma*sqrt( a**2 + b**2 )
+     !> c = 1; s = 0      if r = 0
+     !> c = a/r; s = b/r  if r != 0
+     !> The subroutine also computes
+     !> z = s    if |a| > |b|,
+     !> = 1/c  if |b| >= |a| and c != 0
+     !> = 1    if c = 0
+     !> This allows c and s to be reconstructed from z as follows:
+     !> If z = 1, set c = 0, s = 1.
+     !> If |z| < 1, set c = sqrt(1 - z**2) and s = z.
+     !> If |z| > 1, set c = 1/z and s = sqrt( 1 - c**2).
+
+     pure subroutine stdlib_qrotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._qp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(qp), intent(inout) :: a, b
+        real(qp), intent(out) :: c, s
+        ! Local Scalars 
+        real(qp) :: anorm, bnorm, scl, sigma, r, z
+        anorm = abs(a)
+        bnorm = abs(b)
+        if( bnorm == zero ) then
+           c = one
+           s = zero
+           b = zero
+        else if( anorm == zero ) then
+           c = zero
+           s = one
+           a = b
+           b = one
+        else
+           scl = min( safmax, max( safmin, anorm, bnorm ) )
+           if( anorm > bnorm ) then
+              sigma = sign(one,a)
+           else
+              sigma = sign(one,b)
+           end if
+           r = sigma*( scl*sqrt((a/scl)**2 + (b/scl)**2) )
+           c = a/r
+           s = b/r
+           if( anorm > bnorm ) then
+              z = s
+           else if( c /= zero ) then
+              z = one/c
+           else
+              z = one
+           end if
+           a = r
+           b = z
+        end if
+        return
+     end subroutine stdlib_qrotg
+
+     !> APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+     !> (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
+     !> (DY**T)
+     !> DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX >= 0, ELSE
+     !> LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
+     !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> DFLAG=-1._qp     DFLAG=0._qp        DFLAG=1._qp     DFLAG=-2.D0
+     !> (DH11  DH12)    (1._qp  DH12)    (DH11  1._qp)    (1._qp  0._qp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (DH21  DH22),   (DH21  1._qp),   (-1._qp DH22),   (0._qp  1._qp).
+     !> SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
+
+     pure subroutine stdlib_qrotm(n,dx,incx,dy,incy,dparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: dparam(5)
+           real(qp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dflag, dh11, dh12, dh21, dh22, two, w, z, zero
+           integer(ilp) :: i, kx, ky, nsteps
+           ! Data Statements 
+           zero = 0.0_qp
+           two = 2.0_qp
+           dflag = dparam(1)
+           if (n<=0 .or. (dflag+two==zero)) return
+           if (incx==incy.and.incx>0) then
+              nsteps = n*incx
+              if (dflag<zero) then
+                 dh11 = dparam(2)
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 dh22 = dparam(5)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w*dh11 + z*dh12
+                    dy(i) = w*dh21 + z*dh22
+                 end do
+              else if (dflag==zero) then
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w + z*dh12
+                    dy(i) = w*dh21 + z
+                 end do
+              else
+                 dh11 = dparam(2)
+                 dh22 = dparam(5)
+                 do i = 1,nsteps,incx
+                    w = dx(i)
+                    z = dy(i)
+                    dx(i) = w*dh11 + z
+                    dy(i) = -w + dh22*z
+                 end do
+              end if
+           else
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              if (dflag<zero) then
+                 dh11 = dparam(2)
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 dh22 = dparam(5)
+                 do i = 1,n
+                    w = dx(kx)
+                    z = dy(ky)
+                    dx(kx) = w*dh11 + z*dh12
+                    dy(ky) = w*dh21 + z*dh22
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else if (dflag==zero) then
+                 dh12 = dparam(4)
+                 dh21 = dparam(3)
+                 do i = 1,n
+                    w = dx(kx)
+                    z = dy(ky)
+                    dx(kx) = w + z*dh12
+                    dy(ky) = w*dh21 + z
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else
+                  dh11 = dparam(2)
+                  dh22 = dparam(5)
+                  do i = 1,n
+                     w = dx(kx)
+                     z = dy(ky)
+                     dx(kx) = w*dh11 + z
+                     dy(ky) = -w + dh22*z
+                     kx = kx + incx
+                     ky = ky + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qrotm
+
+     !> CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+     !> THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(DD1)*DX1,SQRT(DD2)    DY2)**T.
+     !> WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> DFLAG=-1._qp     DFLAG=0._qp        DFLAG=1._qp     DFLAG=-2.D0
+     !> (DH11  DH12)    (1._qp  DH12)    (DH11  1._qp)    (1._qp  0._qp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (DH21  DH22),   (DH21  1._qp),   (-1._qp DH22),   (0._qp  1._qp).
+     !> LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
+     !> RESPECTIVELY. (VALUES OF 1._qp, -1._qp, OR 0._qp IMPLIED BY THE
+     !> VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
+     !> THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+     !> INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+     !> OF DD1 AND DD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+
+     pure subroutine stdlib_qrotmg(dd1,dd2,dx1,dy1,dparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(inout) :: dd1, dd2, dx1
+           real(qp), intent(in) :: dy1
+           ! Array Arguments 
+           real(qp), intent(out) :: dparam(5)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dflag, dh11, dh12, dh21, dh22, dp1, dp2, dq1, dq2, dtemp, du, gam, gamsq, &
+                     one, rgamsq, two, zero
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Data Statements 
+           zero = 0.0_qp
+           one = 1.0_qp
+           two = 2.0_qp
+           gam = 4096.0_qp
+           gamsq = 16777216.0_qp
+           rgamsq = 5.9604645e-8_qp
+           if (dd1<zero) then
+              ! go zero-h-d-and-dx1..
+              dflag = -one
+              dh11 = zero
+              dh12 = zero
+              dh21 = zero
+              dh22 = zero
+              dd1 = zero
+              dd2 = zero
+              dx1 = zero
+           else
+              ! case-dd1-nonnegative
+              dp2 = dd2*dy1
+              if (dp2==zero) then
+                 dflag = -two
+                 dparam(1) = dflag
+                 return
+              end if
+              ! regular-case..
+              dp1 = dd1*dx1
+              dq2 = dp2*dy1
+              dq1 = dp1*dx1
+              if (abs(dq1)>abs(dq2)) then
+                 dh21 = -dy1/dx1
+                 dh12 = dp2/dp1
+                 du = one - dh12*dh21
+                if (du>zero) then
+                  dflag = zero
+                  dd1 = dd1/du
+                  dd2 = dd2/du
+                  dx1 = dx1*du
+                else
+                  ! this code path if here for safety. we do not expect this
+                  ! condition to ever hold except in edge cases with rounding
+                  ! errors. see doi: 10.1145/355841.355847
+                  dflag = -one
+                  dh11 = zero
+                  dh12 = zero
+                  dh21 = zero
+                  dh22 = zero
+                  dd1 = zero
+                  dd2 = zero
+                  dx1 = zero
+                end if
+              else
+                 if (dq2<zero) then
+                    ! go zero-h-d-and-dx1..
+                    dflag = -one
+                    dh11 = zero
+                    dh12 = zero
+                    dh21 = zero
+                    dh22 = zero
+                    dd1 = zero
+                    dd2 = zero
+                    dx1 = zero
+                 else
+                    dflag = one
+                    dh11 = dp1/dp2
+                    dh22 = dx1/dy1
+                    du = one + dh11*dh22
+                    dtemp = dd2/du
+                    dd2 = dd1/du
+                    dd1 = dtemp
+                    dx1 = dy1*du
+                 end if
+              end if
+           ! procedure..scale-check
+              if (dd1/=zero) then
+                 do while ((dd1<=rgamsq) .or. (dd1>=gamsq))
+                    if (dflag==zero) then
+                       dh11 = one
+                       dh22 = one
+                       dflag = -one
+                    else
+                       dh21 = -one
+                       dh12 = one
+                       dflag = -one
+                    end if
+                    if (dd1<=rgamsq) then
+                       dd1 = dd1*gam**2
+                       dx1 = dx1/gam
+                       dh11 = dh11/gam
+                       dh12 = dh12/gam
+                    else
+                       dd1 = dd1/gam**2
+                       dx1 = dx1*gam
+                       dh11 = dh11*gam
+                       dh12 = dh12*gam
+                    end if
+                 enddo
+              end if
+              if (dd2/=zero) then
+                 do while ( (abs(dd2)<=rgamsq) .or. (abs(dd2)>=gamsq) )
+                    if (dflag==zero) then
+                       dh11 = one
+                       dh22 = one
+                       dflag = -one
+                    else
+                       dh21 = -one
+                       dh12 = one
+                       dflag = -one
+                    end if
+                    if (abs(dd2)<=rgamsq) then
+                       dd2 = dd2*gam**2
+                       dh21 = dh21/gam
+                       dh22 = dh22/gam
+                    else
+                       dd2 = dd2/gam**2
+                       dh21 = dh21*gam
+                       dh22 = dh22*gam
+                    end if
+                 end do
+              end if
+           end if
+           if (dflag<zero) then
+              dparam(2) = dh11
+              dparam(3) = dh21
+              dparam(4) = dh12
+              dparam(5) = dh22
+           else if (dflag==zero) then
+              dparam(3) = dh21
+              dparam(4) = dh12
+           else
+              dparam(2) = dh11
+              dparam(5) = dh22
+           end if
+           dparam(1) = dflag
+           return
+     end subroutine stdlib_qrotmg
+
+     !> DSBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_qsbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(kplus1,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(kplus1,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(1,j)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(1,j)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsbmv
+
+     !> DSCAL: scales a vector by a constant.
+     !> uses unrolled loops for increment equal to 1.
+
+     pure subroutine stdlib_qscal(n,da,dx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: dx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    dx(i) = da*dx(i)
+                 end do
+                 if (n<5) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+                 dx(i) = da*dx(i)
+                 dx(i+1) = da*dx(i+1)
+                 dx(i+2) = da*dx(i+2)
+                 dx(i+3) = da*dx(i+3)
+                 dx(i+4) = da*dx(i+4)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 dx(i) = da*dx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_qscal
+
+     !> Compute the inner product of two vectors with extended
+     !> precision accumulation and result.
+     !> Returns D.P. dot product accumulated in D.P., for S.P. SX and SY
+     !> DSDOT: = sum for I = 0 to N-1 of  SX(LX+I*INCX) * SY(LY+I*INCY),
+     !> where LX = 1 if INCX >= 0, else LX = 1+(1-N)*INCX, and LY is
+     !> defined in a similar way using INCY.
+
+     pure real(qp) function stdlib_qsdot(n,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: sx(*), sy(*)
+        ! authors:
+        ! ========
+        ! lawson, c. l., (jpl), hanson, r. j., (snla),
+        ! kincaid, d. r., (u. of texas), krogh, f. t., (jpl)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, kx, ky, ns
+           ! Intrinsic Functions 
+           intrinsic :: real
+           stdlib_qsdot = zero
+           if (n<=0) return
+           if (incx==incy .and. incx>0) then
+           ! code for equal, positive, non-unit increments.
+              ns = n*incx
+              do i = 1,ns,incx
+                 stdlib_qsdot = stdlib_qsdot + real(sx(i),KIND=qp)*real(sy(i),KIND=qp)
+              end do
+           else
+           ! code for unequal or nonpositive increments.
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              do i = 1,n
+                 stdlib_qsdot = stdlib_qsdot + real(sx(kx),KIND=qp)*real(sy(ky),KIND=qp)
+                 kx = kx + incx
+                 ky = ky + incy
+              end do
+           end if
+           return
+     end function stdlib_qsdot
+
+     !> DSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_qspmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*ap(kk+j-1) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*ap(kk+j-1) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*ap(kk)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*ap(kk)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qspmv
+
+     !> DSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_qspr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qspr
+
+     !> DSPR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_qspr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qspr2
+
+     !> DSWAP: interchanges two vectors.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_qswap(n,dx,incx,dy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: dx(*), dy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dtemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+             ! clean-up loop
+              m = mod(n,3)
+              if (m/=0) then
+                 do i = 1,m
+                    dtemp = dx(i)
+                    dx(i) = dy(i)
+                    dy(i) = dtemp
+                 end do
+                 if (n<3) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,3
+                 dtemp = dx(i)
+                 dx(i) = dy(i)
+                 dy(i) = dtemp
+                 dtemp = dx(i+1)
+                 dx(i+1) = dy(i+1)
+                 dy(i+1) = dtemp
+                 dtemp = dx(i+2)
+                 dx(i+2) = dy(i+2)
+                 dy(i+2) = dtemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 dtemp = dx(ix)
+                 dx(ix) = dy(iy)
+                 dy(iy) = dtemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_qswap
+
+     !> DSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars,  A is a symmetric matrix and  B and
+     !> C are  m by n matrices.
+
+     pure subroutine stdlib_qsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==zero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_qsymm
+
+     !> DSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_qsymv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(j,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(j,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(j,j)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(j,j)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsymv
+
+     !> DSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_qsyr(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsyr
+
+     !> DSYR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n symmetric matrix.
+
+     pure subroutine stdlib_qsyr2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsyr2
+
+     !> DSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_qsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsyr2k
+
+     !> DSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_qsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_qsyrk
+
+     !> DTBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_qtbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtbmv
+
+     !> DTBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_qtbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t)*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtbsv
+
+     !> DTPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_qtpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk - 1
+                           do i = j - 1,1,-1
+                               temp = temp + ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk - 1,kk - j + 1,-1
+                               ix = ix - incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk + 1
+                           do i = j + 1,n
+                               temp = temp + ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtpmv
+
+     !> DTPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_qtpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           do i = 1,j - 1
+                               temp = temp - ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           do i = n,j + 1,-1
+                               temp = temp - ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk - (n- (j+1)),-1
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtpsv
+
+     !> DTRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+
+     pure subroutine stdlib_qtrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = 1,i - 1
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = i + 1,m
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t.
+                   if (upper) then
+                       do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtrmm
+
+     !> DTRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_qtrmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               ix = ix - incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               ix = ix + incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtrmv
+
+     !> DTRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_qtrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               do k = 1,i - 1
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               do k = i + 1,m
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t ).
+                   if (upper) then
+                       do k = n,1,-1
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = 1,n
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtrsm
+
+     !> DTRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_qtrsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('DTRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_qtrsv
+
+     !> DZASUM: takes the sum of the (|Re(.)| + |Im(.)|)'s of a complex vector and
+     !> returns a quad precision result.
+
+     pure real(qp) function stdlib_qzasum(n,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: stemp
+           integer(ilp) :: i, nincx
+           stdlib_qzasum = zero
+           stemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 stemp = stemp + stdlib_qcabs1(zx(i))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 stemp = stemp + stdlib_qcabs1(zx(i))
+              end do
+           end if
+           stdlib_qzasum = stemp
+           return
+     end function stdlib_qzasum
+
+     !> !
+     !>
+     !> DZNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> DZNRM2 := sqrt( x**H*x )
+
+     pure function stdlib_qznrm2( n, x, incx )
+        real(qp) :: stdlib_qznrm2
+        ! -- reference blas level1 routine (version 3.9.1_qp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._qp)
+        real(qp), parameter :: maxn = huge(0.0_qp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        complex(qp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(qp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_qznrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=qp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_qznrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_qznrm2
+
+
+
+end module stdlib_linalg_blas_q
+#:endif
diff --git a/src/stdlib_linalg_blas_s.fypp b/src/stdlib_linalg_blas_s.fypp
new file mode 100644
index 000000000..504af9d3a
--- /dev/null
+++ b/src/stdlib_linalg_blas_s.fypp
@@ -0,0 +1,4522 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas_s
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_sasum
+     public :: stdlib_saxpy
+     public :: stdlib_scasum
+     public :: stdlib_scnrm2
+     public :: stdlib_scopy
+     public :: stdlib_sdot
+     public :: stdlib_sdsdot
+     public :: stdlib_sgbmv
+     public :: stdlib_sgemm
+     public :: stdlib_sgemv
+     public :: stdlib_sger
+     public :: stdlib_snrm2
+     public :: stdlib_srot
+     public :: stdlib_srotg
+     public :: stdlib_srotm
+     public :: stdlib_srotmg
+     public :: stdlib_ssbmv
+     public :: stdlib_sscal
+     public :: stdlib_sspmv
+     public :: stdlib_sspr
+     public :: stdlib_sspr2
+     public :: stdlib_sswap
+     public :: stdlib_ssymm
+     public :: stdlib_ssymv
+     public :: stdlib_ssyr
+     public :: stdlib_ssyr2
+     public :: stdlib_ssyr2k
+     public :: stdlib_ssyrk
+     public :: stdlib_stbmv
+     public :: stdlib_stbsv
+     public :: stdlib_stpmv
+     public :: stdlib_stpsv
+     public :: stdlib_strmm
+     public :: stdlib_strmv
+     public :: stdlib_strsm
+     public :: stdlib_strsv
+
+     ! 32-bit real constants 
+     real(sp),    parameter, private ::     negone = -1.00_sp
+     real(sp),    parameter, private ::       zero = 0.00_sp
+     real(sp),    parameter, private ::       half = 0.50_sp
+     real(sp),    parameter, private ::        one = 1.00_sp
+     real(sp),    parameter, private ::        two = 2.00_sp
+     real(sp),    parameter, private ::      three = 3.00_sp
+     real(sp),    parameter, private ::       four = 4.00_sp
+     real(sp),    parameter, private ::      eight = 8.00_sp
+     real(sp),    parameter, private ::        ten = 10.00_sp
+
+     ! 32-bit complex constants 
+     complex(sp), parameter, private :: czero   = ( 0.0_sp,0.0_sp)
+     complex(sp), parameter, private :: chalf   = ( 0.5_sp,0.0_sp)
+     complex(sp), parameter, private :: cone    = ( 1.0_sp,0.0_sp)
+     complex(sp), parameter, private :: cnegone = (-1.0_sp,0.0_sp)
+
+     ! 32-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(sp),    parameter, private :: rradix = real(radix(zero),sp) 
+     real(sp),    parameter, private :: ulp    = epsilon(zero) 
+     real(sp),    parameter, private :: eps    = ulp*half 
+     real(sp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(sp),    parameter, private :: safmax = one/safmin 
+     real(sp),    parameter, private :: smlnum = safmin/ulp 
+     real(sp),    parameter, private :: bignum = safmax*ulp 
+     real(sp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(sp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 32-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(sp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(sp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(sp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(sp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> SASUM: takes the sum of the absolute values.
+     !> uses unrolled loops for increment equal to one.
+
+     pure real(sp) function stdlib_sasum(n,sx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: stemp
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: abs,mod
+           stdlib_sasum = zero
+           stemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,6)
+              if (m/=0) then
+                 do i = 1,m
+                    stemp = stemp + abs(sx(i))
+                 end do
+                 if (n<6) then
+                    stdlib_sasum = stemp
+                    return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,6
+                 stemp = stemp + abs(sx(i)) + abs(sx(i+1)) +abs(sx(i+2)) + abs(sx(i+3)) +abs(sx(i+&
+                           4)) + abs(sx(i+5))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 stemp = stemp + abs(sx(i))
+              end do
+           end if
+           stdlib_sasum = stemp
+           return
+     end function stdlib_sasum
+
+     !> SAXPY: constant times a vector plus a vector.
+     !> uses unrolled loops for increments equal to one.
+
+     pure subroutine stdlib_saxpy(n,sa,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sa
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*)
+           real(sp), intent(inout) :: sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (sa==0.0_sp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,4)
+              if (m/=0) then
+                 do i = 1,m
+                    sy(i) = sy(i) + sa*sx(i)
+                 end do
+              end if
+              if (n<4) return
+              mp1 = m + 1
+              do i = mp1,n,4
+                 sy(i) = sy(i) + sa*sx(i)
+                 sy(i+1) = sy(i+1) + sa*sx(i+1)
+                 sy(i+2) = sy(i+2) + sa*sx(i+2)
+                 sy(i+3) = sy(i+3) + sa*sx(i+3)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+               sy(iy) = sy(iy) + sa*sx(ix)
+               ix = ix + incx
+               iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_saxpy
+
+     !> SCASUM: takes the sum of the (|Re(.)| + |Im(.)|)'s of a complex vector and
+     !> returns a single precision result.
+
+     pure real(sp) function stdlib_scasum(n,cx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: stemp
+           integer(ilp) :: i, nincx
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           stdlib_scasum = zero
+           stemp = zero
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 stemp = stemp + abs(real(cx(i),KIND=sp)) + abs(aimag(cx(i)))
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 stemp = stemp + abs(real(cx(i),KIND=sp)) + abs(aimag(cx(i)))
+              end do
+           end if
+           stdlib_scasum = stemp
+           return
+     end function stdlib_scasum
+
+     !> !
+     !>
+     !> SCNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> SCNRM2 := sqrt( x**H*x )
+
+     pure function stdlib_scnrm2( n, x, incx )
+        real(sp) :: stdlib_scnrm2
+        ! -- reference blas level1 routine (version 3.9.1_sp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._sp)
+        real(sp), parameter :: maxn = huge(0.0_sp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        complex(sp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(sp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_scnrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=sp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_scnrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_scnrm2
+
+     !> SCOPY: copies a vector, x, to a vector, y.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_scopy(n,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*)
+           real(sp), intent(out) :: sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,7)
+              if (m/=0) then
+                 do i = 1,m
+                    sy(i) = sx(i)
+                 end do
+                 if (n<7) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,7
+                 sy(i) = sx(i)
+                 sy(i+1) = sx(i+1)
+                 sy(i+2) = sx(i+2)
+                 sy(i+3) = sx(i+3)
+                 sy(i+4) = sx(i+4)
+                 sy(i+5) = sx(i+5)
+                 sy(i+6) = sx(i+6)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 sy(iy) = sx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_scopy
+
+     !> SDOT: forms the dot product of two vectors.
+     !> uses unrolled loops for increments equal to one.
+
+     pure real(sp) function stdlib_sdot(n,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*), sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: stemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           stemp = zero
+           stdlib_sdot = zero
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    stemp = stemp + sx(i)*sy(i)
+                 end do
+                 if (n<5) then
+                    stdlib_sdot=stemp
+                 return
+                 end if
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+               stemp = stemp + sx(i)*sy(i) + sx(i+1)*sy(i+1) +sx(i+2)*sy(i+2) + sx(i+3)*sy(i+3) + &
+                         sx(i+4)*sy(i+4)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 stemp = stemp + sx(ix)*sy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_sdot = stemp
+           return
+     end function stdlib_sdot
+
+     !> Compute the inner product of two vectors with extended
+     !> precision accumulation.
+     !> Returns S.P. result with dot product accumulated in D.P.
+     !> SDSDOT: = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
+     !> where LX = 1 if INCX >= 0, else LX = 1+(1-N)*INCX, and LY is
+     !> defined in a similar way using INCY.
+
+     pure real(sp) function stdlib_sdsdot(n,sb,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sb
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sx(*), sy(*)
+           ! Local Scalars 
+           real(dp) :: dsdot
+           integer(ilp) :: i, kx, ky, ns
+           ! Intrinsic Functions 
+           intrinsic :: real
+           dsdot = sb
+           if (n<=0) then
+              stdlib_sdsdot = dsdot
+              return
+           end if
+           if (incx==incy .and. incx>0) then
+           ! code for equal and positive increments.
+              ns = n*incx
+              do i = 1,ns,incx
+                 dsdot = dsdot + real(sx(i),KIND=sp)*real(sy(i),KIND=sp)
+              end do
+           else
+           ! code for unequal or nonpositive increments.
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              do i = 1,n
+                 dsdot = dsdot + real(sx(kx),KIND=sp)*real(sy(ky),KIND=sp)
+                 kx = kx + incx
+                 ky = ky + incy
+              end do
+           end if
+           stdlib_sdsdot = dsdot
+           return
+     end function stdlib_sdsdot
+
+     !> SGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_sgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           temp = temp + a(k+i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sgbmv
+
+     !> SGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_sgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: nota, notb
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! transposed and set  nrowa and nrowb  as the number of rows of  a
+           ! and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.stdlib_lsame(transa,'C')) .and.(.not.stdlib_lsame(transa,&
+                     'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.stdlib_lsame(transb,'C')) .and.(.not.stdlib_lsame(&
+                     transb,'T'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==zero).or. (k==0)).and. (beta==one))) &
+                     return
+           ! and if  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (nota) then
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,m
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sgemm
+
+     !> SGEMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_sgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,leny
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = zero
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(i)
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = zero
+                       ix = kx
+                       do i = 1,m
+                           temp = temp + a(i,j)*x(ix)
+                           ix = ix + incx
+                       end do
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sgemv
+
+     !> SGER:   performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_sger(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SGER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==zero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=zero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_sger
+
+     !> !
+     !>
+     !> SNRM2: returns the euclidean norm of a vector via the function
+     !> name, so that
+     !> SNRM2 := sqrt( x'*x ).
+
+     pure function stdlib_snrm2( n, x, incx )
+        real(sp) :: stdlib_snrm2
+        ! -- reference blas level1 routine (version 3.9.1_sp) --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! march 2021
+        ! Constants 
+        integer, parameter :: wp = kind(1._sp)
+        real(sp), parameter :: maxn = huge(0.0_sp)
+        ! .. blue's scaling constants ..
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        ! Array Arguments 
+        real(sp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(sp) :: abig, amed, asml, ax, scl, sumsq, ymax, ymin
+        ! quick return if possible
+        stdlib_snrm2 = zero
+        if( n <= 0 ) return
+        scl = one
+        sumsq = zero
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if ( (amed > zero) .or. (amed > maxn) .or. (amed /= amed) ) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range
+           scl = one
+           sumsq = amed
+        end if
+        stdlib_snrm2 = scl*sqrt( sumsq )
+        return
+     end function stdlib_snrm2
+
+     !> applies a plane rotation.
+
+     pure subroutine stdlib_srot(n,sx,incx,sy,incy,c,s)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: c, s
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: sx(*), sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: stemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 stemp = c*sx(i) + s*sy(i)
+                 sy(i) = c*sy(i) - s*sx(i)
+                 sx(i) = stemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 stemp = c*sx(ix) + s*sy(iy)
+                 sy(iy) = c*sy(iy) - s*sx(ix)
+                 sx(ix) = stemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_srot
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> sigma = sgn(a)    if |a| >  |b|
+     !> = sgn(b)    if |b| >= |a|
+     !> r = sigma*sqrt( a**2 + b**2 )
+     !> c = 1; s = 0      if r = 0
+     !> c = a/r; s = b/r  if r != 0
+     !> The subroutine also computes
+     !> z = s    if |a| > |b|,
+     !> = 1/c  if |b| >= |a| and c != 0
+     !> = 1    if c = 0
+     !> This allows c and s to be reconstructed from z as follows:
+     !> If z = 1, set c = 0, s = 1.
+     !> If |z| < 1, set c = sqrt(1 - z**2) and s = z.
+     !> If |z| > 1, set c = 1/z and s = sqrt( 1 - c**2).
+
+     pure subroutine stdlib_srotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._sp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(sp), intent(inout) :: a, b
+        real(sp), intent(out) :: c, s
+        ! Local Scalars 
+        real(sp) :: anorm, bnorm, scl, sigma, r, z
+        anorm = abs(a)
+        bnorm = abs(b)
+        if( bnorm == zero ) then
+           c = one
+           s = zero
+           b = zero
+        else if( anorm == zero ) then
+           c = zero
+           s = one
+           a = b
+           b = one
+        else
+           scl = min( safmax, max( safmin, anorm, bnorm ) )
+           if( anorm > bnorm ) then
+              sigma = sign(one,a)
+           else
+              sigma = sign(one,b)
+           end if
+           r = sigma*( scl*sqrt((a/scl)**2 + (b/scl)**2) )
+           c = a/r
+           s = b/r
+           if( anorm > bnorm ) then
+              z = s
+           else if( c /= zero ) then
+              z = one/c
+           else
+              z = one
+           end if
+           a = r
+           b = z
+        end if
+        return
+     end subroutine stdlib_srotg
+
+     !> APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
+     !> (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN
+     !> (SX**T)
+     !> SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX >= 0, ELSE
+     !> LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY.
+     !> WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> SFLAG=-1._sp     SFLAG=0._sp        SFLAG=1._sp     SFLAG=-2.E0
+     !> (SH11  SH12)    (1._sp  SH12)    (SH11  1._sp)    (1._sp  0._sp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (SH21  SH22),   (SH21  1._sp),   (-1._sp SH22),   (0._sp  1._sp).
+     !> SEE  SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM.
+
+     pure subroutine stdlib_srotm(n,sx,incx,sy,incy,sparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sparam(5)
+           real(sp), intent(inout) :: sx(*), sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: sflag, sh11, sh12, sh21, sh22, two, w, z, zero
+           integer(ilp) :: i, kx, ky, nsteps
+           ! Data Statements 
+           zero = 0.0_sp
+           two = 2.0_sp
+           sflag = sparam(1)
+           if (n<=0 .or. (sflag+two==zero)) return
+           if (incx==incy.and.incx>0) then
+              nsteps = n*incx
+              if (sflag<zero) then
+                 sh11 = sparam(2)
+                 sh12 = sparam(4)
+                 sh21 = sparam(3)
+                 sh22 = sparam(5)
+                 do i = 1,nsteps,incx
+                    w = sx(i)
+                    z = sy(i)
+                    sx(i) = w*sh11 + z*sh12
+                    sy(i) = w*sh21 + z*sh22
+                 end do
+              else if (sflag==zero) then
+                 sh12 = sparam(4)
+                 sh21 = sparam(3)
+                 do i = 1,nsteps,incx
+                    w = sx(i)
+                    z = sy(i)
+                    sx(i) = w + z*sh12
+                    sy(i) = w*sh21 + z
+                 end do
+              else
+                 sh11 = sparam(2)
+                 sh22 = sparam(5)
+                 do i = 1,nsteps,incx
+                    w = sx(i)
+                    z = sy(i)
+                    sx(i) = w*sh11 + z
+                    sy(i) = -w + sh22*z
+                 end do
+              end if
+           else
+              kx = 1
+              ky = 1
+              if (incx<0) kx = 1 + (1-n)*incx
+              if (incy<0) ky = 1 + (1-n)*incy
+              if (sflag<zero) then
+                 sh11 = sparam(2)
+                 sh12 = sparam(4)
+                 sh21 = sparam(3)
+                 sh22 = sparam(5)
+                 do i = 1,n
+                    w = sx(kx)
+                    z = sy(ky)
+                    sx(kx) = w*sh11 + z*sh12
+                    sy(ky) = w*sh21 + z*sh22
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else if (sflag==zero) then
+                 sh12 = sparam(4)
+                 sh21 = sparam(3)
+                 do i = 1,n
+                    w = sx(kx)
+                    z = sy(ky)
+                    sx(kx) = w + z*sh12
+                    sy(ky) = w*sh21 + z
+                    kx = kx + incx
+                    ky = ky + incy
+                 end do
+              else
+                  sh11 = sparam(2)
+                  sh22 = sparam(5)
+                  do i = 1,n
+                     w = sx(kx)
+                     z = sy(ky)
+                     sx(kx) = w*sh11 + z
+                     sy(ky) = -w + sh22*z
+                     kx = kx + incx
+                     ky = ky + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_srotm
+
+     !> CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
+     !> THE SECOND COMPONENT OF THE 2-VECTOR  (SQRT(SD1)*SX1,SQRT(SD2)    SY2)**T.
+     !> WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS..
+     !> SFLAG=-1._sp     SFLAG=0._sp        SFLAG=1._sp     SFLAG=-2.E0
+     !> (SH11  SH12)    (1._sp  SH12)    (SH11  1._sp)    (1._sp  0._sp)
+     !> H=(          )    (          )    (          )    (          )
+     !> (SH21  SH22),   (SH21  1._sp),   (-1._sp SH22),   (0._sp  1._sp).
+     !> LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22
+     !> RESPECTIVELY. (VALUES OF 1._sp, -1._sp, OR 0._sp IMPLIED BY THE
+     !> VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.)
+     !> THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
+     !> INEXACT.  THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
+     !> OF SD1 AND SD2.  ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
+
+     pure subroutine stdlib_srotmg(sd1,sd2,sx1,sy1,sparam)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(inout) :: sd1, sd2, sx1
+           real(sp), intent(in) :: sy1
+           ! Array Arguments 
+           real(sp), intent(out) :: sparam(5)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: gam, gamsq, one, rgamsq, sflag, sh11, sh12, sh21, sh22, sp1, sp2, sq1, sq2,&
+                      stemp, su, two, zero
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Data Statements 
+           zero = 0.0_sp
+           one = 1.0_sp
+           two = 2.0_sp
+           gam = 4096.0_sp
+           gamsq = 1.67772e7_sp
+           rgamsq = 5.96046e-8_sp
+           if (sd1<zero) then
+              ! go zero-h-d-and-sx1..
+              sflag = -one
+              sh11 = zero
+              sh12 = zero
+              sh21 = zero
+              sh22 = zero
+              sd1 = zero
+              sd2 = zero
+              sx1 = zero
+           else
+              ! case-sd1-nonnegative
+              sp2 = sd2*sy1
+              if (sp2==zero) then
+                 sflag = -two
+                 sparam(1) = sflag
+                 return
+              end if
+              ! regular-case..
+              sp1 = sd1*sx1
+              sq2 = sp2*sy1
+              sq1 = sp1*sx1
+              if (abs(sq1)>abs(sq2)) then
+                 sh21 = -sy1/sx1
+                 sh12 = sp2/sp1
+                 su = one - sh12*sh21
+                if (su>zero) then
+                  sflag = zero
+                  sd1 = sd1/su
+                  sd2 = sd2/su
+                  sx1 = sx1*su
+                else
+                  ! this code path if here for safety. we do not expect this
+                  ! condition to ever hold except in edge cases with rounding
+                  ! errors. see doi: 10.1145/355841.355847
+                  sflag = -one
+                  sh11 = zero
+                  sh12 = zero
+                  sh21 = zero
+                  sh22 = zero
+                  sd1 = zero
+                  sd2 = zero
+                  sx1 = zero
+                end if
+              else
+                 if (sq2<zero) then
+                    ! go zero-h-d-and-sx1..
+                    sflag = -one
+                    sh11 = zero
+                    sh12 = zero
+                    sh21 = zero
+                    sh22 = zero
+                    sd1 = zero
+                    sd2 = zero
+                    sx1 = zero
+                 else
+                    sflag = one
+                    sh11 = sp1/sp2
+                    sh22 = sx1/sy1
+                    su = one + sh11*sh22
+                    stemp = sd2/su
+                    sd2 = sd1/su
+                    sd1 = stemp
+                    sx1 = sy1*su
+                 end if
+              end if
+           ! procedure..scale-check
+              if (sd1/=zero) then
+                 do while ((sd1<=rgamsq) .or. (sd1>=gamsq))
+                    if (sflag==zero) then
+                       sh11 = one
+                       sh22 = one
+                       sflag = -one
+                    else
+                       sh21 = -one
+                       sh12 = one
+                       sflag = -one
+                    end if
+                    if (sd1<=rgamsq) then
+                       sd1 = sd1*gam**2
+                       sx1 = sx1/gam
+                       sh11 = sh11/gam
+                       sh12 = sh12/gam
+                    else
+                       sd1 = sd1/gam**2
+                       sx1 = sx1*gam
+                       sh11 = sh11*gam
+                       sh12 = sh12*gam
+                    end if
+                 enddo
+              end if
+              if (sd2/=zero) then
+                 do while ( (abs(sd2)<=rgamsq) .or. (abs(sd2)>=gamsq) )
+                    if (sflag==zero) then
+                       sh11 = one
+                       sh22 = one
+                       sflag = -one
+                    else
+                       sh21 = -one
+                       sh12 = one
+                       sflag = -one
+                    end if
+                    if (abs(sd2)<=rgamsq) then
+                       sd2 = sd2*gam**2
+                       sh21 = sh21/gam
+                       sh22 = sh22/gam
+                    else
+                       sd2 = sd2/gam**2
+                       sh21 = sh21*gam
+                       sh22 = sh22*gam
+                    end if
+                 end do
+              end if
+           end if
+           if (sflag<zero) then
+              sparam(2) = sh11
+              sparam(3) = sh21
+              sparam(4) = sh12
+              sparam(5) = sh22
+           else if (sflag==zero) then
+              sparam(3) = sh21
+              sparam(4) = sh12
+           else
+              sparam(2) = sh11
+              sparam(5) = sh22
+           end if
+           sparam(1) = sflag
+           return
+     end subroutine stdlib_srotmg
+
+     !> SSBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_ssbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with one pass through a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(kplus1,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(kplus1,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(1,j)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(1,j)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + a(l+i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssbmv
+
+     !> SSCAL: scales a vector by a constant.
+     !> uses unrolled loops for increment equal to 1.
+
+     pure subroutine stdlib_sscal(n,sa,sx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sa
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: sx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, m, mp1, nincx
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              ! clean-up loop
+              m = mod(n,5)
+              if (m/=0) then
+                 do i = 1,m
+                    sx(i) = sa*sx(i)
+                 end do
+                 if (n<5) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,5
+                 sx(i) = sa*sx(i)
+                 sx(i+1) = sa*sx(i+1)
+                 sx(i+2) = sa*sx(i+2)
+                 sx(i+3) = sa*sx(i+3)
+                 sx(i+4) = sa*sx(i+4)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 sx(i) = sa*sx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_sscal
+
+     !> SSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_sspmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*ap(kk+j-1) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*ap(kk+j-1) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*ap(kk)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*ap(kk)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + ap(k)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sspmv
+
+     !> SSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_sspr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sspr
+
+     !> SSPR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_sspr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with one pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = 1,j
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 1
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           k = kk
+                           do i = j,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do k = kk,kk + n - j
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_sspr2
+
+     !> SSWAP: interchanges two vectors.
+     !> uses unrolled loops for increments equal to 1.
+
+     pure subroutine stdlib_sswap(n,sx,incx,sy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: sx(*), sy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: stemp
+           integer(ilp) :: i, ix, iy, m, mp1
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+             ! clean-up loop
+              m = mod(n,3)
+              if (m/=0) then
+                 do i = 1,m
+                    stemp = sx(i)
+                    sx(i) = sy(i)
+                    sy(i) = stemp
+                 end do
+                 if (n<3) return
+              end if
+              mp1 = m + 1
+              do i = mp1,n,3
+                 stemp = sx(i)
+                 sx(i) = sy(i)
+                 sy(i) = stemp
+                 stemp = sx(i+1)
+                 sx(i+1) = sy(i+1)
+                 sy(i+1) = stemp
+                 stemp = sx(i+2)
+                 sx(i+2) = sy(i+2)
+                 sy(i+2) = stemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 stemp = sx(ix)
+                 sx(ix) = sy(iy)
+                 sy(iy) = stemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_sswap
+
+     !> SSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars,  A is a symmetric matrix and  B and
+     !> C are  m by n matrices.
+
+     pure subroutine stdlib_ssymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==zero).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (beta==zero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = zero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = zero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==zero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_ssymm
+
+     !> SSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_ssymv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==zero).and. (beta==one))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=one) then
+               if (incy==1) then
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(i) = zero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==zero) then
+                       do i = 1,n
+                           y(iy) = zero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==zero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + temp1*a(j,j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*a(j,j) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = zero
+                       y(j) = y(j) + temp1*a(j,j)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = zero
+                       y(jy) = y(jy) + temp1*a(j,j)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + a(i,j)*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssymv
+
+     !> SSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**T + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_ssyr(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = kx
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=zero) then
+                           temp = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=zero) then
+                           temp = alpha*x(jx)
+                           ix = jx
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssyr
+
+     !> SSYR2:  performs the symmetric rank 2 operation
+     !> A := alpha*x*y**T + alpha*y*x**T + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n symmetric matrix.
+
+     pure subroutine stdlib_ssyr2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==zero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = kx
+                           iy = ky
+                           do i = 1,j
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=zero) .or. (y(j)/=zero)) then
+                           temp1 = alpha*y(j)
+                           temp2 = alpha*x(j)
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=zero) .or. (y(jy)/=zero)) then
+                           temp1 = alpha*y(jy)
+                           temp2 = alpha*x(jx)
+                           ix = jx
+                           iy = jy
+                           do i = j,n
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssyr2
+
+     !> SSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_ssyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=zero) .or. (b(j,l)/=zero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = zero
+                           temp2 = zero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssyr2k
+
+     !> SSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_ssyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T')) .and.(&
+                     .not.stdlib_lsame(trans,'C'))) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('SSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=zero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_ssyrk
+
+     !> STBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_stbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (nounit) temp = temp*a(kplus1,j)
+                           do i = j - 1,max(1,j-k),-1
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (nounit) temp = temp*a(1,j)
+                           do i = j + 1,min(n,j+k)
+                               temp = temp + a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_stbmv
+
+     !> STBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_stbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=zero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t)*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           do i = max(1,j-k),j - 1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(kplus1,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           do i = min(n,j+k),j + 1,-1
+                               temp = temp - a(l+i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(1,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_stbsv
+
+     !> STPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_stpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk - 1
+                           do i = j - 1,1,-1
+                               temp = temp + ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk - 1,kk - j + 1,-1
+                               ix = ix - incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*ap(kk)
+                           k = kk + 1
+                           do i = j + 1,n
+                               temp = temp + ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*ap(kk)
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               temp = temp + ap(k)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_stpmv
+
+     !> STPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_stpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: nounit
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with one pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           do i = 1,j - 1
+                               temp = temp - ap(k)*x(i)
+                               k = k + 1
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/ap(kk+j-1)
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           do i = n,j + 1,-1
+                               temp = temp - ap(k)*x(i)
+                               k = k - 1
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do k = kk,kk - (n- (j+1)),-1
+                               temp = temp - ap(k)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/ap(kk-n+j)
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_stpsv
+
+     !> STRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+
+     pure subroutine stdlib_strmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = 1,i - 1
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (nounit) temp = temp*a(i,i)
+                               do k = i + 1,m
+                                   temp = temp + a(k,i)*b(k,j)
+                               end do
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t.
+                   if (upper) then
+                       do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = alpha*a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) temp = temp*a(k,k)
+                           if (temp/=one) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_strmm
+
+     !> STRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_strmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j - 1,1,-1
+                               ix = ix - incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (nounit) temp = temp*a(j,j)
+                           do i = j + 1,n
+                               ix = ix + incx
+                               temp = temp + a(i,j)*x(ix)
+                           end do
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_strmv
+
+     !> STRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_strsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, nounit, upper
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = zero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=zero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               do k = 1,i - 1
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               do k = i + 1,m
+                                   temp = temp - a(k,i)*b(k,j)
+                               end do
+                               if (nounit) temp = temp/a(i,i)
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=zero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = one/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t ).
+                   if (upper) then
+                       do k = n,1,-1
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   else
+                       do k = 1,n
+                           if (nounit) then
+                               temp = one/a(k,k)
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=zero) then
+                                   temp = a(j,k)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=one) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_strsm
+
+     !> STRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_strsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('STRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with one pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=zero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=zero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           do i = 1,j - 1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(i)
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           do i = n,j + 1,-1
+                               temp = temp - a(i,j)*x(ix)
+                               ix = ix - incx
+                           end do
+                           if (nounit) temp = temp/a(j,j)
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_strsv
+
+
+
+end module stdlib_linalg_blas_s
diff --git a/src/stdlib_linalg_blas_w.fypp b/src/stdlib_linalg_blas_w.fypp
new file mode 100644
index 000000000..841dd9aee
--- /dev/null
+++ b/src/stdlib_linalg_blas_w.fypp
@@ -0,0 +1,5067 @@
+#:include "common.fypp" 
+#:if WITH_QP 
+module stdlib_linalg_blas_w
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     use stdlib_linalg_blas_c
+     use stdlib_linalg_blas_d
+     use stdlib_linalg_blas_z
+     use stdlib_linalg_blas_q
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_waxpy
+     public :: stdlib_wcopy
+     public :: stdlib_wdotc
+     public :: stdlib_wdotu
+     public :: stdlib_wdrot
+     public :: stdlib_wdscal
+     public :: stdlib_wgbmv
+     public :: stdlib_wgemm
+     public :: stdlib_wgemv
+     public :: stdlib_wgerc
+     public :: stdlib_wgeru
+     public :: stdlib_whbmv
+     public :: stdlib_whemm
+     public :: stdlib_whemv
+     public :: stdlib_wher
+     public :: stdlib_wher2
+     public :: stdlib_wher2k
+     public :: stdlib_wherk
+     public :: stdlib_whpmv
+     public :: stdlib_whpr
+     public :: stdlib_whpr2
+     public :: stdlib_wrotg
+     public :: stdlib_wscal
+     public :: stdlib_wswap
+     public :: stdlib_wsymm
+     public :: stdlib_wsyr2k
+     public :: stdlib_wsyrk
+     public :: stdlib_wtbmv
+     public :: stdlib_wtbsv
+     public :: stdlib_wtpmv
+     public :: stdlib_wtpsv
+     public :: stdlib_wtrmm
+     public :: stdlib_wtrmv
+     public :: stdlib_wtrsm
+     public :: stdlib_wtrsv
+
+     ! 128-bit real constants 
+     real(qp),    parameter, private ::     negone = -1.00_qp
+     real(qp),    parameter, private ::       zero = 0.00_qp
+     real(qp),    parameter, private ::       half = 0.50_qp
+     real(qp),    parameter, private ::        one = 1.00_qp
+     real(qp),    parameter, private ::        two = 2.00_qp
+     real(qp),    parameter, private ::      three = 3.00_qp
+     real(qp),    parameter, private ::       four = 4.00_qp
+     real(qp),    parameter, private ::      eight = 8.00_qp
+     real(qp),    parameter, private ::        ten = 10.00_qp
+
+     ! 128-bit complex constants 
+     complex(qp), parameter, private :: czero   = ( 0.0_qp,0.0_qp)
+     complex(qp), parameter, private :: chalf   = ( 0.5_qp,0.0_qp)
+     complex(qp), parameter, private :: cone    = ( 1.0_qp,0.0_qp)
+     complex(qp), parameter, private :: cnegone = (-1.0_qp,0.0_qp)
+
+     ! 128-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(qp),    parameter, private :: rradix = real(radix(zero),qp) 
+     real(qp),    parameter, private :: ulp    = epsilon(zero) 
+     real(qp),    parameter, private :: eps    = ulp*half 
+     real(qp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(qp),    parameter, private :: safmax = one/safmin 
+     real(qp),    parameter, private :: smlnum = safmin/ulp 
+     real(qp),    parameter, private :: bignum = safmax*ulp 
+     real(qp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(qp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 128-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(qp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(qp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(qp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(qp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> ZAXPY: constant times a vector plus a vector.
+
+     pure subroutine stdlib_waxpy(n,za,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: za
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*)
+           complex(qp), intent(inout) :: zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (stdlib_qcabs1(za)==0.0_qp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 zy(i) = zy(i) + za*zx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 zy(iy) = zy(iy) + za*zx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_waxpy
+
+     !> ZCOPY: copies a vector, x, to a vector, y.
+
+     pure subroutine stdlib_wcopy(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*)
+           complex(qp), intent(out) :: zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+               zy(i) = zx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 zy(iy) = zx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_wcopy
+
+     !> ZDOTC: forms the dot product of two complex vectors
+     !> ZDOTC = X^H * Y
+
+     pure complex(qp) function stdlib_wdotc(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(qp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ztemp = (0.0_qp,0.0_qp)
+           stdlib_wdotc = (0.0_qp,0.0_qp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = ztemp + conjg(zx(i))*zy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = ztemp + conjg(zx(ix))*zy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_wdotc = ztemp
+           return
+     end function stdlib_wdotc
+
+     !> ZDOTU: forms the dot product of two complex vectors
+     !> ZDOTU = X^T * Y
+
+     pure complex(qp) function stdlib_wdotu(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(qp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           ztemp = (0.0_qp,0.0_qp)
+           stdlib_wdotu = (0.0_qp,0.0_qp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = ztemp + zx(i)*zy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = ztemp + zx(ix)*zy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_wdotu = ztemp
+           return
+     end function stdlib_wdotu
+
+     !> Applies a plane rotation, where the cos and sin (c and s) are real
+     !> and the vectors cx and cy are complex.
+     !> jack dongarra, linpack, 3/11/78.
+
+     pure subroutine stdlib_wdrot( n, zx, incx, zy, incy, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(qp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(qp), intent(inout) :: zx(*), zy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(qp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 ) then
+              ! code for both increments equal to 1
+              do i = 1, n
+                 ctemp = c*zx( i ) + s*zy( i )
+                 zy( i ) = c*zy( i ) - s*zx( i )
+                 zx( i ) = ctemp
+              end do
+           else
+              ! code for unequal increments or equal increments not equal
+                ! to 1
+              ix = 1
+              iy = 1
+              if( incx<0 )ix = ( -n+1 )*incx + 1
+              if( incy<0 )iy = ( -n+1 )*incy + 1
+              do i = 1, n
+                 ctemp = c*zx( ix ) + s*zy( iy )
+                 zy( iy ) = c*zy( iy ) - s*zx( ix )
+                 zx( ix ) = ctemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_wdrot
+
+     !> ZDSCAL: scales a vector by a constant.
+
+     pure subroutine stdlib_wdscal(n,da,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           ! Intrinsic Functions 
+           intrinsic :: cmplx
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 zx(i) = cmplx(da,0.0_qp,KIND=qp)*zx(i)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 zx(i) = cmplx(da,0.0_qp,KIND=qp)*zx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_wdscal
+
+     !> ZGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_wgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           complex(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(i)
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wgbmv
+
+     !> ZGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_wgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: conja, conjb, nota, notb
+           
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! conjugated or transposed, set  conja and conjb  as true if  a  and
+           ! b  respectively are to be  transposed but  not conjugated  and set
+           ! nrowa and nrowb  as the number of rows  of  a  and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           conja = stdlib_lsame(transa,'C')
+           conjb = stdlib_lsame(transb,'C')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.conja) .and.(.not.stdlib_lsame(transa,'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.conjb) .and.(.not.stdlib_lsame(transb,'T'))) &
+                     then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==czero).or. (k==0)).and. (beta==cone))) &
+                     return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else if (conja) then
+                 ! form  c := alpha*a**h*b + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else if (nota) then
+               if (conjb) then
+                 ! form  c := alpha*a*b**h + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*conjg(b(j,l))
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               end if
+           else if (conja) then
+               if (conjb) then
+                 ! form  c := alpha*a**h*b**h + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**h*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (conjb) then
+                 ! form  c := alpha*a**t*b**h + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wgemm
+
+     !> ZGEMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_wgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           complex(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wgemv
+
+     !> ZGERC:  performs the rank 1 operation
+     !> A := alpha*x*y**H + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_wgerc(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGERC ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_wgerc
+
+     !> ZGERU:  performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_wgeru(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGERU ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_wgeru
+
+     !> ZHBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_whbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           complex(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(kplus1,j),KIND=qp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(kplus1,j),KIND=qp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(1,j),KIND=qp)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(1,j),KIND=qp)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_whbmv
+
+     !> ZHEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars, A is an hermitian matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_whemm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=qp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=qp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=qp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=qp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*real(a(j,j),KIND=qp)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*conjg(a(j,k))
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*conjg(a(j,k))
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_whemm
+
+     !> ZHEMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix.
+
+     pure subroutine stdlib_whemv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           complex(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=qp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=qp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=qp)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=qp)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_whemv
+
+     !> ZHER:   performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix.
+
+     pure subroutine stdlib_wher(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=qp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                           a(j,j) = real(a(j,j),KIND=qp) + real(x(j)*temp,KIND=qp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           a(j,j) = real(a(j,j),KIND=qp) + real(x(jx)*temp,KIND=qp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           a(j,j) = real(a(j,j),KIND=qp) + real(temp*x(j),KIND=qp)
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           a(j,j) = real(a(j,j),KIND=qp) + real(temp*x(jx),KIND=qp)
+                           ix = jx
+                           do i = j + 1,n
+                               ix = ix + incx
+                               a(i,j) = a(i,j) + x(ix)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wher
+
+     !> ZHER2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n hermitian matrix.
+
+     pure subroutine stdlib_wher2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                           a(j,j) = real(a(j,j),KIND=qp) +real(x(j)*temp1+y(j)*temp2,KIND=qp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           a(j,j) = real(a(j,j),KIND=qp) +real(x(jx)*temp1+y(jy)*temp2,KIND=qp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           a(j,j) = real(a(j,j),KIND=qp) +real(x(j)*temp1+y(j)*temp2,KIND=qp)
+                                     
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           a(j,j) = real(a(j,j),KIND=qp) +real(x(jx)*temp1+y(jy)*temp2,KIND=qp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do i = j + 1,n
+                               ix = ix + incx
+                               iy = iy + incy
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wher2
+
+     !> ZHER2K:  performs one of the hermitian rank 2k operations
+     !> C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+     !> where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+     !> hermitian matrix and  A and B  are  n by k matrices in the first case
+     !> and  k by n  matrices in the second case.
+
+     pure subroutine stdlib_wher2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           real(qp), intent(in) :: beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==real(czero,KIND=qp)) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                       end do
+                   end if
+               else
+                   if (beta==real(czero,KIND=qp)) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**h + conjg( alpha )*b*a**h +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==real(czero,KIND=qp)) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=qp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=qp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=qp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==real(czero,KIND=qp)) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=qp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=qp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=qp)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*b + conjg( alpha )*b**h*a +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=qp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=qp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=qp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=qp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=qp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=qp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=qp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=qp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=qp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=qp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wher2k
+
+     !> ZHERK:  performs one of the hermitian rank k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+     !> matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_wherk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp
+           real(qp) :: rtemp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHERK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**h + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=qp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=qp)) then
+                               temp = alpha*conjg(a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                               c(j,j) = real(c(j,j),KIND=qp) + real(temp*a(i,l),KIND=qp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           c(j,j) = beta*real(c(j,j),KIND=qp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       else
+                           c(j,j) = real(c(j,j),KIND=qp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=qp)) then
+                               temp = alpha*conjg(a(j,l))
+                               c(j,j) = real(c(j,j),KIND=qp) + real(temp*a(j,l),KIND=qp)
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j - 1
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=qp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=qp)
+                       end if
+                       do i = j + 1,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wherk
+
+     !> ZHPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_whpmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*), x(*)
+           complex(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*real(ap(kk+j-1),KIND=qp) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(ap(kk+j-1),KIND=qp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(ap(kk),KIND=qp)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(ap(kk),KIND=qp)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_whpmv
+
+     !> ZHPR:    performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_whpr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=qp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp) + real(x(j)*temp,KIND=qp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp) + real(x(jx)*temp,KIND=qp)
+                                     
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           ap(kk) = real(ap(kk),KIND=qp) + real(temp*x(j),KIND=qp)
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=qp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ap(kk) = real(ap(kk),KIND=qp) + real(temp*x(jx),KIND=qp)
+                           ix = jx
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               ap(k) = ap(k) + x(ix)*temp
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_whpr
+
+     !> ZHPR2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_whpr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp) +real(x(j)*temp1+y(j)*temp2,&
+                                     KIND=qp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp) +real(x(jx)*temp1+y(jy)*temp2,&
+                                     KIND=qp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           ap(kk) = real(ap(kk),KIND=qp) +real(x(j)*temp1+y(j)*temp2,KIND=qp)
+                                     
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=qp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ap(kk) = real(ap(kk),KIND=qp) +real(x(jx)*temp1+y(jy)*temp2,KIND=qp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               iy = iy + incy
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=qp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_whpr2
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> |x| = sqrt( Re(x)**2 + Im(x)**2 )
+     !> sgn(x) = x / |x|  if x /= 0
+     !> = 1        if x  = 0
+     !> c = |a| / sqrt(|a|**2 + |b|**2)
+     !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
+     !> When a and b are real and r /= 0, the formulas simplify to
+     !> r = sgn(a)*sqrt(|a|**2 + |b|**2)
+     !> c = a / r
+     !> s = b / r
+     !> the same as in DROTG when |a| > |b|.  When |b| >= |a|, the
+     !> sign of c and s will be different from those computed by DROTG
+     !> if the signs of a and b are not the same.
+
+     pure subroutine stdlib_wrotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._qp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(qp), intent(out) :: c
+        complex(qp), intent(inout) :: a
+        complex(qp), intent(in) :: b
+        complex(qp), intent(out) :: s
+        ! Local Scalars 
+        real(qp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(qp) :: f, fs, g, gs, r, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(qp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=qp)**2 + aimag( t )**2
+        ! Executable Statements 
+        f = a
+        g = b
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=qp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=qp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=qp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        a = r
+        return
+     end subroutine stdlib_wrotg
+
+     !> ZSCAL: scales a vector by a constant.
+
+     pure subroutine stdlib_wscal(n,za,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: za
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 zx(i) = za*zx(i)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 zx(i) = za*zx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_wscal
+
+     !> ZSWAP: interchanges two vectors.
+
+     pure subroutine stdlib_wswap(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(qp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = zx(i)
+                 zx(i) = zy(i)
+                 zy(i) = ztemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = zx(ix)
+                 zx(ix) = zy(iy)
+                 zy(iy) = ztemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_wswap
+
+     !> ZSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where  alpha and beta are scalars, A is a symmetric matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_wsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_wsymm
+
+     !> ZSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_wsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(qp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wsyr2k
+
+     !> ZSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_wsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_wsyrk
+
+     !> ZTBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_wtbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtbmv
+
+     !> ZTBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_wtbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtbsv
+
+     !> ZTPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_wtpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk - 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j - 1,1,-1
+                                   temp = temp + ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk + 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j + 1,n
+                                   temp = temp + ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j + 1,n
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtpmv
+
+     !> ZTPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_wtpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk + j - 2
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do k = kk,kk + j - 2
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtpsv
+
+     !> ZTRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+     pure subroutine stdlib_wtrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b   or   b := alpha*a**h*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = 1,i - 1
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = 1,i - 1
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = i + 1,m
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = i + 1,m
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t   or   b := alpha*b*a**h.
+                   if (upper) then
+                       loop_280: do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_280
+                   else
+                       loop_320: do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_320
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtrmm
+
+     !> ZTRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_wtrmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtrmv
+
+     !> ZTRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_wtrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b
+                 ! or    b := alpha*inv( a**h )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = 1,i - 1
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = 1,i - 1
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = i + 1,m
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = i + 1,m
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t )
+                 ! or    b := alpha*b*inv( a**h ).
+                   if (upper) then
+                       loop_330: do k = n,1,-1
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_330
+                   else
+                       loop_380: do k = 1,n
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_380
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtrsm
+
+     !> ZTRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_wtrsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_wtrsv
+
+
+
+end module stdlib_linalg_blas_w
+#:endif
diff --git a/src/stdlib_linalg_blas_z.fypp b/src/stdlib_linalg_blas_z.fypp
new file mode 100644
index 000000000..00e4e3181
--- /dev/null
+++ b/src/stdlib_linalg_blas_z.fypp
@@ -0,0 +1,5063 @@
+#:include "common.fypp" 
+module stdlib_linalg_blas_z
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas_aux
+     use stdlib_linalg_blas_s
+     use stdlib_linalg_blas_c
+     use stdlib_linalg_blas_d
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_zaxpy
+     public :: stdlib_zcopy
+     public :: stdlib_zdotc
+     public :: stdlib_zdotu
+     public :: stdlib_zdrot
+     public :: stdlib_zdscal
+     public :: stdlib_zgbmv
+     public :: stdlib_zgemm
+     public :: stdlib_zgemv
+     public :: stdlib_zgerc
+     public :: stdlib_zgeru
+     public :: stdlib_zhbmv
+     public :: stdlib_zhemm
+     public :: stdlib_zhemv
+     public :: stdlib_zher
+     public :: stdlib_zher2
+     public :: stdlib_zher2k
+     public :: stdlib_zherk
+     public :: stdlib_zhpmv
+     public :: stdlib_zhpr
+     public :: stdlib_zhpr2
+     public :: stdlib_zrotg
+     public :: stdlib_zscal
+     public :: stdlib_zswap
+     public :: stdlib_zsymm
+     public :: stdlib_zsyr2k
+     public :: stdlib_zsyrk
+     public :: stdlib_ztbmv
+     public :: stdlib_ztbsv
+     public :: stdlib_ztpmv
+     public :: stdlib_ztpsv
+     public :: stdlib_ztrmm
+     public :: stdlib_ztrmv
+     public :: stdlib_ztrsm
+     public :: stdlib_ztrsv
+
+     ! 64-bit real constants 
+     real(dp),    parameter, private ::     negone = -1.00_dp
+     real(dp),    parameter, private ::       zero = 0.00_dp
+     real(dp),    parameter, private ::       half = 0.50_dp
+     real(dp),    parameter, private ::        one = 1.00_dp
+     real(dp),    parameter, private ::        two = 2.00_dp
+     real(dp),    parameter, private ::      three = 3.00_dp
+     real(dp),    parameter, private ::       four = 4.00_dp
+     real(dp),    parameter, private ::      eight = 8.00_dp
+     real(dp),    parameter, private ::        ten = 10.00_dp
+
+     ! 64-bit complex constants 
+     complex(dp), parameter, private :: czero   = ( 0.0_dp,0.0_dp)
+     complex(dp), parameter, private :: chalf   = ( 0.5_dp,0.0_dp)
+     complex(dp), parameter, private :: cone    = ( 1.0_dp,0.0_dp)
+     complex(dp), parameter, private :: cnegone = (-1.0_dp,0.0_dp)
+
+     ! 64-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(dp),    parameter, private :: rradix = real(radix(zero),dp) 
+     real(dp),    parameter, private :: ulp    = epsilon(zero) 
+     real(dp),    parameter, private :: eps    = ulp*half 
+     real(dp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(dp),    parameter, private :: safmax = one/safmin 
+     real(dp),    parameter, private :: smlnum = safmin/ulp 
+     real(dp),    parameter, private :: bignum = safmax*ulp 
+     real(dp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(dp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 64-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(dp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(dp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(dp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(dp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> ZAXPY: constant times a vector plus a vector.
+
+     pure subroutine stdlib_zaxpy(n,za,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: za
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*)
+           complex(dp), intent(inout) :: zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (stdlib_dcabs1(za)==0.0_dp) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 zy(i) = zy(i) + za*zx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 zy(iy) = zy(iy) + za*zx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_zaxpy
+
+     !> ZCOPY: copies a vector, x, to a vector, y.
+
+     pure subroutine stdlib_zcopy(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*)
+           complex(dp), intent(out) :: zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+               zy(i) = zx(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 zy(iy) = zx(ix)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_zcopy
+
+     !> ZDOTC: forms the dot product of two complex vectors
+     !> ZDOTC = X^H * Y
+
+     pure complex(dp) function stdlib_zdotc(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(dp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ztemp = (0.0_dp,0.0_dp)
+           stdlib_zdotc = (0.0_dp,0.0_dp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = ztemp + conjg(zx(i))*zy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = ztemp + conjg(zx(ix))*zy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_zdotc = ztemp
+           return
+     end function stdlib_zdotc
+
+     !> ZDOTU: forms the dot product of two complex vectors
+     !> ZDOTU = X^T * Y
+
+     pure complex(dp) function stdlib_zdotu(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(dp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           ztemp = (0.0_dp,0.0_dp)
+           stdlib_zdotu = (0.0_dp,0.0_dp)
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+              ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = ztemp + zx(i)*zy(i)
+              end do
+           else
+              ! code for unequal increments or equal increments
+                ! not equal to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = ztemp + zx(ix)*zy(iy)
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           stdlib_zdotu = ztemp
+           return
+     end function stdlib_zdotu
+
+     !> Applies a plane rotation, where the cos and sin (c and s) are real
+     !> and the vectors cx and cy are complex.
+     !> jack dongarra, linpack, 3/11/78.
+
+     pure subroutine stdlib_zdrot( n, zx, incx, zy, incy, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(dp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(dp), intent(inout) :: zx(*), zy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(dp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 ) then
+              ! code for both increments equal to 1
+              do i = 1, n
+                 ctemp = c*zx( i ) + s*zy( i )
+                 zy( i ) = c*zy( i ) - s*zx( i )
+                 zx( i ) = ctemp
+              end do
+           else
+              ! code for unequal increments or equal increments not equal
+                ! to 1
+              ix = 1
+              iy = 1
+              if( incx<0 )ix = ( -n+1 )*incx + 1
+              if( incy<0 )iy = ( -n+1 )*incy + 1
+              do i = 1, n
+                 ctemp = c*zx( ix ) + s*zy( iy )
+                 zy( iy ) = c*zy( iy ) - s*zx( ix )
+                 zx( ix ) = ctemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_zdrot
+
+     !> ZDSCAL: scales a vector by a constant.
+
+     pure subroutine stdlib_zdscal(n,da,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: da
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           ! Intrinsic Functions 
+           intrinsic :: cmplx
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 zx(i) = cmplx(da,0.0_dp,KIND=dp)*zx(i)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 zx(i) = cmplx(da,0.0_dp,KIND=dp)*zx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_zdscal
+
+     !> ZGBMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n band matrix, with kl sub-diagonals and ku super-diagonals.
+
+     pure subroutine stdlib_zgbmv(trans,m,n,kl,ku,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, kl, ku, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           complex(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kup1, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (kl<0) then
+               info = 4
+           else if (ku<0) then
+               info = 5
+           else if (lda< (kl+ku+1)) then
+               info = 8
+           else if (incx==0) then
+               info = 10
+           else if (incy==0) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the band part of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kup1 = ku + 1
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(i) = y(i) + temp*a(k+i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       k = kup1 - j
+                       do i = max(1,j-ku),min(m,j+kl)
+                           y(iy) = y(iy) + temp*a(k+i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                       if (j>ku) ky = ky + incy
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(i)
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       k = kup1 - j
+                       if (noconj) then
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + a(k+i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = max(1,j-ku),min(m,j+kl)
+                               temp = temp + conjg(a(k+i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                       if (j>ku) kx = kx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zgbmv
+
+     !> ZGEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*op( A )*op( B ) + beta*C,
+     !> where  op( X ) is one of
+     !> op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+     !> alpha and beta are scalars, and A, B and C are matrices, with op( A )
+     !> an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+
+     pure subroutine stdlib_zgemm(transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, m, n
+           character, intent(in) :: transa, transb
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa, nrowb
+           logical(lk) :: conja, conjb, nota, notb
+           
+           
+           ! set  nota  and  notb  as  true if  a  and  b  respectively are not
+           ! conjugated or transposed, set  conja and conjb  as true if  a  and
+           ! b  respectively are to be  transposed but  not conjugated  and set
+           ! nrowa and nrowb  as the number of rows  of  a  and  b  respectively.
+           nota = stdlib_lsame(transa,'N')
+           notb = stdlib_lsame(transb,'N')
+           conja = stdlib_lsame(transa,'C')
+           conjb = stdlib_lsame(transb,'C')
+           if (nota) then
+               nrowa = m
+           else
+               nrowa = k
+           end if
+           if (notb) then
+               nrowb = k
+           else
+               nrowb = n
+           end if
+           ! test the input parameters.
+           info = 0
+           if ((.not.nota) .and. (.not.conja) .and.(.not.stdlib_lsame(transa,'T'))) then
+               info = 1
+           else if ((.not.notb) .and. (.not.conjb) .and.(.not.stdlib_lsame(transb,'T'))) &
+                     then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda<max(1,nrowa)) then
+               info = 8
+           else if (ldb<max(1,nrowb)) then
+               info = 10
+           else if (ldc<max(1,m)) then
+               info = 13
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.(((alpha==czero).or. (k==0)).and. (beta==cone))) &
+                     return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (notb) then
+               if (nota) then
+                 ! form  c := alpha*a*b + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(l,j)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else if (conja) then
+                 ! form  c := alpha*a**h*b + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else if (nota) then
+               if (conjb) then
+                 ! form  c := alpha*a*b**h + beta*c.
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*conjg(b(j,l))
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a*b**t + beta*c
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,m
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,m
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           temp = alpha*b(j,l)
+                           do i = 1,m
+                               c(i,j) = c(i,j) + temp*a(i,l)
+                           end do
+                       end do
+                   end do
+               end if
+           else if (conja) then
+               if (conjb) then
+                 ! form  c := alpha*a**h*b**h + beta*c.
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**h*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+               if (conjb) then
+                 ! form  c := alpha*a**t*b**h + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*conjg(b(j,l))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                 ! form  c := alpha*a**t*b**t + beta*c
+                   do j = 1,n
+                       do i = 1,m
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*b(j,l)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zgemm
+
+     !> ZGEMV:  performs one of the matrix-vector operations
+     !> y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+     !> y := alpha*A**H*x + beta*y,
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+
+     pure subroutine stdlib_zgemv(trans,m,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           character, intent(in) :: trans
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           complex(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky, lenx, leny
+           logical(lk) :: noconj
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 1
+           else if (m<0) then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (lda<max(1,m)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           noconj = stdlib_lsame(trans,'T')
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if (stdlib_lsame(trans,'N')) then
+               lenx = n
+               leny = m
+           else
+               lenx = m
+               leny = n
+           end if
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (lenx-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (leny-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,leny
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,leny
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,leny
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(trans,'N')) then
+              ! form  y := alpha*a*x + y.
+               jx = kx
+               if (incy==1) then
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       do i = 1,m
+                           y(i) = y(i) + temp*a(i,j)
+                       end do
+                       jx = jx + incx
+                   end do
+               else
+                   do j = 1,n
+                       temp = alpha*x(jx)
+                       iy = ky
+                       do i = 1,m
+                           y(iy) = y(iy) + temp*a(i,j)
+                           iy = iy + incy
+                       end do
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  y := alpha*a**t*x + y  or  y := alpha*a**h*x + y.
+               jy = ky
+               if (incx==1) then
+                   do j = 1,n
+                       temp = czero
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(i)
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(i)
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               else
+                   do j = 1,n
+                       temp = czero
+                       ix = kx
+                       if (noconj) then
+                           do i = 1,m
+                               temp = temp + a(i,j)*x(ix)
+                               ix = ix + incx
+                           end do
+                       else
+                           do i = 1,m
+                               temp = temp + conjg(a(i,j))*x(ix)
+                               ix = ix + incx
+                           end do
+                       end if
+                       y(jy) = y(jy) + alpha*temp
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zgemv
+
+     !> ZGERC:  performs the rank 1 operation
+     !> A := alpha*x*y**H + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_zgerc(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGERC ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*conjg(y(jy))
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_zgerc
+
+     !> ZGERU:  performs the rank 1 operation
+     !> A := alpha*x*y**T + A,
+     !> where alpha is a scalar, x is an m element vector, y is an n element
+     !> vector and A is an m by n matrix.
+
+     pure subroutine stdlib_zgeru(m,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jy, kx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! test the input parameters.
+           info = 0
+           if (m<0) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,m)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZGERU ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or. (alpha==czero)) return
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (incy>0) then
+               jy = 1
+           else
+               jy = 1 - (n-1)*incy
+           end if
+           if (incx==1) then
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(i)*temp
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           else
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (m-1)*incx
+               end if
+               do j = 1,n
+                   if (y(jy)/=czero) then
+                       temp = alpha*y(jy)
+                       ix = kx
+                       do i = 1,m
+                           a(i,j) = a(i,j) + x(ix)*temp
+                           ix = ix + incx
+                       end do
+                   end if
+                   jy = jy + incy
+               end do
+           end if
+           return
+     end subroutine stdlib_zgeru
+
+     !> ZHBMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian band matrix, with k super-diagonals.
+
+     pure subroutine stdlib_zhbmv(uplo,n,k,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, k, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           complex(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kplus1, kx, ky, l
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (k<0) then
+               info = 3
+           else if (lda< (k+1)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           else if (incy==0) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array a
+           ! are accessed sequentially with cone pass through a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when upper triangle of a is stored.
+               kplus1 = k + 1
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(kplus1,j),KIND=dp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       l = kplus1 - j
+                       do i = max(1,j-k),j - 1
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(kplus1,j),KIND=dp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       if (j>k) then
+                           kx = kx + incx
+                           ky = ky + incy
+                       end if
+                   end do
+               end if
+           else
+              ! form  y  when lower triangle of a is stored.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(1,j),KIND=dp)
+                       l = 1 - j
+                       do i = j + 1,min(n,j+k)
+                           y(i) = y(i) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(1,j),KIND=dp)
+                       l = 1 - j
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,min(n,j+k)
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(l+i,j)
+                           temp2 = temp2 + conjg(a(l+i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zhbmv
+
+     !> ZHEMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where alpha and beta are scalars, A is an hermitian matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_zhemm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHEMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=dp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=dp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*conjg(a(k,i))
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*real(a(i,i),KIND=dp) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*real(a(i,i),KIND=dp) +&
+                                         alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*real(a(j,j),KIND=dp)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*conjg(a(j,k))
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*conjg(a(j,k))
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_zhemm
+
+     !> ZHEMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix.
+
+     pure subroutine stdlib_zhemv(uplo,n,alpha,a,lda,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           complex(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (lda<max(1,n)) then
+               info = 5
+           else if (incx==0) then
+               info = 7
+           else if (incy==0) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHEMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when a is stored in upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=dp) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do i = 1,j - 1
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=dp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  y  when a is stored in lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(a(j,j),KIND=dp)
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(i)
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(a(j,j),KIND=dp)
+                       ix = jx
+                       iy = jy
+                       do i = j + 1,n
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*a(i,j)
+                           temp2 = temp2 + conjg(a(i,j))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zhemv
+
+     !> ZHER:   performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix.
+
+     pure subroutine stdlib_zher(uplo,n,alpha,x,incx,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (lda<max(1,n)) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=dp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in upper triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                           a(j,j) = real(a(j,j),KIND=dp) + real(x(j)*temp,KIND=dp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           a(j,j) = real(a(j,j),KIND=dp) + real(x(jx)*temp,KIND=dp)
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in lower triangle.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           a(j,j) = real(a(j,j),KIND=dp) + real(temp*x(j),KIND=dp)
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           a(j,j) = real(a(j,j),KIND=dp) + real(temp*x(jx),KIND=dp)
+                           ix = jx
+                           do i = j + 1,n
+                               ix = ix + incx
+                               a(i,j) = a(i,j) + x(ix)*temp
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                       jx = jx + incx
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zher
+
+     !> ZHER2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an n
+     !> by n hermitian matrix.
+
+     pure subroutine stdlib_zher2(uplo,n,alpha,x,incx,y,incy,a,lda)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           else if (lda<max(1,n)) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when a is stored in the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                           a(j,j) = real(a(j,j),KIND=dp) +real(x(j)*temp1+y(j)*temp2,KIND=dp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do i = 1,j - 1
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           a(j,j) = real(a(j,j),KIND=dp) +real(x(jx)*temp1+y(jy)*temp2,KIND=dp)
+                                     
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           else
+              ! form  a  when a is stored in the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           a(j,j) = real(a(j,j),KIND=dp) +real(x(j)*temp1+y(j)*temp2,KIND=dp)
+                                     
+                           do i = j + 1,n
+                               a(i,j) = a(i,j) + x(i)*temp1 + y(i)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           a(j,j) = real(a(j,j),KIND=dp) +real(x(jx)*temp1+y(jy)*temp2,KIND=dp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do i = j + 1,n
+                               ix = ix + incx
+                               iy = iy + incy
+                               a(i,j) = a(i,j) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           a(j,j) = real(a(j,j),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zher2
+
+     !> ZHER2K:  performs one of the hermitian rank 2k operations
+     !> C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+     !> where  alpha and beta  are scalars with  beta  real,  C is an  n by n
+     !> hermitian matrix and  A and B  are  n by k matrices in the first case
+     !> and  k by n  matrices in the second case.
+
+     pure subroutine stdlib_zher2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           real(dp), intent(in) :: beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHER2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==real(czero,KIND=dp)) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                       end do
+                   end if
+               else
+                   if (beta==real(czero,KIND=dp)) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**h + conjg( alpha )*b*a**h +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==real(czero,KIND=dp)) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=dp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=dp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=dp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==real(czero,KIND=dp)) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=one) then
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=dp)
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*conjg(b(j,l))
+                               temp2 = conjg(alpha*a(j,l))
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                               c(j,j) = real(c(j,j),KIND=dp) +real(a(j,l)*temp1+b(j,l)*temp2,&
+                                         KIND=dp)
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*b + conjg( alpha )*b**h*a +
+                         ! c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=dp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=dp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=dp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=dp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=dp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + conjg(a(l,i))*b(l,j)
+                               temp2 = temp2 + conjg(b(l,i))*a(l,j)
+                           end do
+                           if (i==j) then
+                               if (beta==real(czero,KIND=dp)) then
+                                   c(j,j) = real(alpha*temp1+conjg(alpha)*temp2,KIND=dp)
+                               else
+                                   c(j,j) = beta*real(c(j,j),KIND=dp) +real(alpha*temp1+conjg(&
+                                             alpha)*temp2,KIND=dp)
+                               end if
+                           else
+                               if (beta==real(czero,KIND=dp)) then
+                                   c(i,j) = alpha*temp1 + conjg(alpha)*temp2
+                               else
+                                   c(i,j) = beta*c(i,j) + alpha*temp1 +conjg(alpha)*temp2
+                               end if
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zher2k
+
+     !> ZHERK:  performs one of the hermitian rank k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where  alpha and beta  are  real scalars,  C is an  n by n  hermitian
+     !> matrix and  A  is an  n by k  matrix in the  first case and a  k by n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_zherk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp
+           real(dp) :: rtemp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'C'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHERK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==zero).or.(k==0)).and. (beta==one))) return
+           ! and when  alpha.eq.zero.
+           if (alpha==zero) then
+               if (upper) then
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                       end do
+                   end if
+               else
+                   if (beta==zero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**h + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = 1,j
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           do i = 1,j - 1
+                               c(i,j) = beta*c(i,j)
+                           end do
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                       else
+                           c(j,j) = real(c(j,j),KIND=dp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=dp)) then
+                               temp = alpha*conjg(a(j,l))
+                               do i = 1,j - 1
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                               c(j,j) = real(c(j,j),KIND=dp) + real(temp*a(i,l),KIND=dp)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==zero) then
+                           do i = j,n
+                               c(i,j) = zero
+                           end do
+                       else if (beta/=one) then
+                           c(j,j) = beta*real(c(j,j),KIND=dp)
+                           do i = j + 1,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       else
+                           c(j,j) = real(c(j,j),KIND=dp)
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=cmplx(zero,KIND=dp)) then
+                               temp = alpha*conjg(a(j,l))
+                               c(j,j) = real(c(j,j),KIND=dp) + real(temp*a(j,l),KIND=dp)
+                               do i = j + 1,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**h*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j - 1
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=dp)
+                       end if
+                   end do
+               else
+                   do j = 1,n
+                       rtemp = zero
+                       do l = 1,k
+                           rtemp = rtemp + conjg(a(l,j))*a(l,j)
+                       end do
+                       if (beta==zero) then
+                           c(j,j) = alpha*rtemp
+                       else
+                           c(j,j) = alpha*rtemp + beta*real(c(j,j),KIND=dp)
+                       end if
+                       do i = j + 1,n
+                           temp = zero
+                           do l = 1,k
+                               temp = temp + conjg(a(l,i))*a(l,j)
+                           end do
+                           if (beta==zero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zherk
+
+     !> ZHPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_zhpmv(uplo,n,alpha,ap,x,incx,beta,y,incy)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*), x(*)
+           complex(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 6
+           else if (incy==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. ((alpha==czero).and. (beta==cone))) return
+           ! set up the start points in  x  and  y.
+           if (incx>0) then
+               kx = 1
+           else
+               kx = 1 - (n-1)*incx
+           end if
+           if (incy>0) then
+               ky = 1
+           else
+               ky = 1 - (n-1)*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if (beta/=cone) then
+               if (incy==1) then
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(i) = czero
+                       end do
+                   else
+                       do i = 1,n
+                           y(i) = beta*y(i)
+                       end do
+                   end if
+               else
+                   iy = ky
+                   if (beta==czero) then
+                       do i = 1,n
+                           y(iy) = czero
+                           iy = iy + incy
+                       end do
+                   else
+                       do i = 1,n
+                           y(iy) = beta*y(iy)
+                           iy = iy + incy
+                       end do
+                   end if
+               end if
+           end if
+           if (alpha==czero) return
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  y  when ap contains the upper triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       k = kk
+                       do i = 1,j - 1
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + temp1*real(ap(kk+j-1),KIND=dp) + alpha*temp2
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       ix = kx
+                       iy = ky
+                       do k = kk,kk + j - 2
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                           ix = ix + incx
+                           iy = iy + incy
+                       end do
+                       y(jy) = y(jy) + temp1*real(ap(kk+j-1),KIND=dp) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  y  when ap contains the lower triangle.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       temp1 = alpha*x(j)
+                       temp2 = czero
+                       y(j) = y(j) + temp1*real(ap(kk),KIND=dp)
+                       k = kk + 1
+                       do i = j + 1,n
+                           y(i) = y(i) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(i)
+                           k = k + 1
+                       end do
+                       y(j) = y(j) + alpha*temp2
+                       kk = kk + (n-j+1)
+                   end do
+               else
+                   jx = kx
+                   jy = ky
+                   do j = 1,n
+                       temp1 = alpha*x(jx)
+                       temp2 = czero
+                       y(jy) = y(jy) + temp1*real(ap(kk),KIND=dp)
+                       ix = jx
+                       iy = jy
+                       do k = kk + 1,kk + n - j
+                           ix = ix + incx
+                           iy = iy + incy
+                           y(iy) = y(iy) + temp1*ap(k)
+                           temp2 = temp2 + conjg(ap(k))*x(ix)
+                       end do
+                       y(jy) = y(jy) + alpha*temp2
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + (n-j+1)
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zhpmv
+
+     !> ZHPR:    performs the hermitian rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a real scalar, x is an n element vector and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_zhpr(uplo,n,alpha,x,incx,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(in) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPR  ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==real(czero,KIND=dp))) return
+           ! set the start point in x if the increment is not unity.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp) + real(x(j)*temp,KIND=dp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ix = kx
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp
+                               ix = ix + incx
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp) + real(x(jx)*temp,KIND=dp)
+                                     
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if (incx==1) then
+                   do j = 1,n
+                       if (x(j)/=czero) then
+                           temp = alpha*conjg(x(j))
+                           ap(kk) = real(ap(kk),KIND=dp) + real(temp*x(j),KIND=dp)
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=dp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   jx = kx
+                   do j = 1,n
+                       if (x(jx)/=czero) then
+                           temp = alpha*conjg(x(jx))
+                           ap(kk) = real(ap(kk),KIND=dp) + real(temp*x(jx),KIND=dp)
+                           ix = jx
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               ap(k) = ap(k) + x(ix)*temp
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zhpr
+
+     !> ZHPR2:  performs the hermitian rank 2 operation
+     !> A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+     !> where alpha is a scalar, x and y are n element vectors and A is an
+     !> n by n hermitian matrix, supplied in packed form.
+
+     pure subroutine stdlib_zhpr2(uplo,n,alpha,x,incx,y,incy,ap)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: incx, incy, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(in) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (n<0) then
+               info = 2
+           else if (incx==0) then
+               info = 5
+           else if (incy==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZHPR2 ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (alpha==czero)) return
+           ! set up the start points in x and y if the increments are not both
+           ! unity.
+           if ((incx/=1) .or. (incy/=1)) then
+               if (incx>0) then
+                   kx = 1
+               else
+                   kx = 1 - (n-1)*incx
+               end if
+               if (incy>0) then
+                   ky = 1
+               else
+                   ky = 1 - (n-1)*incy
+               end if
+               jx = kx
+               jy = ky
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if (stdlib_lsame(uplo,'U')) then
+              ! form  a  when upper triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           k = kk
+                           do i = 1,j - 1
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp) +real(x(j)*temp1+y(j)*temp2,&
+                                     KIND=dp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp)
+                       end if
+                       kk = kk + j
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ix = kx
+                           iy = ky
+                           do k = kk,kk + j - 2
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                               ix = ix + incx
+                               iy = iy + incy
+                           end do
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp) +real(x(jx)*temp1+y(jy)*temp2,&
+                                     KIND=dp)
+                       else
+                           ap(kk+j-1) = real(ap(kk+j-1),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + j
+                   end do
+               end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+               if ((incx==1) .and. (incy==1)) then
+                   do j = 1,n
+                       if ((x(j)/=czero) .or. (y(j)/=czero)) then
+                           temp1 = alpha*conjg(y(j))
+                           temp2 = conjg(alpha*x(j))
+                           ap(kk) = real(ap(kk),KIND=dp) +real(x(j)*temp1+y(j)*temp2,KIND=dp)
+                                     
+                           k = kk + 1
+                           do i = j + 1,n
+                               ap(k) = ap(k) + x(i)*temp1 + y(i)*temp2
+                               k = k + 1
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=dp)
+                       end if
+                       kk = kk + n - j + 1
+                   end do
+               else
+                   do j = 1,n
+                       if ((x(jx)/=czero) .or. (y(jy)/=czero)) then
+                           temp1 = alpha*conjg(y(jy))
+                           temp2 = conjg(alpha*x(jx))
+                           ap(kk) = real(ap(kk),KIND=dp) +real(x(jx)*temp1+y(jy)*temp2,KIND=dp)
+                                     
+                           ix = jx
+                           iy = jy
+                           do k = kk + 1,kk + n - j
+                               ix = ix + incx
+                               iy = iy + incy
+                               ap(k) = ap(k) + x(ix)*temp1 + y(iy)*temp2
+                           end do
+                       else
+                           ap(kk) = real(ap(kk),KIND=dp)
+                       end if
+                       jx = jx + incx
+                       jy = jy + incy
+                       kk = kk + n - j + 1
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zhpr2
+
+     !> !
+     !>
+     !> The computation uses the formulas
+     !> |x| = sqrt( Re(x)**2 + Im(x)**2 )
+     !> sgn(x) = x / |x|  if x /= 0
+     !> = 1        if x  = 0
+     !> c = |a| / sqrt(|a|**2 + |b|**2)
+     !> s = sgn(a) * conjg(b) / sqrt(|a|**2 + |b|**2)
+     !> When a and b are real and r /= 0, the formulas simplify to
+     !> r = sgn(a)*sqrt(|a|**2 + |b|**2)
+     !> c = a / r
+     !> s = b / r
+     !> the same as in DROTG when |a| > |b|.  When |b| >= |a|, the
+     !> sign of c and s will be different from those computed by DROTG
+     !> if the signs of a and b are not the same.
+
+     pure subroutine stdlib_zrotg( a, b, c, s )
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Constants 
+        integer, parameter :: wp = kind(1._dp)
+        ! Scaling Constants 
+        ! Scalar Arguments 
+        real(dp), intent(out) :: c
+        complex(dp), intent(inout) :: a
+        complex(dp), intent(in) :: b
+        complex(dp), intent(out) :: s
+        ! Local Scalars 
+        real(dp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(dp) :: f, fs, g, gs, r, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(dp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=dp)**2 + aimag( t )**2
+        ! Executable Statements 
+        f = a
+        g = b
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=dp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=dp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=dp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        a = r
+        return
+     end subroutine stdlib_zrotg
+
+     !> ZSCAL: scales a vector by a constant.
+
+     pure subroutine stdlib_zscal(n,za,zx,incx)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: za
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           if (n<=0 .or. incx<=0) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              do i = 1,n
+                 zx(i) = za*zx(i)
+              end do
+           else
+              ! code for increment not equal to 1
+              nincx = n*incx
+              do i = 1,nincx,incx
+                 zx(i) = za*zx(i)
+              end do
+           end if
+           return
+     end subroutine stdlib_zscal
+
+     !> ZSWAP: interchanges two vectors.
+
+     pure subroutine stdlib_zswap(n,zx,incx,zy,incy)
+        ! -- reference blas level1 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: zx(*), zy(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(dp) :: ztemp
+           integer(ilp) :: i, ix, iy
+           if (n<=0) return
+           if (incx==1 .and. incy==1) then
+             ! code for both increments equal to 1
+              do i = 1,n
+                 ztemp = zx(i)
+                 zx(i) = zy(i)
+                 zy(i) = ztemp
+              end do
+           else
+             ! code for unequal increments or equal increments not equal
+               ! to 1
+              ix = 1
+              iy = 1
+              if (incx<0) ix = (-n+1)*incx + 1
+              if (incy<0) iy = (-n+1)*incy + 1
+              do i = 1,n
+                 ztemp = zx(ix)
+                 zx(ix) = zy(iy)
+                 zy(iy) = ztemp
+                 ix = ix + incx
+                 iy = iy + incy
+              end do
+           end if
+           return
+     end subroutine stdlib_zswap
+
+     !> ZSYMM:  performs one of the matrix-matrix operations
+     !> C := alpha*A*B + beta*C,
+     !> or
+     !> C := alpha*B*A + beta*C,
+     !> where  alpha and beta are scalars, A is a symmetric matrix and  B and
+     !> C are m by n matrices.
+
+     pure subroutine stdlib_zsymm(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           character, intent(in) :: side, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: upper
+           
+           
+           ! set nrowa as the number of rows of a.
+           if (stdlib_lsame(side,'L')) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           ! test the input parameters.
+           info = 0
+           if ((.not.stdlib_lsame(side,'L')) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if (m<0) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,m)) then
+               info = 9
+           else if (ldc<max(1,m)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((m==0) .or. (n==0) .or.((alpha==czero).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (beta==czero) then
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = czero
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j)
+                       end do
+                   end do
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(side,'L')) then
+              ! form  c := alpha*a*b + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,m
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = 1,i - 1
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = m,1,-1
+                           temp1 = alpha*b(i,j)
+                           temp2 = czero
+                           do k = i + 1,m
+                               c(k,j) = c(k,j) + temp1*a(k,i)
+                               temp2 = temp2 + b(k,j)*a(k,i)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = temp1*a(i,i) + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + temp1*a(i,i) +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*b*a + beta*c.
+               loop_170: do j = 1,n
+                   temp1 = alpha*a(j,j)
+                   if (beta==czero) then
+                       do i = 1,m
+                           c(i,j) = temp1*b(i,j)
+                       end do
+                   else
+                       do i = 1,m
+                           c(i,j) = beta*c(i,j) + temp1*b(i,j)
+                       end do
+                   end if
+                   do k = 1,j - 1
+                       if (upper) then
+                           temp1 = alpha*a(k,j)
+                       else
+                           temp1 = alpha*a(j,k)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+                   do k = j + 1,n
+                       if (upper) then
+                           temp1 = alpha*a(j,k)
+                       else
+                           temp1 = alpha*a(k,j)
+                       end if
+                       do i = 1,m
+                           c(i,j) = c(i,j) + temp1*b(i,k)
+                       end do
+                   end do
+               end do loop_170
+           end if
+           return
+     end subroutine stdlib_zsymm
+
+     !> ZSYR2K:  performs one of the symmetric rank 2k operations
+     !> C := alpha*A*B**T + alpha*B*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*B + alpha*B**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+     !> matrices in the second case.
+
+     pure subroutine stdlib_zsyr2k(uplo,trans,n,k,alpha,a,lda,b,ldb,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldb, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(dp) :: temp1, temp2
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldb<max(1,nrowa)) then
+               info = 9
+           else if (ldc<max(1,n)) then
+               info = 12
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYR2K',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*b**t + alpha*b*a**t + c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if ((a(j,l)/=czero) .or. (b(j,l)/=czero)) then
+                               temp1 = alpha*b(j,l)
+                               temp2 = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + a(i,l)*temp1 +b(i,l)*temp2
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*b + alpha*b**t*a + c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp1 = czero
+                           temp2 = czero
+                           do l = 1,k
+                               temp1 = temp1 + a(l,i)*b(l,j)
+                               temp2 = temp2 + b(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp1 + alpha*temp2
+                           else
+                               c(i,j) = beta*c(i,j) + alpha*temp1 +alpha*temp2
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zsyr2k
+
+     !> ZSYRK:  performs one of the symmetric rank k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where  alpha and beta  are scalars,  C is an  n by n symmetric matrix
+     !> and  A  is an  n by k  matrix in the first case and a  k by n  matrix
+     !> in the second case.
+
+     pure subroutine stdlib_zsyrk(uplo,trans,n,k,alpha,a,lda,beta,c,ldc)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, ldc, n
+           character, intent(in) :: trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, j, l, nrowa
+           logical(lk) :: upper
+           
+           
+           ! test the input parameters.
+           if (stdlib_lsame(trans,'N')) then
+               nrowa = n
+           else
+               nrowa = k
+           end if
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 1
+           else if ((.not.stdlib_lsame(trans,'N')) .and.(.not.stdlib_lsame(trans,'T'))) &
+                     then
+               info = 2
+           else if (n<0) then
+               info = 3
+           else if (k<0) then
+               info = 4
+           else if (lda<max(1,nrowa)) then
+               info = 7
+           else if (ldc<max(1,n)) then
+               info = 10
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZSYRK ',info)
+               return
+           end if
+           ! quick return if possible.
+           if ((n==0) .or. (((alpha==czero).or.(k==0)).and. (beta==cone))) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               if (upper) then
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               else
+                   if (beta==czero) then
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end do
+                   end if
+               end if
+               return
+           end if
+           ! start the operations.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  c := alpha*a*a**t + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = 1,j
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = 1,j
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = 1,j
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       if (beta==czero) then
+                           do i = j,n
+                               c(i,j) = czero
+                           end do
+                       else if (beta/=cone) then
+                           do i = j,n
+                               c(i,j) = beta*c(i,j)
+                           end do
+                       end if
+                       do l = 1,k
+                           if (a(j,l)/=czero) then
+                               temp = alpha*a(j,l)
+                               do i = j,n
+                                   c(i,j) = c(i,j) + temp*a(i,l)
+                               end do
+                           end if
+                       end do
+                   end do
+               end if
+           else
+              ! form  c := alpha*a**t*a + beta*c.
+               if (upper) then
+                   do j = 1,n
+                       do i = 1,j
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               else
+                   do j = 1,n
+                       do i = j,n
+                           temp = czero
+                           do l = 1,k
+                               temp = temp + a(l,i)*a(l,j)
+                           end do
+                           if (beta==czero) then
+                               c(i,j) = alpha*temp
+                           else
+                               c(i,j) = alpha*temp + beta*c(i,j)
+                           end if
+                       end do
+                   end do
+               end if
+           end if
+           return
+     end subroutine stdlib_zsyrk
+
+     !> ZTBMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular band matrix, with ( k + 1 ) diagonals.
+
+     pure subroutine stdlib_ztbmv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTBMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx   too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+               ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(kplus1,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = kplus1 - j
+                               do i = max(1,j-k),j - 1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(kplus1,j)
+                           end if
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(i) = x(i) + temp*a(l+i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(1,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               l = 1 - j
+                               do i = min(n,j+k),j + 1,-1
+                                   x(ix) = x(ix) + temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(1,j)
+                           end if
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           kx = kx - incx
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(kplus1,j)
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(kplus1,j))
+                               do i = j - 1,max(1,j-k),-1
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           kx = kx + incx
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               if (nounit) temp = temp*a(1,j)
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(1,j))
+                               do i = j + 1,min(n,j+k)
+                                   temp = temp + conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztbmv
+
+     !> ZTBSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular band matrix, with ( k + 1 )
+     !> diagonals.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ztbsv(uplo,trans,diag,n,k,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kplus1, kx, l
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (k<0) then
+               info = 5
+           else if (lda< (k+1)) then
+               info = 7
+           else if (incx==0) then
+               info = 9
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTBSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed by sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               l = kplus1 - j
+                               if (nounit) x(j) = x(j)/a(kplus1,j)
+                               temp = x(j)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           kx = kx - incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = kplus1 - j
+                               if (nounit) x(jx) = x(jx)/a(kplus1,j)
+                               temp = x(jx)
+                               do i = j - 1,max(1,j-k),-1
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix - incx
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               l = 1 - j
+                               if (nounit) x(j) = x(j)/a(1,j)
+                               temp = x(j)
+                               do i = j + 1,min(n,j+k)
+                                   x(i) = x(i) - temp*a(l+i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           kx = kx + incx
+                           if (x(jx)/=czero) then
+                               ix = kx
+                               l = 1 - j
+                               if (nounit) x(jx) = x(jx)/a(1,j)
+                               temp = x(jx)
+                               do i = j + 1,min(n,j+k)
+                                   x(ix) = x(ix) - temp*a(l+i,j)
+                                   ix = ix + incx
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kplus1 = k + 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           l = kplus1 - j
+                           if (noconj) then
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(kplus1,j)
+                           else
+                               do i = max(1,j-k),j - 1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(kplus1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           if (j>k) kx = kx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           l = 1 - j
+                           if (noconj) then
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - a(l+i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(1,j)
+                           else
+                               do i = min(n,j+k),j + 1,-1
+                                   temp = temp - conjg(a(l+i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(1,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           if ((n-j)>=k) kx = kx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztbsv
+
+     !> ZTPMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix, supplied in packed form.
+
+     pure subroutine stdlib_ztpmv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTPMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x:= a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k + 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk+j-1)
+                           end if
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk + j - 2
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk+j-1)
+                           end if
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               k = kk
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*ap(k)
+                                   k = k - 1
+                               end do
+                               if (nounit) x(j) = x(j)*ap(kk-n+j)
+                           end if
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do k = kk,kk - (n- (j+1)),-1
+                                   x(ix) = x(ix) + temp*ap(k)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*ap(kk-n+j)
+                           end if
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk - 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j - 1,1,-1
+                                   temp = temp + ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk + 1
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do i = j + 1,n
+                                   temp = temp + ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do i = j + 1,n
+                                   temp = temp + conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                           end if
+                           x(j) = temp
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*ap(kk)
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + ap(k)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(ap(kk))
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   temp = temp + conjg(ap(k))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztpmv
+
+     !> ZTPSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix, supplied in packed form.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ztpsv(uplo,trans,diag,n,ap,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (incx==0) then
+               info = 7
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTPSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of ap are
+           ! accessed sequentially with cone pass through ap.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk - 1
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k - 1
+                               end do
+                           end if
+                           kk = kk - j
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk - 1,kk - j + 1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx - incx
+                           kk = kk - j
+                       end do
+                   end if
+               else
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/ap(kk)
+                               temp = x(j)
+                               k = kk + 1
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*ap(k)
+                                   k = k + 1
+                               end do
+                           end if
+                           kk = kk + (n-j+1)
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/ap(kk)
+                               temp = x(jx)
+                               ix = jx
+                               do k = kk + 1,kk + n - j
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*ap(k)
+                               end do
+                           end if
+                           jx = jx + incx
+                           kk = kk + (n-j+1)
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   kk = 1
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k + 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(j) = temp
+                           kk = kk + j
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk + j - 2
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/ap(kk+j-1)
+                           else
+                               do k = kk,kk + j - 2
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk+j-1))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                           kk = kk + j
+                       end do
+                   end if
+               else
+                   kk = (n* (n+1))/2
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           k = kk
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - ap(k)*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(ap(k))*x(i)
+                                   k = k - 1
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(j) = temp
+                           kk = kk - (n-j+1)
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = kx
+                           if (noconj) then
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - ap(k)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/ap(kk-n+j)
+                           else
+                               do k = kk,kk - (n- (j+1)),-1
+                                   temp = temp - conjg(ap(k))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(ap(kk-n+j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                           kk = kk - (n-j+1)
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztpsv
+
+     !> ZTRMM:  performs one of the matrix-matrix operations
+     !> B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+     !> where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+
+     pure subroutine stdlib_ztrmm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRMM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*a*b.
+                   if (upper) then
+                       do j = 1,n
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                                   if (nounit) temp = temp*a(k,k)
+                                   b(k,j) = temp
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   temp = alpha*b(k,j)
+                                   b(k,j) = temp
+                                   if (nounit) b(k,j) = b(k,j)*a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) + temp*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*a**t*b   or   b := alpha*a**h*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = 1,i - 1
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = 1,i - 1
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = 1,m
+                               temp = b(i,j)
+                               if (noconj) then
+                                   if (nounit) temp = temp*a(i,i)
+                                   do k = i + 1,m
+                                       temp = temp + a(k,i)*b(k,j)
+                                   end do
+                               else
+                                   if (nounit) temp = temp*conjg(a(i,i))
+                                   do k = i + 1,m
+                                       temp = temp + conjg(a(k,i))*b(k,j)
+                                   end do
+                               end if
+                               b(i,j) = alpha*temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*a.
+                   if (upper) then
+                       do j = n,1,-1
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           temp = alpha
+                           if (nounit) temp = temp*a(j,j)
+                           do i = 1,m
+                               b(i,j) = temp*b(i,j)
+                           end do
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   temp = alpha*a(k,j)
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*a**t   or   b := alpha*b*a**h.
+                   if (upper) then
+                       loop_280: do k = 1,n
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_280
+                   else
+                       loop_320: do k = n,1,-1
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = alpha*a(j,k)
+                                   else
+                                       temp = alpha*conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) + temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           temp = alpha
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = temp*a(k,k)
+                               else
+                                   temp = temp*conjg(a(k,k))
+                               end if
+                           end if
+                           if (temp/=cone) then
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                       end do loop_320
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztrmm
+
+     !> ZTRMV:  performs one of the matrix-vector operations
+     !> x := A*x,   or   x := A**T*x,   or   x := A**H*x,
+     !> where x is an n element vector and  A is an n by n unit, or non-unit,
+     !> upper or lower triangular matrix.
+
+     pure subroutine stdlib_ztrmv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRMV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := a*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = 1,j - 1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = 1,j - 1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               temp = x(j)
+                               do i = n,j + 1,-1
+                                   x(i) = x(i) + temp*a(i,j)
+                               end do
+                               if (nounit) x(j) = x(j)*a(j,j)
+                           end if
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               temp = x(jx)
+                               ix = kx
+                               do i = n,j + 1,-1
+                                   x(ix) = x(ix) + temp*a(i,j)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) x(jx) = x(jx)*a(j,j)
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := a**t*x  or  x := a**h*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   temp = temp + a(i,j)*x(i)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   temp = temp + conjg(a(i,j))*x(i)
+                               end do
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           temp = x(jx)
+                           ix = jx
+                           if (noconj) then
+                               if (nounit) temp = temp*a(j,j)
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + a(i,j)*x(ix)
+                               end do
+                           else
+                               if (nounit) temp = temp*conjg(a(j,j))
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   temp = temp + conjg(a(i,j))*x(ix)
+                               end do
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztrmv
+
+     !> ZTRSM:  solves one of the matrix equations
+     !> op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_ztrsm(side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb)
+        ! -- reference blas level3 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: alpha
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           character, intent(in) :: diag, side, transa, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, j, k, nrowa
+           logical(lk) :: lside, noconj, nounit, upper
+           
+           
+           ! test the input parameters.
+           lside = stdlib_lsame(side,'L')
+           if (lside) then
+               nrowa = m
+           else
+               nrowa = n
+           end if
+           noconj = stdlib_lsame(transa,'T')
+           nounit = stdlib_lsame(diag,'N')
+           upper = stdlib_lsame(uplo,'U')
+           info = 0
+           if ((.not.lside) .and. (.not.stdlib_lsame(side,'R'))) then
+               info = 1
+           else if ((.not.upper) .and. (.not.stdlib_lsame(uplo,'L'))) then
+               info = 2
+           else if ((.not.stdlib_lsame(transa,'N')) .and.(.not.stdlib_lsame(transa,'T')) .and.(&
+                     .not.stdlib_lsame(transa,'C'))) then
+               info = 3
+           else if ((.not.stdlib_lsame(diag,'U')) .and. (.not.stdlib_lsame(diag,'N'))) &
+                     then
+               info = 4
+           else if (m<0) then
+               info = 5
+           else if (n<0) then
+               info = 6
+           else if (lda<max(1,nrowa)) then
+               info = 9
+           else if (ldb<max(1,m)) then
+               info = 11
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRSM ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (m==0 .or. n==0) return
+           ! and when  alpha.eq.czero.
+           if (alpha==czero) then
+               do j = 1,n
+                   do i = 1,m
+                       b(i,j) = czero
+                   end do
+               end do
+               return
+           end if
+           ! start the operations.
+           if (lside) then
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*inv( a )*b.
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = m,1,-1
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = 1,k - 1
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,m
+                               if (b(k,j)/=czero) then
+                                   if (nounit) b(k,j) = b(k,j)/a(k,k)
+                                   do i = k + 1,m
+                                       b(i,j) = b(i,j) - b(k,j)*a(i,k)
+                                   end do
+                               end if
+                           end do
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*inv( a**t )*b
+                 ! or    b := alpha*inv( a**h )*b.
+                   if (upper) then
+                       do j = 1,n
+                           do i = 1,m
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = 1,i - 1
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = 1,i - 1
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   else
+                       do j = 1,n
+                           do i = m,1,-1
+                               temp = alpha*b(i,j)
+                               if (noconj) then
+                                   do k = i + 1,m
+                                       temp = temp - a(k,i)*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/a(i,i)
+                               else
+                                   do k = i + 1,m
+                                       temp = temp - conjg(a(k,i))*b(k,j)
+                                   end do
+                                   if (nounit) temp = temp/conjg(a(i,i))
+                               end if
+                               b(i,j) = temp
+                           end do
+                       end do
+                   end if
+               end if
+           else
+               if (stdlib_lsame(transa,'N')) then
+                 ! form  b := alpha*b*inv( a ).
+                   if (upper) then
+                       do j = 1,n
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = 1,j - 1
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       do j = n,1,-1
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,j) = alpha*b(i,j)
+                               end do
+                           end if
+                           do k = j + 1,n
+                               if (a(k,j)/=czero) then
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - a(k,j)*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (nounit) then
+                               temp = cone/a(j,j)
+                               do i = 1,m
+                                   b(i,j) = temp*b(i,j)
+                               end do
+                           end if
+                       end do
+                   end if
+               else
+                 ! form  b := alpha*b*inv( a**t )
+                 ! or    b := alpha*b*inv( a**h ).
+                   if (upper) then
+                       loop_330: do k = n,1,-1
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = 1,k - 1
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_330
+                   else
+                       loop_380: do k = 1,n
+                           if (nounit) then
+                               if (noconj) then
+                                   temp = cone/a(k,k)
+                               else
+                                   temp = cone/conjg(a(k,k))
+                               end if
+                               do i = 1,m
+                                   b(i,k) = temp*b(i,k)
+                               end do
+                           end if
+                           do j = k + 1,n
+                               if (a(j,k)/=czero) then
+                                   if (noconj) then
+                                       temp = a(j,k)
+                                   else
+                                       temp = conjg(a(j,k))
+                                   end if
+                                   do i = 1,m
+                                       b(i,j) = b(i,j) - temp*b(i,k)
+                                   end do
+                               end if
+                           end do
+                           if (alpha/=cone) then
+                               do i = 1,m
+                                   b(i,k) = alpha*b(i,k)
+                               end do
+                           end if
+                       end do loop_380
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztrsm
+
+     !> ZTRSV:  solves one of the systems of equations
+     !> A*x = b,   or   A**T*x = b,   or   A**H*x = b,
+     !> where b and x are n element vectors and A is an n by n unit, or
+     !> non-unit, upper or lower triangular matrix.
+     !> No test for singularity or near-singularity is included in this
+     !> routine. Such tests must be performed before calling this routine.
+
+     pure subroutine stdlib_ztrsv(uplo,trans,diag,n,a,lda,x,incx)
+        ! -- reference blas level2 routine --
+        ! -- reference blas is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, lda, n
+           character, intent(in) :: diag, trans, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: temp
+           integer(ilp) :: i, info, ix, j, jx, kx
+           logical(lk) :: noconj, nounit
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! test the input parameters.
+           info = 0
+           if (.not.stdlib_lsame(uplo,'U') .and. .not.stdlib_lsame(uplo,'L')) then
+               info = 1
+           else if (.not.stdlib_lsame(trans,'N') .and. .not.stdlib_lsame(trans,'T') &
+                     .and..not.stdlib_lsame(trans,'C')) then
+               info = 2
+           else if (.not.stdlib_lsame(diag,'U') .and. .not.stdlib_lsame(diag,'N')) then
+               info = 3
+           else if (n<0) then
+               info = 4
+           else if (lda<max(1,n)) then
+               info = 6
+           else if (incx==0) then
+               info = 8
+           end if
+           if (info/=0) then
+               call stdlib_xerbla('ZTRSV ',info)
+               return
+           end if
+           ! quick return if possible.
+           if (n==0) return
+           noconj = stdlib_lsame(trans,'T')
+           nounit = stdlib_lsame(diag,'N')
+           ! set up the start point in x if the increment is not unity. this
+           ! will be  ( n - 1 )*incx  too small for descending loops.
+           if (incx<=0) then
+               kx = 1 - (n-1)*incx
+           else if (incx/=1) then
+               kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through a.
+           if (stdlib_lsame(trans,'N')) then
+              ! form  x := inv( a )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = n,1,-1
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j - 1,1,-1
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx + (n-1)*incx
+                       do j = n,1,-1
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j - 1,1,-1
+                                   ix = ix - incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx - incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = 1,n
+                           if (x(j)/=czero) then
+                               if (nounit) x(j) = x(j)/a(j,j)
+                               temp = x(j)
+                               do i = j + 1,n
+                                   x(i) = x(i) - temp*a(i,j)
+                               end do
+                           end if
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           if (x(jx)/=czero) then
+                               if (nounit) x(jx) = x(jx)/a(j,j)
+                               temp = x(jx)
+                               ix = jx
+                               do i = j + 1,n
+                                   ix = ix + incx
+                                   x(ix) = x(ix) - temp*a(i,j)
+                               end do
+                           end if
+                           jx = jx + incx
+                       end do
+                   end if
+               end if
+           else
+              ! form  x := inv( a**t )*x  or  x := inv( a**h )*x.
+               if (stdlib_lsame(uplo,'U')) then
+                   if (incx==1) then
+                       do j = 1,n
+                           temp = x(j)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       jx = kx
+                       do j = 1,n
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = 1,j - 1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = 1,j - 1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix + incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx + incx
+                       end do
+                   end if
+               else
+                   if (incx==1) then
+                       do j = n,1,-1
+                           temp = x(j)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(i)
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(i)
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(j) = temp
+                       end do
+                   else
+                       kx = kx + (n-1)*incx
+                       jx = kx
+                       do j = n,1,-1
+                           ix = kx
+                           temp = x(jx)
+                           if (noconj) then
+                               do i = n,j + 1,-1
+                                   temp = temp - a(i,j)*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/a(j,j)
+                           else
+                               do i = n,j + 1,-1
+                                   temp = temp - conjg(a(i,j))*x(ix)
+                                   ix = ix - incx
+                               end do
+                               if (nounit) temp = temp/conjg(a(j,j))
+                           end if
+                           x(jx) = temp
+                           jx = jx - incx
+                       end do
+                   end if
+               end if
+           end if
+           return
+     end subroutine stdlib_ztrsv
+
+
+
+end module stdlib_linalg_blas_z
diff --git a/src/stdlib_linalg_constants.fypp b/src/stdlib_linalg_constants.fypp
new file mode 100644
index 000000000..7ffdbf209
--- /dev/null
+++ b/src/stdlib_linalg_constants.fypp
@@ -0,0 +1,15 @@
+#:include "common.fypp"
+module stdlib_linalg_constants
+     use stdlib_kinds, only: sp, dp, qp, int32, int64, lk
+     use, intrinsic :: ieee_arithmetic, only: ieee_is_nan 
+     !$ use omp_lib
+     implicit none(type,external)
+     public
+
+
+
+     ! Integer size support for ILP64 builds should be done here
+     integer, parameter :: ilp = int32
+     private            :: int32, int64
+
+end module stdlib_linalg_constants
diff --git a/src/stdlib_linalg_lapack.fypp b/src/stdlib_linalg_lapack.fypp
new file mode 100644
index 000000000..46c299ec7
--- /dev/null
+++ b/src/stdlib_linalg_lapack.fypp
@@ -0,0 +1,29452 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     use stdlib_linalg_lapack_d
+#:if WITH_QP 
+     use stdlib_linalg_lapack_q
+#:endif 
+     use stdlib_linalg_lapack_c
+     use stdlib_linalg_lapack_z
+#:if WITH_QP 
+     use stdlib_linalg_lapack_w
+#:endif 
+     implicit none(type,external)
+     public
+
+          !> BBCSD: computes the CS decomposition of a unitary matrix in
+          !> bidiagonal-block form,
+          !> [ B11 | B12 0  0 ]
+          !> [  0  |  0 -I  0 ]
+          !> X = [----------------]
+          !> [ B21 | B22 0  0 ]
+          !> [  0  |  0  0  I ]
+          !> [  C | -S  0  0 ]
+          !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
+          !> = [---------] [---------------] [---------]   .
+          !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+          !> [  0 |  0  0  I ]
+          !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+          !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+          !> transposed and/or permuted. This can be done in constant time using
+          !> the TRANS and SIGNS options. See CUNCSD for details.)
+          !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+          !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+          !> The unitary matrices U1, U2, V1T, and V2T are input/output.
+          !> The input matrices are pre- or post-multiplied by the appropriate
+          !> singular vector matrices.
+          interface bbcsd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, &
+               u1, ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d,&
+                          b22e, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,lrwork,m,p,q
+                    real(sp), intent(out) :: b11d(*),b11e(*),b12d(*),b12e(*),b21d(*),b21e(*),b22d(&
+                              *),b22e(*),rwork(*)
+                    real(sp), intent(inout) :: phi(*),theta(*)
+                    complex(sp), intent(inout) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(ldv2t,*)
+                              
+               end subroutine cbbcsd
+#else 
+               module procedure stdlib_cbbcsd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, &
+               u1, ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d,&
+                          b22e, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,lwork,m,p,q
+                    real(dp), intent(out) :: b11d(*),b11e(*),b12d(*),b12e(*),b21d(*),b21e(*),b22d(&
+                              *),b22e(*),work(*)
+                    real(dp), intent(inout) :: phi(*),theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),&
+                              v2t(ldv2t,*)
+               end subroutine dbbcsd
+#else 
+               module procedure stdlib_dbbcsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qbbcsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, &
+               u1, ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d,&
+                          b22e, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,lwork,m,p,q
+                    real(sp), intent(out) :: b11d(*),b11e(*),b12d(*),b12e(*),b21d(*),b21e(*),b22d(&
+                              *),b22e(*),work(*)
+                    real(sp), intent(inout) :: phi(*),theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),&
+                              v2t(ldv2t,*)
+               end subroutine sbbcsd
+#else 
+               module procedure stdlib_sbbcsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wbbcsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, &
+               u1, ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d,&
+                          b22e, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,lrwork,m,p,q
+                    real(dp), intent(out) :: b11d(*),b11e(*),b12d(*),b12e(*),b21d(*),b21e(*),b22d(&
+                              *),b22e(*),rwork(*)
+                    real(dp), intent(inout) :: phi(*),theta(*)
+                    complex(dp), intent(inout) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(ldv2t,*)
+                              
+               end subroutine zbbcsd
+#else 
+               module procedure stdlib_zbbcsd
+#endif
+          end interface bbcsd
+
+
+
+          !> BDSDC: computes the singular value decomposition (SVD) of a real
+          !> N-by-N (upper or lower) bidiagonal matrix B:  B = U * S * VT,
+          !> using a divide and conquer method, where S is a diagonal matrix
+          !> with non-negative diagonal elements (the singular values of B), and
+          !> U and VT are orthogonal matrices of left and right singular vectors,
+          !> respectively. BDSDC can be used to compute all singular values,
+          !> and optionally, singular vectors or singular vectors in compact form.
+          !> This code makes very mild assumptions about floating point
+          !> arithmetic. It will work on machines with a guard digit in
+          !> add/subtract, or on those binary machines without guard digits
+          !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+          !> It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.  See DLASD3 for details.
+          !> The code currently calls DLASDQ if singular values only are desired.
+          !> However, it can be slightly modified to compute singular values
+          !> using the divide and conquer method.
+          interface bdsdc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dbdsdc( uplo, compq, n, d, e, u, ldu, vt, ldvt, q, iq,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,uplo
+                    integer(ilp), intent(out) :: info,iq(*),iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: q(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine dbdsdc
+#else 
+               module procedure stdlib_dbdsdc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qbdsdc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sbdsdc( uplo, compq, n, d, e, u, ldu, vt, ldvt, q, iq,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,uplo
+                    integer(ilp), intent(out) :: info,iq(*),iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: q(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine sbdsdc
+#else 
+               module procedure stdlib_sbdsdc
+#endif
+          end interface bdsdc
+
+
+
+          !> BDSQR: computes the singular values and, optionally, the right and/or
+          !> left singular vectors from the singular value decomposition (SVD) of
+          !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+          !> zero-shift QR algorithm.  The SVD of B has the form
+          !> B = Q * S * P**H
+          !> where S is the diagonal matrix of singular values, Q is an orthogonal
+          !> matrix of left singular vectors, and P is an orthogonal matrix of
+          !> right singular vectors.  If left singular vectors are requested, this
+          !> subroutine actually returns U*Q instead of Q, and, if right singular
+          !> vectors are requested, this subroutine returns P**H*VT instead of
+          !> P**H, for given complex input matrices U and VT.  When U and VT are
+          !> the unitary matrices that reduce a general matrix A to bidiagonal
+          !> form: A = U*B*VT, as computed by CGEBRD, then
+          !> A = (U*Q) * S * (P**H*VT)
+          !> is the SVD of A.  Optionally, the subroutine may also compute Q**H*C
+          !> for a given complex input matrix C.
+          !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+          !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+          !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+          !> no. 5, pp. 873-912, Sept 1990) and
+          !> "Accurate singular values and differential qd algorithms," by
+          !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+          !> Department, University of California at Berkeley, July 1992
+          !> for a detailed description of the algorithm.
+          interface bdsqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: c(ldc,*),u(ldu,*),vt(ldvt,*)
+               end subroutine cbdsqr
+#else 
+               module procedure stdlib_cbdsqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, &
+                         work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru
+                    real(dp), intent(inout) :: c(ldc,*),d(*),e(*),u(ldu,*),vt(ldvt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dbdsqr
+#else 
+               module procedure stdlib_dbdsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qbdsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, &
+                         work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru
+                    real(sp), intent(inout) :: c(ldc,*),d(*),e(*),u(ldu,*),vt(ldvt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sbdsqr
+#else 
+               module procedure stdlib_sbdsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wbdsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: c(ldc,*),u(ldu,*),vt(ldvt,*)
+               end subroutine zbdsqr
+#else 
+               module procedure stdlib_zbdsqr
+#endif
+          end interface bdsqr
+
+
+
+          !> DISNA: computes the reciprocal condition numbers for the eigenvectors
+          !> of a real symmetric or complex Hermitian matrix or for the left or
+          !> right singular vectors of a general m-by-n matrix. The reciprocal
+          !> condition number is the 'gap' between the corresponding eigenvalue or
+          !> singular value and the nearest other one.
+          !> The bound on the error, measured by angle in radians, in the I-th
+          !> computed vector is given by
+          !> DLAMCH( 'E' ) * ( ANORM / SEP( I ) )
+          !> where ANORM = 2-norm(A) = max( abs( D(j) ) ).  SEP(I) is not allowed
+          !> to be smaller than DLAMCH( 'E' )*ANORM in order to limit the size of
+          !> the error bound.
+          !> DISNA may also be used to compute error bounds for eigenvectors of
+          !> the generalized symmetric definite eigenproblem.
+          interface disna
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ddisna( job, m, n, d, sep, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: m,n
+                    real(dp), intent(in) :: d(*)
+                    real(dp), intent(out) :: sep(*)
+               end subroutine ddisna
+#else 
+               module procedure stdlib_ddisna
+#endif
+#:if WITH_QP
+               module procedure stdlib_qdisna
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sdisna( job, m, n, d, sep, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: m,n
+                    real(sp), intent(in) :: d(*)
+                    real(sp), intent(out) :: sep(*)
+               end subroutine sdisna
+#else 
+               module procedure stdlib_sdisna
+#endif
+          end interface disna
+
+
+
+          !> GBBRD: reduces a complex general m-by-n band matrix A to real upper
+          !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+          !> The routine computes B, and optionally forms Q or P**H, or computes
+          !> Q**H*C for a given matrix C.
+          interface gbbrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, &
+                         c, ldc, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldc,ldpt,ldq,m,n,ncc
+                    real(sp), intent(out) :: d(*),e(*),rwork(*)
+                    complex(sp), intent(inout) :: ab(ldab,*),c(ldc,*)
+                    complex(sp), intent(out) :: pt(ldpt,*),q(ldq,*),work(*)
+               end subroutine cgbbrd
+#else 
+               module procedure stdlib_cgbbrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, &
+                         c, ldc, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldc,ldpt,ldq,m,n,ncc
+                    real(dp), intent(inout) :: ab(ldab,*),c(ldc,*)
+                    real(dp), intent(out) :: d(*),e(*),pt(ldpt,*),q(ldq,*),work(*)
+               end subroutine dgbbrd
+#else 
+               module procedure stdlib_dgbbrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbbrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, &
+                         c, ldc, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldc,ldpt,ldq,m,n,ncc
+                    real(sp), intent(inout) :: ab(ldab,*),c(ldc,*)
+                    real(sp), intent(out) :: d(*),e(*),pt(ldpt,*),q(ldq,*),work(*)
+               end subroutine sgbbrd
+#else 
+               module procedure stdlib_sgbbrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbbrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, &
+                         c, ldc, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldc,ldpt,ldq,m,n,ncc
+                    real(dp), intent(out) :: d(*),e(*),rwork(*)
+                    complex(dp), intent(inout) :: ab(ldab,*),c(ldc,*)
+                    complex(dp), intent(out) :: pt(ldpt,*),q(ldq,*),work(*)
+               end subroutine zgbbrd
+#else 
+               module procedure stdlib_zgbbrd
+#endif
+          end interface gbbrd
+
+
+
+          !> GBCON: estimates the reciprocal of the condition number of a complex
+          !> general band matrix A, in either the 1-norm or the infinity-norm,
+          !> using the LU factorization computed by CGBTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as
+          !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+          interface gbcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, rwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgbcon
+#else 
+               module procedure stdlib_cgbcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,n,ipiv(*)
+                    real(dp), intent(in) :: anorm,ab(ldab,*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dgbcon
+#else 
+               module procedure stdlib_dgbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,n,ipiv(*)
+                    real(sp), intent(in) :: anorm,ab(ldab,*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine sgbcon
+#else 
+               module procedure stdlib_sgbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, rwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgbcon
+#else 
+               module procedure stdlib_zgbcon
+#endif
+          end interface gbcon
+
+
+
+          !> GBEQU: computes row and column scalings intended to equilibrate an
+          !> M-by-N band matrix A and reduce its condition number.  R returns the
+          !> row scale factors and C the column scale factors, chosen to try to
+          !> make the largest element in each row and column of the matrix B with
+          !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+          !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+          !> number and BIGNUM = largest safe number.  Use of these scaling
+          !> factors is not guaranteed to reduce the condition number of A but
+          !> works well in practice.
+          interface gbequ
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end subroutine cgbequ
+#else 
+               module procedure stdlib_cgbequ
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(in) :: ab(ldab,*)
+               end subroutine dgbequ
+#else 
+               module procedure stdlib_dgbequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(in) :: ab(ldab,*)
+               end subroutine sgbequ
+#else 
+               module procedure stdlib_sgbequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end subroutine zgbequ
+#else 
+               module procedure stdlib_zgbequ
+#endif
+          end interface gbequ
+
+
+
+          !> GBEQUB: computes row and column scalings intended to equilibrate an
+          !> M-by-N matrix A and reduce its condition number.  R returns the row
+          !> scale factors and C the column scale factors, chosen to try to make
+          !> the largest element in each row and column of the matrix B with
+          !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+          !> the radix.
+          !> R(i) and C(j) are restricted to be a power of the radix between
+          !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+          !> of these scaling factors is not guaranteed to reduce the condition
+          !> number of A but works well in practice.
+          !> This routine differs from CGEEQU by restricting the scaling factors
+          !> to a power of the radix.  Barring over- and underflow, scaling by
+          !> these factors introduces no additional rounding errors.  However, the
+          !> scaled entries' magnitudes are no longer approximately 1 but lie
+          !> between sqrt(radix) and 1/sqrt(radix).
+          interface gbequb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end subroutine cgbequb
+#else 
+               module procedure stdlib_cgbequb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(in) :: ab(ldab,*)
+               end subroutine dgbequb
+#else 
+               module procedure stdlib_dgbequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(in) :: ab(ldab,*)
+               end subroutine sgbequb
+#else 
+               module procedure stdlib_sgbequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end subroutine zgbequb
+#else 
+               module procedure stdlib_zgbequb
+#endif
+          end interface gbequb
+
+
+
+          !> GBRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is banded, and provides
+          !> error bounds and backward error estimates for the solution.
+          interface gbrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldafb,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cgbrfs
+#else 
+               module procedure stdlib_cgbrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldafb,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dgbrfs
+#else 
+               module procedure stdlib_dgbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldafb,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine sgbrfs
+#else 
+               module procedure stdlib_sgbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldafb,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zgbrfs
+#else 
+               module procedure stdlib_zgbrfs
+#endif
+          end interface gbrfs
+
+
+
+          !> GBSV: computes the solution to a complex system of linear equations
+          !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+          !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+          !> The LU decomposition with partial pivoting and row interchanges is
+          !> used to factor A as A = L * U, where L is a product of permutation
+          !> and unit lower triangular matrices with KL subdiagonals, and U is
+          !> upper triangular with KL+KU superdiagonals.  The factored form of A
+          !> is then used to solve the system of equations A * X = B.
+          interface gbsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs
+                    complex(sp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine cgbsv
+#else 
+               module procedure stdlib_cgbsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs
+                    real(dp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine dgbsv
+#else 
+               module procedure stdlib_dgbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs
+                    real(sp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine sgbsv
+#else 
+               module procedure stdlib_sgbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs
+                    complex(dp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine zgbsv
+#else 
+               module procedure stdlib_zgbsv
+#endif
+          end interface gbsv
+
+
+
+          !> GBTRF: computes an LU factorization of a complex m-by-n band matrix A
+          !> using partial pivoting with row interchanges.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface gbtrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine cgbtrf
+#else 
+               module procedure stdlib_cgbtrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dgbtrf
+#else 
+               module procedure stdlib_dgbtrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbtrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine sgbtrf
+#else 
+               module procedure stdlib_sgbtrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbtrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zgbtrf
+#else 
+               module procedure stdlib_zgbtrf
+#endif
+          end interface gbtrf
+
+
+
+          !> GBTRS: solves a system of linear equations
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B
+          !> with a general band matrix A using the LU factorization computed
+          !> by CGBTRF.
+          interface gbtrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cgbtrs
+#else 
+               module procedure stdlib_cgbtrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dgbtrs
+#else 
+               module procedure stdlib_dgbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine sgbtrs
+#else 
+               module procedure stdlib_sgbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,ldab,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zgbtrs
+#else 
+               module procedure stdlib_zgbtrs
+#endif
+          end interface gbtrs
+
+
+
+          !> GEBAK: forms the right or left eigenvectors of a complex general
+          !> matrix by backward transformation on the computed eigenvectors of the
+          !> balanced matrix output by CGEBAL.
+          interface gebak
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: scale(*)
+                    complex(sp), intent(inout) :: v(ldv,*)
+               end subroutine cgebak
+#else 
+               module procedure stdlib_cgebak
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: scale(*)
+                    real(dp), intent(inout) :: v(ldv,*)
+               end subroutine dgebak
+#else 
+               module procedure stdlib_dgebak
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgebak
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: v(ldv,*)
+                    real(sp), intent(in) :: scale(*)
+               end subroutine sgebak
+#else 
+               module procedure stdlib_sgebak
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgebak
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: scale(*)
+                    complex(dp), intent(inout) :: v(ldv,*)
+               end subroutine zgebak
+#else 
+               module procedure stdlib_zgebak
+#endif
+          end interface gebak
+
+
+
+          !> GEBAL: balances a general complex matrix A.  This involves, first,
+          !> permuting A by a similarity transformation to isolate eigenvalues
+          !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+          !> diagonal; and second, applying a diagonal similarity transformation
+          !> to rows and columns ILO to IHI to make the rows and columns as
+          !> close in norm as possible.  Both steps are optional.
+          !> Balancing may reduce the 1-norm of the matrix, and improve the
+          !> accuracy of the computed eigenvalues and/or eigenvectors.
+          interface gebal
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgebal( job, n, a, lda, ilo, ihi, scale, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: scale(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cgebal
+#else 
+               module procedure stdlib_cgebal
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgebal( job, n, a, lda, ilo, ihi, scale, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: scale(*)
+               end subroutine dgebal
+#else 
+               module procedure stdlib_dgebal
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgebal
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgebal( job, n, a, lda, ilo, ihi, scale, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: scale(*)
+               end subroutine sgebal
+#else 
+               module procedure stdlib_sgebal
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgebal
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgebal( job, n, a, lda, ilo, ihi, scale, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: scale(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zgebal
+#else 
+               module procedure stdlib_zgebal
+#endif
+          end interface gebal
+
+
+
+          !> GEBRD: reduces a general complex M-by-N matrix A to upper or lower
+          !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+          !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+          interface gebrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: taup(*),tauq(*),work(*)
+               end subroutine cgebrd
+#else 
+               module procedure stdlib_cgebrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*),e(*),taup(*),tauq(*),work(*)
+               end subroutine dgebrd
+#else 
+               module procedure stdlib_dgebrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgebrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*),e(*),taup(*),tauq(*),work(*)
+               end subroutine sgebrd
+#else 
+               module procedure stdlib_sgebrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgebrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: taup(*),tauq(*),work(*)
+               end subroutine zgebrd
+#else 
+               module procedure stdlib_zgebrd
+#endif
+          end interface gebrd
+
+
+
+          !> GECON: estimates the reciprocal of the condition number of a general
+          !> complex matrix A, in either the 1-norm or the infinity-norm, using
+          !> the LU factorization computed by CGETRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as
+          !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+          interface gecon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgecon( norm, n, a, lda, anorm, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgecon
+#else 
+               module procedure stdlib_cgecon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgecon( norm, n, a, lda, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,work(*)
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dgecon
+#else 
+               module procedure stdlib_dgecon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgecon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgecon( norm, n, a, lda, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,work(*)
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine sgecon
+#else 
+               module procedure stdlib_sgecon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgecon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgecon( norm, n, a, lda, anorm, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgecon
+#else 
+               module procedure stdlib_zgecon
+#endif
+          end interface gecon
+
+
+
+          !> GEEQU: computes row and column scalings intended to equilibrate an
+          !> M-by-N matrix A and reduce its condition number.  R returns the row
+          !> scale factors and C the column scale factors, chosen to try to make
+          !> the largest element in each row and column of the matrix B with
+          !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+          !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+          !> number and BIGNUM = largest safe number.  Use of these scaling
+          !> factors is not guaranteed to reduce the condition number of A but
+          !> works well in practice.
+          interface geequ
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end subroutine cgeequ
+#else 
+               module procedure stdlib_cgeequ
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dgeequ
+#else 
+               module procedure stdlib_dgeequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine sgeequ
+#else 
+               module procedure stdlib_sgeequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end subroutine zgeequ
+#else 
+               module procedure stdlib_zgeequ
+#endif
+          end interface geequ
+
+
+
+          !> GEEQUB: computes row and column scalings intended to equilibrate an
+          !> M-by-N matrix A and reduce its condition number.  R returns the row
+          !> scale factors and C the column scale factors, chosen to try to make
+          !> the largest element in each row and column of the matrix B with
+          !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+          !> the radix.
+          !> R(i) and C(j) are restricted to be a power of the radix between
+          !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+          !> of these scaling factors is not guaranteed to reduce the condition
+          !> number of A but works well in practice.
+          !> This routine differs from CGEEQU by restricting the scaling factors
+          !> to a power of the radix.  Barring over- and underflow, scaling by
+          !> these factors introduces no additional rounding errors.  However, the
+          !> scaled entries' magnitudes are no longer approximately 1 but lie
+          !> between sqrt(radix) and 1/sqrt(radix).
+          interface geequb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end subroutine cgeequb
+#else 
+               module procedure stdlib_cgeequb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dgeequb
+#else 
+               module procedure stdlib_dgeequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine sgeequb
+#else 
+               module procedure stdlib_sgeequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end subroutine zgeequb
+#else 
+               module procedure stdlib_zgeequb
+#endif
+          end interface geequb
+
+
+
+          !> GEES: computes for an N-by-N complex nonsymmetric matrix A, the
+          !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+          !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+          !> Optionally, it also orders the eigenvalues on the diagonal of the
+          !> Schur form so that selected eigenvalues are at the top left.
+          !> The leading columns of Z then form an orthonormal basis for the
+          !> invariant subspace corresponding to the selected eigenvalues.
+          !> A complex matrix is in Schur form if it is upper triangular.
+          interface gees
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgees( jobvs, sort, select, n, a, lda, sdim, w, vs,ldvs, work, lwork, &
+                         rwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_select_c
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvs,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldvs,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: vs(ldvs,*),w(*),work(*)
+                    procedure(stdlib_select_c) :: select
+               end subroutine cgees
+#else 
+               module procedure stdlib_cgees
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgees( jobvs, sort, select, n, a, lda, sdim, wr, wi,vs, ldvs, work, &
+                         lwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_select_d
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvs,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldvs,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: vs(ldvs,*),wi(*),work(*),wr(*)
+                    procedure(stdlib_select_d) :: select
+               end subroutine dgees
+#else 
+               module procedure stdlib_dgees
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgees
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgees( jobvs, sort, select, n, a, lda, sdim, wr, wi,vs, ldvs, work, &
+                         lwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_select_s 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvs,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldvs,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: vs(ldvs,*),wi(*),work(*),wr(*)
+                    procedure(stdlib_select_s) :: select
+               end subroutine sgees
+#else 
+               module procedure stdlib_sgees
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgees
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgees( jobvs, sort, select, n, a, lda, sdim, w, vs,ldvs, work, lwork, &
+                         rwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_select_z 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvs,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldvs,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: vs(ldvs,*),w(*),work(*)
+                    procedure(stdlib_select_z) :: select
+               end subroutine zgees
+#else 
+               module procedure stdlib_zgees
+#endif
+          end interface gees
+
+
+
+          !> GEEV: computes for an N-by-N complex nonsymmetric matrix A, the
+          !> eigenvalues and, optionally, the left and/or right eigenvectors.
+          !> The right eigenvector v(j) of A satisfies
+          !> A * v(j) = lambda(j) * v(j)
+          !> where lambda(j) is its eigenvalue.
+          !> The left eigenvector u(j) of A satisfies
+          !> u(j)**H * A = lambda(j) * u(j)**H
+          !> where u(j)**H denotes the conjugate transpose of u(j).
+          !> The computed eigenvectors are normalized to have Euclidean norm
+          !> equal to 1 and largest component real.
+          interface geev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgeev( jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr,work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldvl,ldvr,lwork,n
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: vl(ldvl,*),vr(ldvr,*),w(*),work(*)
+               end subroutine cgeev
+#else 
+               module procedure stdlib_cgeev
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgeev( jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr,ldvr, work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldvl,ldvr,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: vl(ldvl,*),vr(ldvr,*),wi(*),work(*),wr(*)
+               end subroutine dgeev
+#else 
+               module procedure stdlib_dgeev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgeev( jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr,ldvr, work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldvl,ldvr,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: vl(ldvl,*),vr(ldvr,*),wi(*),work(*),wr(*)
+               end subroutine sgeev
+#else 
+               module procedure stdlib_sgeev
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgeev( jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr,work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldvl,ldvr,lwork,n
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: vl(ldvl,*),vr(ldvr,*),w(*),work(*)
+               end subroutine zgeev
+#else 
+               module procedure stdlib_zgeev
+#endif
+          end interface geev
+
+
+
+          !> GEHRD: reduces a complex general matrix A to upper Hessenberg form H by
+          !> an unitary similarity transformation:  Q**H * A * Q = H .
+          interface gehrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgehrd
+#else 
+               module procedure stdlib_cgehrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgehrd
+#else 
+               module procedure stdlib_dgehrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgehrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgehrd
+#else 
+               module procedure stdlib_sgehrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgehrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgehrd
+#else 
+               module procedure stdlib_zgehrd
+#endif
+          end interface gehrd
+
+
+
+          !> GEJSV: computes the singular value decomposition (SVD) of a complex M-by-N
+          !> matrix [A], where M >= N. The SVD of [A] is written as
+          !> [A] = [U] * [SIGMA] * [V]^*,
+          !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+          !> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+          !> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+          !> the singular values of [A]. The columns of [U] and [V] are the left and
+          !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+          !> are computed and stored in the arrays U and V, respectively. The diagonal
+          !> of [SIGMA] is computed and stored in the array SVA.
+          interface gejsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, &
+                         ldu, v, ldv,cwork, lwork, rwork, lrwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldv,lwork,lrwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: u(ldu,*),v(ldv,*),cwork(lwork)
+                    real(sp), intent(out) :: sva(n),rwork(lrwork)
+                    character, intent(in) :: joba,jobp,jobr,jobt,jobu,jobv
+               end subroutine cgejsv
+#else 
+               module procedure stdlib_cgejsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, &
+                         ldu, v, ldv,work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldv,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: sva(n),u(ldu,*),v(ldv,*),work(lwork)
+                    character, intent(in) :: joba,jobp,jobr,jobt,jobu,jobv
+               end subroutine dgejsv
+#else 
+               module procedure stdlib_dgejsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgejsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, &
+                         ldu, v, ldv,work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldv,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: sva(n),u(ldu,*),v(ldv,*),work(lwork)
+                    character, intent(in) :: joba,jobp,jobr,jobt,jobu,jobv
+               end subroutine sgejsv
+#else 
+               module procedure stdlib_sgejsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgejsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, &
+                         ldu, v, ldv,cwork, lwork, rwork, lrwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldv,lwork,lrwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: u(ldu,*),v(ldv,*),cwork(lwork)
+                    real(dp), intent(out) :: sva(n),rwork(lrwork)
+                    character, intent(in) :: joba,jobp,jobr,jobt,jobu,jobv
+               end subroutine zgejsv
+#else 
+               module procedure stdlib_zgejsv
+#endif
+          end interface gejsv
+
+
+
+          !> GELQ: computes an LQ factorization of a complex M-by-N matrix A:
+          !> A = ( L 0 ) *  Q
+          !> where:
+          !> Q is a N-by-N orthogonal matrix;
+          !> L is a lower-triangular M-by-M matrix;
+          !> 0 is a M-by-(N-M) zero matrix, if M < N.
+          interface gelq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgelq( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(*),work(*)
+               end subroutine cgelq
+#else 
+               module procedure stdlib_cgelq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgelq( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(*),work(*)
+               end subroutine dgelq
+#else 
+               module procedure stdlib_dgelq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgelq( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(*),work(*)
+               end subroutine sgelq
+#else 
+               module procedure stdlib_sgelq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgelq( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(*),work(*)
+               end subroutine zgelq
+#else 
+               module procedure stdlib_zgelq
+#endif
+          end interface gelq
+
+
+
+          !> GELQF: computes an LQ factorization of a complex M-by-N matrix A:
+          !> A = ( L 0 ) *  Q
+          !> where:
+          !> Q is a N-by-N orthogonal matrix;
+          !> L is a lower-triangular M-by-M matrix;
+          !> 0 is a M-by-(N-M) zero matrix, if M < N.
+          interface gelqf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgelqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgelqf
+#else 
+               module procedure stdlib_cgelqf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgelqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgelqf
+#else 
+               module procedure stdlib_dgelqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgelqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgelqf
+#else 
+               module procedure stdlib_sgelqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgelqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgelqf
+#else 
+               module procedure stdlib_zgelqf
+#endif
+          end interface gelqf
+
+
+
+          !> GELQT: computes a blocked LQ factorization of a complex M-by-N matrix A
+          !> using the compact WY representation of Q.
+          interface gelqt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgelqt( m, n, mb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,mb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine cgelqt
+#else 
+               module procedure stdlib_cgelqt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgelqt( m, n, mb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,mb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine dgelqt
+#else 
+               module procedure stdlib_dgelqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgelqt( m, n, mb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,mb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine sgelqt
+#else 
+               module procedure stdlib_sgelqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgelqt( m, n, mb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,mb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine zgelqt
+#else 
+               module procedure stdlib_zgelqt
+#endif
+          end interface gelqt
+
+
+
+          !> GELQT3: recursively computes a LQ factorization of a complex M-by-N
+          !> matrix A, using the compact WY representation of Q.
+          !> Based on the algorithm of Elmroth and Gustavson,
+          !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+          interface gelqt3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine cgelqt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*)
+               end subroutine cgelqt3
+#else 
+               module procedure stdlib_cgelqt3
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine dgelqt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*)
+               end subroutine dgelqt3
+#else 
+               module procedure stdlib_dgelqt3
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelqt3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine sgelqt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*)
+               end subroutine sgelqt3
+#else 
+               module procedure stdlib_sgelqt3
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelqt3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine zgelqt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*)
+               end subroutine zgelqt3
+#else 
+               module procedure stdlib_zgelqt3
+#endif
+          end interface gelqt3
+
+
+
+          !> GELS: solves overdetermined or underdetermined complex linear systems
+          !> involving an M-by-N matrix A, or its conjugate-transpose, using a QR
+          !> or LQ factorization of A.  It is assumed that A has full rank.
+          !> The following options are provided:
+          !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+          !> an overdetermined system, i.e., solve the least squares problem
+          !> minimize || B - A*X ||.
+          !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+          !> an underdetermined system A * X = B.
+          !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+          !> an underdetermined system A**H * X = B.
+          !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+          !> an overdetermined system, i.e., solve the least squares problem
+          !> minimize || B - A**H * X ||.
+          !> Several right hand side vectors b and solution vectors x can be
+          !> handled in a single call; they are stored as the columns of the
+          !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+          !> matrix X.
+          interface gels
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgels
+#else 
+               module procedure stdlib_cgels
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgels
+#else 
+               module procedure stdlib_dgels
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgels
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgels
+#else 
+               module procedure stdlib_sgels
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgels
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgels
+#else 
+               module procedure stdlib_zgels
+#endif
+          end interface gels
+
+
+
+          !> GELSD: computes the minimum-norm solution to a real linear least
+          !> squares problem:
+          !> minimize 2-norm(| b - A*x |)
+          !> using the singular value decomposition (SVD) of A. A is an M-by-N
+          !> matrix which may be rank-deficient.
+          !> Several right hand side vectors b and solution vectors x can be
+          !> handled in a single call; they are stored as the columns of the
+          !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+          !> matrix X.
+          !> The problem is solved in three steps:
+          !> (1) Reduce the coefficient matrix A to bidiagonal form with
+          !> Householder transformations, reducing the original problem
+          !> into a "bidiagonal least squares problem" (BLS)
+          !> (2) Solve the BLS using a divide and conquer approach.
+          !> (3) Apply back all the Householder transformations to solve
+          !> the original least squares problem.
+          !> The effective rank of A is determined by treating as zero those
+          !> singular values which are less than RCOND times the largest singular
+          !> value.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface gelsd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(out) :: rwork(*),s(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgelsd
+#else 
+               module procedure stdlib_cgelsd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: s(*),work(*)
+               end subroutine dgelsd
+#else 
+               module procedure stdlib_dgelsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond,rank, work, lwork, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: s(*),work(*)
+               end subroutine sgelsd
+#else 
+               module procedure stdlib_sgelsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(out) :: rwork(*),s(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgelsd
+#else 
+               module procedure stdlib_zgelsd
+#endif
+          end interface gelsd
+
+
+
+          !> GELSS: computes the minimum norm solution to a complex linear
+          !> least squares problem:
+          !> Minimize 2-norm(| b - A*x |).
+          !> using the singular value decomposition (SVD) of A. A is an M-by-N
+          !> matrix which may be rank-deficient.
+          !> Several right hand side vectors b and solution vectors x can be
+          !> handled in a single call; they are stored as the columns of the
+          !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+          !> X.
+          !> The effective rank of A is determined by treating as zero those
+          !> singular values which are less than RCOND times the largest singular
+          !> value.
+          interface gelss
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(out) :: rwork(*),s(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgelss
+#else 
+               module procedure stdlib_cgelss
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: s(*),work(*)
+               end subroutine dgelss
+#else 
+               module procedure stdlib_dgelss
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelss
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: s(*),work(*)
+               end subroutine sgelss
+#else 
+               module procedure stdlib_sgelss
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelss
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(out) :: rwork(*),s(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgelss
+#else 
+               module procedure stdlib_zgelss
+#endif
+          end interface gelss
+
+
+
+          !> GELSY: computes the minimum-norm solution to a complex linear least
+          !> squares problem:
+          !> minimize || A * X - B ||
+          !> using a complete orthogonal factorization of A.  A is an M-by-N
+          !> matrix which may be rank-deficient.
+          !> Several right hand side vectors b and solution vectors x can be
+          !> handled in a single call; they are stored as the columns of the
+          !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+          !> matrix X.
+          !> The routine first computes a QR factorization with column pivoting:
+          !> A * P = Q * [ R11 R12 ]
+          !> [  0  R22 ]
+          !> with R11 defined as the largest leading submatrix whose estimated
+          !> condition number is less than 1/RCOND.  The order of R11, RANK,
+          !> is the effective rank of A.
+          !> Then, R22 is considered to be negligible, and R12 is annihilated
+          !> by unitary transformations from the right, arriving at the
+          !> complete orthogonal factorization:
+          !> A * P = Q * [ T11 0 ] * Z
+          !> [  0  0 ]
+          !> The minimum-norm solution is then
+          !> X = P * Z**H [ inv(T11)*Q1**H*B ]
+          !> [        0         ]
+          !> where Q1 consists of the first RANK columns of Q.
+          !> This routine is basically identical to the original xGELSX except
+          !> three differences:
+          !> o The permutation of matrix B (the right hand side) is faster and
+          !> more simple.
+          !> o The call to the subroutine xGEQPF has been substituted by the
+          !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+          !> version of the QR factorization with column pivoting.
+          !> o Matrix B (the right hand side) is updated with Blas-3.
+          interface gelsy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgelsy
+#else 
+               module procedure stdlib_cgelsy
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgelsy
+#else 
+               module procedure stdlib_dgelsy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgelsy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(in) :: rcond
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgelsy
+#else 
+               module procedure stdlib_sgelsy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgelsy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,rank
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(in) :: rcond
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgelsy
+#else 
+               module procedure stdlib_zgelsy
+#endif
+          end interface gelsy
+
+
+
+          !> GEMLQ: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product
+          !> of blocked elementary reflectors computed by short wide
+          !> LQ factorization (CGELQ)
+          interface gemlq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    complex(sp), intent(in) :: a(lda,*),t(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgemlq
+#else 
+               module procedure stdlib_cgemlq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    real(dp), intent(in) :: a(lda,*),t(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgemlq
+#else 
+               module procedure stdlib_dgemlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    real(sp), intent(in) :: a(lda,*),t(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgemlq
+#else 
+               module procedure stdlib_sgemlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    complex(dp), intent(in) :: a(lda,*),t(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgemlq
+#else 
+               module procedure stdlib_zgemlq
+#endif
+          end interface gemlq
+
+
+
+          !> GEMLQT: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q C            C Q
+          !> TRANS = 'C':   Q**H C            C Q**H
+          !> where Q is a complex unitary matrix defined as the product of K
+          !> elementary reflectors:
+          !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+          !> generated using the compact WY representation as returned by CGELQT.
+          !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+          interface gemlqt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,mb,ldt
+                    complex(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgemlqt
+#else 
+               module procedure stdlib_cgemlqt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,mb,ldt
+                    real(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgemlqt
+#else 
+               module procedure stdlib_dgemlqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemlqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,mb,ldt
+                    real(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgemlqt
+#else 
+               module procedure stdlib_sgemlqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemlqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,mb,ldt
+                    complex(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgemlqt
+#else 
+               module procedure stdlib_zgemlqt
+#endif
+          end interface gemlqt
+
+
+
+          !> GEMQR: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product
+          !> of blocked elementary reflectors computed by tall skinny
+          !> QR factorization (CGEQR)
+          interface gemqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    complex(sp), intent(in) :: a(lda,*),t(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgemqr
+#else 
+               module procedure stdlib_cgemqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    real(dp), intent(in) :: a(lda,*),t(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgemqr
+#else 
+               module procedure stdlib_dgemqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    real(sp), intent(in) :: a(lda,*),t(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgemqr
+#else 
+               module procedure stdlib_sgemqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,tsize,lwork,ldc
+                    complex(dp), intent(in) :: a(lda,*),t(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgemqr
+#else 
+               module procedure stdlib_zgemqr
+#endif
+          end interface gemqr
+
+
+
+          !> GEMQRT: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q C            C Q
+          !> TRANS = 'C':    Q**H C            C Q**H
+          !> where Q is a complex orthogonal matrix defined as the product of K
+          !> elementary reflectors:
+          !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+          !> generated using the compact WY representation as returned by CGEQRT.
+          !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+          interface gemqrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,nb,ldt
+                    complex(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgemqrt
+#else 
+               module procedure stdlib_cgemqrt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,nb,ldt
+                    real(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgemqrt
+#else 
+               module procedure stdlib_dgemqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgemqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,nb,ldt
+                    real(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgemqrt
+#else 
+               module procedure stdlib_sgemqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgemqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,ldc,m,n,nb,ldt
+                    complex(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgemqrt
+#else 
+               module procedure stdlib_zgemqrt
+#endif
+          end interface gemqrt
+
+
+
+          !> GEQLF: computes a QL factorization of a complex M-by-N matrix A:
+          !> A = Q * L.
+          interface geqlf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeqlf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgeqlf
+#else 
+               module procedure stdlib_cgeqlf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeqlf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgeqlf
+#else 
+               module procedure stdlib_dgeqlf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqlf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeqlf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgeqlf
+#else 
+               module procedure stdlib_sgeqlf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqlf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeqlf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgeqlf
+#else 
+               module procedure stdlib_zgeqlf
+#endif
+          end interface geqlf
+
+
+
+          !> GEQR: computes a QR factorization of a complex M-by-N matrix A:
+          !> A = Q * ( R ),
+          !> ( 0 )
+          !> where:
+          !> Q is a M-by-M orthogonal matrix;
+          !> R is an upper-triangular N-by-N matrix;
+          !> 0 is a (M-N)-by-N zero matrix, if M > N.
+          interface geqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(*),work(*)
+               end subroutine cgeqr
+#else 
+               module procedure stdlib_cgeqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(*),work(*)
+               end subroutine dgeqr
+#else 
+               module procedure stdlib_dgeqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(*),work(*)
+               end subroutine sgeqr
+#else 
+               module procedure stdlib_sgeqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,tsize,lwork
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(*),work(*)
+               end subroutine zgeqr
+#else 
+               module procedure stdlib_zgeqr
+#endif
+          end interface geqr
+
+
+
+          !> GEQR2P: computes a QR factorization of a complex m-by-n matrix A:
+          !> A = Q * ( R ),
+          !> ( 0 )
+          !> where:
+          !> Q is a m-by-m orthogonal matrix;
+          !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+          !> entries;
+          !> 0 is a (m-n)-by-n zero matrix, if m > n.
+          interface geqr2p
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgeqr2p( m, n, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgeqr2p
+#else 
+               module procedure stdlib_cgeqr2p
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgeqr2p( m, n, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgeqr2p
+#else 
+               module procedure stdlib_dgeqr2p
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqr2p
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgeqr2p( m, n, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgeqr2p
+#else 
+               module procedure stdlib_sgeqr2p
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqr2p
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgeqr2p( m, n, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgeqr2p
+#else 
+               module procedure stdlib_zgeqr2p
+#endif
+          end interface geqr2p
+
+
+
+          !> GEQRF: computes a QR factorization of a complex M-by-N matrix A:
+          !> A = Q * ( R ),
+          !> ( 0 )
+          !> where:
+          !> Q is a M-by-M orthogonal matrix;
+          !> R is an upper-triangular N-by-N matrix;
+          !> 0 is a (M-N)-by-N zero matrix, if M > N.
+          interface geqrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeqrf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgeqrf
+#else 
+               module procedure stdlib_cgeqrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeqrf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgeqrf
+#else 
+               module procedure stdlib_dgeqrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeqrf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgeqrf
+#else 
+               module procedure stdlib_sgeqrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeqrf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgeqrf
+#else 
+               module procedure stdlib_zgeqrf
+#endif
+          end interface geqrf
+
+
+
+          !> CGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+          !> A = Q * ( R ),
+          !> ( 0 )
+          !> where:
+          !> Q is a M-by-M orthogonal matrix;
+          !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+          !> entries;
+          !> 0 is a (M-N)-by-N zero matrix, if M > N.
+          interface geqrfp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgeqrfp( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgeqrfp
+#else 
+               module procedure stdlib_cgeqrfp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgeqrfp( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgeqrfp
+#else 
+               module procedure stdlib_dgeqrfp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqrfp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgeqrfp( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgeqrfp
+#else 
+               module procedure stdlib_sgeqrfp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqrfp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgeqrfp( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgeqrfp
+#else 
+               module procedure stdlib_zgeqrfp
+#endif
+          end interface geqrfp
+
+
+
+          !> GEQRT: computes a blocked QR factorization of a complex M-by-N matrix A
+          !> using the compact WY representation of Q.
+          interface geqrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine cgeqrt
+#else 
+               module procedure stdlib_cgeqrt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine dgeqrt
+#else 
+               module procedure stdlib_dgeqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine sgeqrt
+#else 
+               module procedure stdlib_sgeqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine zgeqrt
+#else 
+               module procedure stdlib_zgeqrt
+#endif
+          end interface geqrt
+
+
+
+          !> GEQRT2: computes a QR factorization of a complex M-by-N matrix A,
+          !> using the compact WY representation of Q.
+          interface geqrt2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgeqrt2( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*)
+               end subroutine cgeqrt2
+#else 
+               module procedure stdlib_cgeqrt2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgeqrt2( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*)
+               end subroutine dgeqrt2
+#else 
+               module procedure stdlib_dgeqrt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqrt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgeqrt2( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*)
+               end subroutine sgeqrt2
+#else 
+               module procedure stdlib_sgeqrt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqrt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgeqrt2( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*)
+               end subroutine zgeqrt2
+#else 
+               module procedure stdlib_zgeqrt2
+#endif
+          end interface geqrt2
+
+
+
+          !> GEQRT3: recursively computes a QR factorization of a complex M-by-N matrix A,
+          !> using the compact WY representation of Q.
+          !> Based on the algorithm of Elmroth and Gustavson,
+          !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+          interface geqrt3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine cgeqrt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*)
+               end subroutine cgeqrt3
+#else 
+               module procedure stdlib_cgeqrt3
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine dgeqrt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*)
+               end subroutine dgeqrt3
+#else 
+               module procedure stdlib_dgeqrt3
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgeqrt3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine sgeqrt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*)
+               end subroutine sgeqrt3
+#else 
+               module procedure stdlib_sgeqrt3
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgeqrt3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine zgeqrt3( m, n, a, lda, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,ldt
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*)
+               end subroutine zgeqrt3
+#else 
+               module procedure stdlib_zgeqrt3
+#endif
+          end interface geqrt3
+
+
+
+          !> GERFS: improves the computed solution to a system of linear
+          !> equations and provides error bounds and backward error estimates for
+          !> the solution.
+          interface gerfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cgerfs
+#else 
+               module procedure stdlib_cgerfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dgerfs
+#else 
+               module procedure stdlib_dgerfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgerfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine sgerfs
+#else 
+               module procedure stdlib_sgerfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgerfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zgerfs
+#else 
+               module procedure stdlib_zgerfs
+#endif
+          end interface gerfs
+
+
+
+          !> GERQF: computes an RQ factorization of a complex M-by-N matrix A:
+          !> A = R * Q.
+          interface gerqf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgerqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine cgerqf
+#else 
+               module procedure stdlib_cgerqf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgerqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dgerqf
+#else 
+               module procedure stdlib_dgerqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgerqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgerqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine sgerqf
+#else 
+               module procedure stdlib_sgerqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgerqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgerqf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zgerqf
+#else 
+               module procedure stdlib_zgerqf
+#endif
+          end interface gerqf
+
+
+
+          !> GESDD: computes the singular value decomposition (SVD) of a complex
+          !> M-by-N matrix A, optionally computing the left and/or right singular
+          !> vectors, by using divide-and-conquer method. The SVD is written
+          !> A = U * SIGMA * conjugate-transpose(V)
+          !> where SIGMA is an M-by-N matrix which is zero except for its
+          !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+          !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+          !> are the singular values of A; they are real and non-negative, and
+          !> are returned in descending order.  The first min(m,n) columns of
+          !> U and V are the left and right singular vectors of A.
+          !> Note that the routine returns VT = V**H, not V.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface gesdd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(sp), intent(out) :: rwork(*),s(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine cgesdd
+#else 
+               module procedure stdlib_cgesdd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: s(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine dgesdd
+#else 
+               module procedure stdlib_dgesdd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgesdd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: s(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine sgesdd
+#else 
+               module procedure stdlib_sgesdd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgesdd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(dp), intent(out) :: rwork(*),s(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine zgesdd
+#else 
+               module procedure stdlib_zgesdd
+#endif
+          end interface gesdd
+
+
+
+          !> GESV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+          !> The LU decomposition with partial pivoting and row interchanges is
+          !> used to factor A as
+          !> A = P * L * U,
+          !> where P is a permutation matrix, L is unit lower triangular, and U is
+          !> upper triangular.  The factored form of A is then used to solve the
+          !> system of equations A * X = B.
+          interface gesv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine cgesv
+#else 
+               module procedure stdlib_cgesv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine dgesv
+#else 
+               module procedure stdlib_dgesv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgesv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine sgesv
+#else 
+               module procedure stdlib_sgesv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgesv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine zgesv
+#else 
+               module procedure stdlib_zgesv
+#endif
+          end interface gesv
+
+
+
+          !> GESVD: computes the singular value decomposition (SVD) of a complex
+          !> M-by-N matrix A, optionally computing the left and/or right singular
+          !> vectors. The SVD is written
+          !> A = U * SIGMA * conjugate-transpose(V)
+          !> where SIGMA is an M-by-N matrix which is zero except for its
+          !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+          !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+          !> are the singular values of A; they are real and non-negative, and
+          !> are returned in descending order.  The first min(m,n) columns of
+          !> U and V are the left and right singular vectors of A.
+          !> Note that the routine returns V**H, not V.
+          interface gesvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu,jobvt
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(sp), intent(out) :: rwork(*),s(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine cgesvd
+#else 
+               module procedure stdlib_cgesvd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu,jobvt
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: s(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine dgesvd
+#else 
+               module procedure stdlib_dgesvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgesvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu,jobvt
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: s(*),u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine sgesvd
+#else 
+               module procedure stdlib_sgesvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgesvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu,jobvt
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldu,ldvt,lwork,m,n
+                    real(dp), intent(out) :: rwork(*),s(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine zgesvd
+#else 
+               module procedure stdlib_zgesvd
+#endif
+          end interface gesvd
+
+
+
+          !> GESVDQ: computes the singular value decomposition (SVD) of a complex
+          !> M-by-N matrix A, where M >= N. The SVD of A is written as
+          !> [++]   [xx]   [x0]   [xx]
+          !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+          !> [++]   [xx]
+          !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+          !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+          !> of SIGMA are the singular values of A. The columns of U and V are the
+          !> left and the right singular vectors of A, respectively.
+          interface gesvdq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+                         numrank, iwork, liwork,cwork, lcwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: joba,jobp,jobr,jobu,jobv
+                    integer(ilp), intent(in) :: m,n,lda,ldu,ldv,liwork,lrwork
+                    integer(ilp), intent(out) :: numrank,info,iwork(*)
+                    integer(ilp), intent(inout) :: lcwork
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: u(ldu,*),v(ldv,*),cwork(*)
+                    real(sp), intent(out) :: s(*),rwork(*)
+               end subroutine cgesvdq
+#else 
+               module procedure stdlib_cgesvdq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+                         numrank, iwork, liwork,work, lwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: joba,jobp,jobr,jobu,jobv
+                    integer(ilp), intent(in) :: m,n,lda,ldu,ldv,liwork,lrwork
+                    integer(ilp), intent(out) :: numrank,info,iwork(*)
+                    integer(ilp), intent(inout) :: lwork
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: u(ldu,*),v(ldv,*),work(*),s(*),rwork(*)
+               end subroutine dgesvdq
+#else 
+               module procedure stdlib_dgesvdq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgesvdq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+                         numrank, iwork, liwork,work, lwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: joba,jobp,jobr,jobu,jobv
+                    integer(ilp), intent(in) :: m,n,lda,ldu,ldv,liwork,lrwork
+                    integer(ilp), intent(out) :: numrank,info,iwork(*)
+                    integer(ilp), intent(inout) :: lwork
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: u(ldu,*),v(ldv,*),work(*),s(*),rwork(*)
+               end subroutine sgesvdq
+#else 
+               module procedure stdlib_sgesvdq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgesvdq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+                         numrank, iwork, liwork,cwork, lcwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: joba,jobp,jobr,jobu,jobv
+                    integer(ilp), intent(in) :: m,n,lda,ldu,ldv,liwork,lrwork
+                    integer(ilp), intent(out) :: numrank,info,iwork(*)
+                    integer(ilp), intent(inout) :: lcwork
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: u(ldu,*),v(ldv,*),cwork(*)
+                    real(dp), intent(out) :: s(*),rwork(*)
+               end subroutine zgesvdq
+#else 
+               module procedure stdlib_zgesvdq
+#endif
+          end interface gesvdq
+
+
+
+          !> GESVJ: computes the singular value decomposition (SVD) of a complex
+          !> M-by-N matrix A, where M >= N. The SVD of A is written as
+          !> [++]   [xx]   [x0]   [xx]
+          !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+          !> [++]   [xx]
+          !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+          !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+          !> of SIGMA are the singular values of A. The columns of U and V are the
+          !> left and the right singular vectors of A, respectively.
+          interface gesvj
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, cwork, &
+                         lwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,lrwork,m,mv,n
+                    character, intent(in) :: joba,jobu,jobv
+                    complex(sp), intent(inout) :: a(lda,*),v(ldv,*),cwork(lwork)
+                    real(sp), intent(inout) :: rwork(lrwork)
+                    real(sp), intent(out) :: sva(n)
+               end subroutine cgesvj
+#else 
+               module procedure stdlib_cgesvj
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n
+                    character, intent(in) :: joba,jobu,jobv
+                    real(dp), intent(inout) :: a(lda,*),v(ldv,*),work(lwork)
+                    real(dp), intent(out) :: sva(n)
+               end subroutine dgesvj
+#else 
+               module procedure stdlib_dgesvj
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgesvj
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n
+                    character, intent(in) :: joba,jobu,jobv
+                    real(sp), intent(inout) :: a(lda,*),v(ldv,*),work(lwork)
+                    real(sp), intent(out) :: sva(n)
+               end subroutine sgesvj
+#else 
+               module procedure stdlib_sgesvj
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgesvj
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, cwork, &
+                         lwork, rwork, lrwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,lrwork,m,mv,n
+                    character, intent(in) :: joba,jobu,jobv
+                    complex(dp), intent(inout) :: a(lda,*),v(ldv,*),cwork(lwork)
+                    real(dp), intent(inout) :: rwork(lrwork)
+                    real(dp), intent(out) :: sva(n)
+               end subroutine zgesvj
+#else 
+               module procedure stdlib_zgesvj
+#endif
+          end interface gesvj
+
+
+
+          !> GETRF: computes an LU factorization of a general M-by-N matrix A
+          !> using partial pivoting with row interchanges.
+          !> The factorization has the form
+          !> A = P * L * U
+          !> where P is a permutation matrix, L is lower triangular with unit
+          !> diagonal elements (lower trapezoidal if m > n), and U is upper
+          !> triangular (upper trapezoidal if m < n).
+          !> This is the right-looking Level 3 BLAS version of the algorithm.
+          interface getrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgetrf( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cgetrf
+#else 
+               module procedure stdlib_cgetrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgetrf( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dgetrf
+#else 
+               module procedure stdlib_dgetrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgetrf( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine sgetrf
+#else 
+               module procedure stdlib_sgetrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgetrf( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zgetrf
+#else 
+               module procedure stdlib_zgetrf
+#endif
+          end interface getrf
+
+
+
+          !> GETRF2: computes an LU factorization of a general M-by-N matrix A
+          !> using partial pivoting with row interchanges.
+          !> The factorization has the form
+          !> A = P * L * U
+          !> where P is a permutation matrix, L is lower triangular with unit
+          !> diagonal elements (lower trapezoidal if m > n), and U is upper
+          !> triangular (upper trapezoidal if m < n).
+          !> This is the recursive version of the algorithm. It divides
+          !> the matrix into four submatrices:
+          !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+          !> A = [ -----|----- ]  with n1 = min(m,n)/2
+          !> [  A21 | A22  ]       n2 = n-n1
+          !> [ A11 ]
+          !> The subroutine calls itself to factor [ --- ],
+          !> [ A12 ]
+          !> [ A12 ]
+          !> do the swaps on [ --- ], solve A12, update A22,
+          !> [ A22 ]
+          !> then calls itself to factor A22 and do the swaps on A21.
+          interface getrf2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine cgetrf2( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cgetrf2
+#else 
+               module procedure stdlib_cgetrf2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine dgetrf2( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dgetrf2
+#else 
+               module procedure stdlib_dgetrf2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetrf2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine sgetrf2( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine sgetrf2
+#else 
+               module procedure stdlib_sgetrf2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetrf2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine zgetrf2( m, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zgetrf2
+#else 
+               module procedure stdlib_zgetrf2
+#endif
+          end interface getrf2
+
+
+
+          !> GETRI: computes the inverse of a matrix using the LU factorization
+          !> computed by CGETRF.
+          !> This method inverts U and then computes inv(A) by solving the system
+          !> inv(A)*L = inv(U) for inv(A).
+          interface getri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgetri( n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgetri
+#else 
+               module procedure stdlib_cgetri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgetri( n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgetri
+#else 
+               module procedure stdlib_dgetri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgetri( n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgetri
+#else 
+               module procedure stdlib_sgetri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgetri( n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgetri
+#else 
+               module procedure stdlib_zgetri
+#endif
+          end interface getri
+
+
+
+          !> GETRS: solves a system of linear equations
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B
+          !> with a general N-by-N matrix A using the LU factorization computed
+          !> by CGETRF.
+          interface getrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cgetrs
+#else 
+               module procedure stdlib_cgetrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dgetrs
+#else 
+               module procedure stdlib_dgetrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine sgetrs
+#else 
+               module procedure stdlib_sgetrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zgetrs
+#else 
+               module procedure stdlib_zgetrs
+#endif
+          end interface getrs
+
+
+
+          !> GETSLS: solves overdetermined or underdetermined complex linear systems
+          !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+          !> factorization of A.  It is assumed that A has full rank.
+          !> The following options are provided:
+          !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+          !> an overdetermined system, i.e., solve the least squares problem
+          !> minimize || B - A*X ||.
+          !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+          !> an underdetermined system A * X = B.
+          !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+          !> an undetermined system A**T * X = B.
+          !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+          !> an overdetermined system, i.e., solve the least squares problem
+          !> minimize || B - A**T * X ||.
+          !> Several right hand side vectors b and solution vectors x can be
+          !> handled in a single call; they are stored as the columns of the
+          !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+          !> matrix X.
+          interface getsls
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgetsls
+#else 
+               module procedure stdlib_cgetsls
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dgetsls
+#else 
+               module procedure stdlib_dgetsls
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetsls
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sgetsls
+#else 
+               module procedure stdlib_sgetsls
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetsls
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgetsls
+#else 
+               module procedure stdlib_zgetsls
+#endif
+          end interface getsls
+
+
+
+          !> GETSQRHRT: computes a NB2-sized column blocked QR-factorization
+          !> of a complex M-by-N matrix A with M >= N,
+          !> A = Q * R.
+          !> The routine uses internally a NB1-sized column blocked and MB1-sized
+          !> row blocked TSQR-factorization and perfors the reconstruction
+          !> of the Householder vectors from the TSQR output. The routine also
+          !> converts the R_tsqr factor from the TSQR-factorization output into
+          !> the R factor that corresponds to the Householder QR-factorization,
+          !> A = Q_tsqr * R_tsqr = Q * R.
+          !> The output Q and R factors are stored in the same format as in CGEQRT
+          !> (Q is in blocked compact WY-representation). See the documentation
+          !> of CGEQRT for more details on the format.
+          interface getsqrhrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,nb1,nb2,mb1
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine cgetsqrhrt
+#else 
+               module procedure stdlib_cgetsqrhrt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,nb1,nb2,mb1
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine dgetsqrhrt
+#else 
+               module procedure stdlib_dgetsqrhrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgetsqrhrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,nb1,nb2,mb1
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine sgetsqrhrt
+#else 
+               module procedure stdlib_sgetsqrhrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgetsqrhrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,nb1,nb2,mb1
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine zgetsqrhrt
+#else 
+               module procedure stdlib_zgetsqrhrt
+#endif
+          end interface getsqrhrt
+
+
+
+          !> GGBAK: forms the right or left eigenvectors of a complex generalized
+          !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+          !> the computed eigenvectors of the balanced pair of matrices output by
+          !> CGGBAL.
+          interface ggbak
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: lscale(*),rscale(*)
+                    complex(sp), intent(inout) :: v(ldv,*)
+               end subroutine cggbak
+#else 
+               module procedure stdlib_cggbak
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: lscale(*),rscale(*)
+                    real(dp), intent(inout) :: v(ldv,*)
+               end subroutine dggbak
+#else 
+               module procedure stdlib_dggbak
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggbak
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: lscale(*),rscale(*)
+                    real(sp), intent(inout) :: v(ldv,*)
+               end subroutine sggbak
+#else 
+               module procedure stdlib_sggbak
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggbak
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job,side
+                    integer(ilp), intent(in) :: ihi,ilo,ldv,m,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: lscale(*),rscale(*)
+                    complex(dp), intent(inout) :: v(ldv,*)
+               end subroutine zggbak
+#else 
+               module procedure stdlib_zggbak
+#endif
+          end interface ggbak
+
+
+
+          !> GGBAL: balances a pair of general complex matrices (A,B).  This
+          !> involves, first, permuting A and B by similarity transformations to
+          !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+          !> elements on the diagonal; and second, applying a diagonal similarity
+          !> transformation to rows and columns ILO to IHI to make the rows
+          !> and columns as close in norm as possible. Both steps are optional.
+          !> Balancing may reduce the 1-norm of the matrices, and improve the
+          !> accuracy of the computed eigenvalues and/or eigenvectors in the
+          !> generalized eigenvalue problem A*x = lambda*B*x.
+          interface ggbal
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,ldb,n
+                    real(sp), intent(out) :: lscale(*),rscale(*),work(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine cggbal
+#else 
+               module procedure stdlib_cggbal
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,ldb,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: lscale(*),rscale(*),work(*)
+               end subroutine dggbal
+#else 
+               module procedure stdlib_dggbal
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggbal
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,ldb,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: lscale(*),rscale(*),work(*)
+               end subroutine sggbal
+#else 
+               module procedure stdlib_sggbal
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggbal
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: job
+                    integer(ilp), intent(out) :: ihi,ilo,info
+                    integer(ilp), intent(in) :: lda,ldb,n
+                    real(dp), intent(out) :: lscale(*),rscale(*),work(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine zggbal
+#else 
+               module procedure stdlib_zggbal
+#endif
+          end interface ggbal
+
+
+
+          !> GGES: computes for a pair of N-by-N complex nonsymmetric matrices
+          !> (A,B), the generalized eigenvalues, the generalized complex Schur
+          !> form (S, T), and optionally left and/or right Schur vectors (VSL
+          !> and VSR). This gives the generalized Schur factorization
+          !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+          !> where (VSR)**H is the conjugate-transpose of VSR.
+          !> Optionally, it also orders the eigenvalues so that a selected cluster
+          !> of eigenvalues appears in the leading diagonal blocks of the upper
+          !> triangular matrix S and the upper triangular matrix T. The leading
+          !> columns of VSL and VSR then form an unitary basis for the
+          !> corresponding left and right eigenspaces (deflating subspaces).
+          !> (If only the generalized eigenvalues are needed, use the driver
+          !> CGGEV instead, which is faster.)
+          !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+          !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+          !> usually represented as the pair (alpha,beta), as there is a
+          !> reasonable interpretation for beta=0, and even for both being zero.
+          !> A pair of matrices (S,T) is in generalized complex Schur form if S
+          !> and T are upper triangular and, in addition, the diagonal elements
+          !> of T are non-negative real numbers.
+          interface gges
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alpha, &
+                         beta, vsl, ldvsl, vsr, ldvsr, work,lwork, rwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_selctg_c 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvsl,jobvsr,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldb,ldvsl,ldvsr,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: alpha(*),beta(*),vsl(ldvsl,*),vsr(ldvsr,*),work(*)
+                              
+                    procedure(stdlib_selctg_c) :: selctg
+               end subroutine cgges
+#else 
+               module procedure stdlib_cgges
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alphar, &
+                         alphai, beta, vsl, ldvsl, vsr,ldvsr, work, lwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_selctg_d 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvsl,jobvsr,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldb,ldvsl,ldvsr,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: alphai(*),alphar(*),beta(*),vsl(ldvsl,*),vsr(ldvsr,*)&
+                              ,work(*)
+                    procedure(stdlib_selctg_d) :: selctg
+               end subroutine dgges
+#else 
+               module procedure stdlib_dgges
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgges
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alphar, &
+                         alphai, beta, vsl, ldvsl, vsr,ldvsr, work, lwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_selctg_s 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvsl,jobvsr,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldb,ldvsl,ldvsr,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: alphai(*),alphar(*),beta(*),vsl(ldvsl,*),vsr(ldvsr,*)&
+                              ,work(*)
+                    procedure(stdlib_selctg_s) :: selctg
+               end subroutine sgges
+#else 
+               module procedure stdlib_sgges
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgges
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alpha, &
+                         beta, vsl, ldvsl, vsr, ldvsr, work,lwork, rwork, bwork, info )
+                    import sp,dp,qp,ilp,lk,stdlib_selctg_z
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvsl,jobvsr,sort
+                    integer(ilp), intent(out) :: info,sdim
+                    integer(ilp), intent(in) :: lda,ldb,ldvsl,ldvsr,lwork,n
+                    logical(lk), intent(out) :: bwork(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: alpha(*),beta(*),vsl(ldvsl,*),vsr(ldvsr,*),work(*)
+                              
+                    procedure(stdlib_selctg_z) :: selctg
+               end subroutine zgges
+#else 
+               module procedure stdlib_zgges
+#endif
+          end interface gges
+
+
+
+          !> GGEV: computes for a pair of N-by-N complex nonsymmetric matrices
+          !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+          !> right generalized eigenvectors.
+          !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+          !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+          !> singular. It is usually represented as the pair (alpha,beta), as
+          !> there is a reasonable interpretation for beta=0, and even for both
+          !> being zero.
+          !> The right generalized eigenvector v(j) corresponding to the
+          !> generalized eigenvalue lambda(j) of (A,B) satisfies
+          !> A * v(j) = lambda(j) * B * v(j).
+          !> The left generalized eigenvector u(j) corresponding to the
+          !> generalized eigenvalues lambda(j) of (A,B) satisfies
+          !> u(j)**H * A = lambda(j) * u(j)**H * B
+          !> where u(j)**H is the conjugate-transpose of u(j).
+          interface ggev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cggev( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+                         work, lwork, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,n
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: alpha(*),beta(*),vl(ldvl,*),vr(ldvr,*),work(*)
+                              
+               end subroutine cggev
+#else 
+               module procedure stdlib_cggev
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dggev( jobvl, jobvr, n, a, lda, b, ldb, alphar, alphai,beta, vl, ldvl, &
+                         vr, ldvr, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: alphai(*),alphar(*),beta(*),vl(ldvl,*),vr(ldvr,*),&
+                              work(*)
+               end subroutine dggev
+#else 
+               module procedure stdlib_dggev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sggev( jobvl, jobvr, n, a, lda, b, ldb, alphar, alphai,beta, vl, ldvl, &
+                         vr, ldvr, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: alphai(*),alphar(*),beta(*),vl(ldvl,*),vr(ldvr,*),&
+                              work(*)
+               end subroutine sggev
+#else 
+               module procedure stdlib_sggev
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zggev( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+                         work, lwork, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobvl,jobvr
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,n
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: alpha(*),beta(*),vl(ldvl,*),vr(ldvr,*),work(*)
+                              
+               end subroutine zggev
+#else 
+               module procedure stdlib_zggev
+#endif
+          end interface ggev
+
+
+
+          !> GGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+          !> minimize || y ||_2   subject to   d = A*x + B*y
+          !> x
+          !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+          !> given N-vector. It is assumed that M <= N <= M+P, and
+          !> rank(A) = M    and    rank( A B ) = N.
+          !> Under these assumptions, the constrained equation is always
+          !> consistent, and there is a unique solution x and a minimal 2-norm
+          !> solution y, which is obtained using a generalized QR factorization
+          !> of the matrices (A, B) given by
+          !> A = Q*(R),   B = Q*T*Z.
+          !> (0)
+          !> In particular, if matrix B is square nonsingular, then the problem
+          !> GLM is equivalent to the following weighted linear least squares
+          !> problem
+          !> minimize || inv(B)*(d-A*x) ||_2
+          !> x
+          !> where inv(B) denotes the inverse of B.
+          interface ggglm
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),d(*)
+                    complex(sp), intent(out) :: work(*),x(*),y(*)
+               end subroutine cggglm
+#else 
+               module procedure stdlib_cggglm
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),d(*)
+                    real(dp), intent(out) :: work(*),x(*),y(*)
+               end subroutine dggglm
+#else 
+               module procedure stdlib_dggglm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggglm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),d(*)
+                    real(sp), intent(out) :: work(*),x(*),y(*)
+               end subroutine sggglm
+#else 
+               module procedure stdlib_sggglm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggglm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),d(*)
+                    complex(dp), intent(out) :: work(*),x(*),y(*)
+               end subroutine zggglm
+#else 
+               module procedure stdlib_zggglm
+#endif
+          end interface ggglm
+
+
+
+          !> GGHRD: reduces a pair of complex matrices (A,B) to generalized upper
+          !> Hessenberg form using unitary transformations, where A is a
+          !> general matrix and B is upper triangular.  The form of the generalized
+          !> eigenvalue problem is
+          !> A*x = lambda*B*x,
+          !> and B is typically made upper triangular by computing its QR
+          !> factorization and moving the unitary matrix Q to the left side
+          !> of the equation.
+          !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+          !> Q**H*A*Z = H
+          !> and transforms B to another upper triangular matrix T:
+          !> Q**H*B*Z = T
+          !> in order to reduce the problem to its standard form
+          !> H*y = lambda*T*y
+          !> where y = Z**H*x.
+          !> The unitary matrices Q and Z are determined as products of Givens
+          !> rotations.  They may either be formed explicitly, or they may be
+          !> postmultiplied into input matrices Q1 and Z1, so that
+          !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+          !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+          !> If Q1 is the unitary matrix from the QR factorization of B in the
+          !> original equation A*x = lambda*B*x, then GGHRD reduces the original
+          !> problem to generalized Hessenberg form.
+          interface gghrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine cgghrd
+#else 
+               module procedure stdlib_cgghrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine dgghrd
+#else 
+               module procedure stdlib_dgghrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgghrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine sgghrd
+#else 
+               module procedure stdlib_sgghrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgghrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine zgghrd
+#else 
+               module procedure stdlib_zgghrd
+#endif
+          end interface gghrd
+
+
+
+          !> GGLSE: solves the linear equality-constrained least squares (LSE)
+          !> problem:
+          !> minimize || c - A*x ||_2   subject to   B*x = d
+          !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+          !> M-vector, and d is a given P-vector. It is assumed that
+          !> P <= N <= M+P, and
+          !> rank(B) = P and  rank( (A) ) = N.
+          !> ( (B) )
+          !> These conditions ensure that the LSE problem has a unique solution,
+          !> which is obtained using a generalized RQ factorization of the
+          !> matrices (B, A) given by
+          !> B = (0 R)*Q,   A = Z*T*Q.
+          interface gglse
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),c(*),d(*)
+                    complex(sp), intent(out) :: work(*),x(*)
+               end subroutine cgglse
+#else 
+               module procedure stdlib_cgglse
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),c(*),d(*)
+                    real(dp), intent(out) :: work(*),x(*)
+               end subroutine dgglse
+#else 
+               module procedure stdlib_dgglse
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgglse
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),c(*),d(*)
+                    real(sp), intent(out) :: work(*),x(*)
+               end subroutine sgglse
+#else 
+               module procedure stdlib_sgglse
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgglse
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),c(*),d(*)
+                    complex(dp), intent(out) :: work(*),x(*)
+               end subroutine zgglse
+#else 
+               module procedure stdlib_zgglse
+#endif
+          end interface gglse
+
+
+
+          !> GGQRF: computes a generalized QR factorization of an N-by-M matrix A
+          !> and an N-by-P matrix B:
+          !> A = Q*R,        B = Q*T*Z,
+          !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix,
+          !> and R and T assume one of the forms:
+          !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+          !> (  0  ) N-M                         N   M-N
+          !> M
+          !> where R11 is upper triangular, and
+          !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+          !> P-N  N                           ( T21 ) P
+          !> P
+          !> where T12 or T21 is upper triangular.
+          !> In particular, if B is square and nonsingular, the GQR factorization
+          !> of A and B implicitly gives the QR factorization of inv(B)*A:
+          !> inv(B)*A = Z**H * (inv(T)*R)
+          !> where inv(B) denotes the inverse of the matrix B, and Z' denotes the
+          !> conjugate transpose of matrix Z.
+          interface ggqrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine cggqrf
+#else 
+               module procedure stdlib_cggqrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine dggqrf
+#else 
+               module procedure stdlib_dggqrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggqrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine sggqrf
+#else 
+               module procedure stdlib_sggqrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggqrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine zggqrf
+#else 
+               module procedure stdlib_zggqrf
+#endif
+          end interface ggqrf
+
+
+
+          !> GGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+          !> and a P-by-N matrix B:
+          !> A = R*Q,        B = Z*T*Q,
+          !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary
+          !> matrix, and R and T assume one of the forms:
+          !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+          !> N-M  M                           ( R21 ) N
+          !> N
+          !> where R12 or R21 is upper triangular, and
+          !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+          !> (  0  ) P-N                         P   N-P
+          !> N
+          !> where T11 is upper triangular.
+          !> In particular, if B is square and nonsingular, the GRQ factorization
+          !> of A and B implicitly gives the RQ factorization of A*inv(B):
+          !> A*inv(B) = (R*inv(T))*Z**H
+          !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+          !> conjugate transpose of the matrix Z.
+          interface ggrqf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine cggrqf
+#else 
+               module procedure stdlib_cggrqf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine dggrqf
+#else 
+               module procedure stdlib_dggrqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qggrqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine sggrqf
+#else 
+               module procedure stdlib_sggrqf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wggrqf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,lwork,m,n,p
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: taua(*),taub(*),work(*)
+               end subroutine zggrqf
+#else 
+               module procedure stdlib_zggrqf
+#endif
+          end interface ggrqf
+
+
+
+          !> GSVJ0: is called from CGESVJ as a pre-processor and that is its main
+          !> purpose. It applies Jacobi rotations in the same way as CGESVJ does, but
+          !> it does not check convergence (stopping criterion). Few tuning
+          !> parameters (marked by [TP]) are available for the implementer.
+          interface gsvj0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+                         nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,nsweep
+                    real(sp), intent(in) :: eps,sfmin,tol
+                    character, intent(in) :: jobv
+                    complex(sp), intent(inout) :: a(lda,*),d(n),v(ldv,*)
+                    complex(sp), intent(out) :: work(lwork)
+                    real(sp), intent(inout) :: sva(n)
+               end subroutine cgsvj0
+#else 
+               module procedure stdlib_cgsvj0
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+                         nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,nsweep
+                    real(dp), intent(in) :: eps,sfmin,tol
+                    character, intent(in) :: jobv
+                    real(dp), intent(inout) :: a(lda,*),sva(n),d(n),v(ldv,*)
+                    real(dp), intent(out) :: work(lwork)
+               end subroutine dgsvj0
+#else 
+               module procedure stdlib_dgsvj0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgsvj0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+                         nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,nsweep
+                    real(sp), intent(in) :: eps,sfmin,tol
+                    character, intent(in) :: jobv
+                    real(sp), intent(inout) :: a(lda,*),sva(n),d(n),v(ldv,*)
+                    real(sp), intent(out) :: work(lwork)
+               end subroutine sgsvj0
+#else 
+               module procedure stdlib_sgsvj0
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgsvj0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+                         nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,nsweep
+                    real(dp), intent(in) :: eps,sfmin,tol
+                    character, intent(in) :: jobv
+                    complex(dp), intent(inout) :: a(lda,*),d(n),v(ldv,*)
+                    complex(dp), intent(out) :: work(lwork)
+                    real(dp), intent(inout) :: sva(n)
+               end subroutine zgsvj0
+#else 
+               module procedure stdlib_zgsvj0
+#endif
+          end interface gsvj0
+
+
+
+          !> GSVJ1: is called from CGESVJ as a pre-processor and that is its main
+          !> purpose. It applies Jacobi rotations in the same way as CGESVJ does, but
+          !> it targets only particular pivots and it does not check convergence
+          !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+          !> available for the implementer.
+          !> Further Details
+          !> ~~~~~~~~~~~~~~~
+          !> GSVJ1 applies few sweeps of Jacobi rotations in the column space of
+          !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+          !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+          !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+          !> [x]'s in the following scheme:
+          !> | *  *  * [x] [x] [x]|
+          !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+          !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+          !> |[x] [x] [x] *  *  * |
+          !> |[x] [x] [x] *  *  * |
+          !> |[x] [x] [x] *  *  * |
+          !> In terms of the columns of A, the first N1 columns are rotated 'against'
+          !> the remaining N-N1 columns, trying to increase the angle between the
+          !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+          !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+          !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+          !> is given in TOL.
+          interface gsvj1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol,&
+                          nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: eps,sfmin,tol
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,n1,nsweep
+                    character, intent(in) :: jobv
+                    complex(sp), intent(inout) :: a(lda,*),d(n),v(ldv,*)
+                    complex(sp), intent(out) :: work(lwork)
+                    real(sp), intent(inout) :: sva(n)
+               end subroutine cgsvj1
+#else 
+               module procedure stdlib_cgsvj1
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol,&
+                          nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: eps,sfmin,tol
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,n1,nsweep
+                    character, intent(in) :: jobv
+                    real(dp), intent(inout) :: a(lda,*),d(n),sva(n),v(ldv,*)
+                    real(dp), intent(out) :: work(lwork)
+               end subroutine dgsvj1
+#else 
+               module procedure stdlib_dgsvj1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgsvj1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol,&
+                          nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: eps,sfmin,tol
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,n1,nsweep
+                    character, intent(in) :: jobv
+                    real(sp), intent(inout) :: a(lda,*),d(n),sva(n),v(ldv,*)
+                    real(sp), intent(out) :: work(lwork)
+               end subroutine sgsvj1
+#else 
+               module procedure stdlib_sgsvj1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgsvj1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol,&
+                          nsweep, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: eps,sfmin,tol
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldv,lwork,m,mv,n,n1,nsweep
+                    character, intent(in) :: jobv
+                    complex(dp), intent(inout) :: a(lda,*),d(n),v(ldv,*)
+                    complex(dp), intent(out) :: work(lwork)
+                    real(dp), intent(inout) :: sva(n)
+               end subroutine zgsvj1
+#else 
+               module procedure stdlib_zgsvj1
+#endif
+          end interface gsvj1
+
+
+
+          !> GTCON: estimates the reciprocal of the condition number of a complex
+          !> tridiagonal matrix A using the LU factorization as computed by
+          !> CGTTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface gtcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: d(*),dl(*),du(*),du2(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cgtcon
+#else 
+               module procedure stdlib_cgtcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(in) :: anorm,d(*),dl(*),du(*),du2(*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dgtcon
+#else 
+               module procedure stdlib_dgtcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgtcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(in) :: anorm,d(*),dl(*),du(*),du2(*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine sgtcon
+#else 
+               module procedure stdlib_sgtcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgtcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: d(*),dl(*),du(*),du2(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zgtcon
+#else 
+               module procedure stdlib_zgtcon
+#endif
+          end interface gtcon
+
+
+
+          !> GTRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is tridiagonal, and provides
+          !> error bounds and backward error estimates for the solution.
+          interface gtrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: b(ldb,*),d(*),df(*),dl(*),dlf(*),du(*),du2(*),duf(&
+                              *)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cgtrfs
+#else 
+               module procedure stdlib_cgtrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: b(ldb,*),d(*),df(*),dl(*),dlf(*),du(*),du2(*),duf(*)
+                              
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dgtrfs
+#else 
+               module procedure stdlib_dgtrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgtrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: b(ldb,*),d(*),df(*),dl(*),dlf(*),du(*),du2(*),duf(*)
+                              
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine sgtrfs
+#else 
+               module procedure stdlib_sgtrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgtrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, &
+                         x, ldx, ferr, berr, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: b(ldb,*),d(*),df(*),dl(*),dlf(*),du(*),du2(*),duf(&
+                              *)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zgtrfs
+#else 
+               module procedure stdlib_zgtrfs
+#endif
+          end interface gtrfs
+
+
+
+          !> GTSV:  solves the equation
+          !> A*X = B,
+          !> where A is an N-by-N tridiagonal matrix, by Gaussian elimination with
+          !> partial pivoting.
+          !> Note that the equation  A**T *X = B  may be solved by interchanging the
+          !> order of the arguments DU and DL.
+          interface gtsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgtsv( n, nrhs, dl, d, du, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(inout) :: b(ldb,*),d(*),dl(*),du(*)
+               end subroutine cgtsv
+#else 
+               module procedure stdlib_cgtsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgtsv( n, nrhs, dl, d, du, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: b(ldb,*),d(*),dl(*),du(*)
+               end subroutine dgtsv
+#else 
+               module procedure stdlib_dgtsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgtsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgtsv( n, nrhs, dl, d, du, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: b(ldb,*),d(*),dl(*),du(*)
+               end subroutine sgtsv
+#else 
+               module procedure stdlib_sgtsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgtsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgtsv( n, nrhs, dl, d, du, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(inout) :: b(ldb,*),d(*),dl(*),du(*)
+               end subroutine zgtsv
+#else 
+               module procedure stdlib_zgtsv
+#endif
+          end interface gtsv
+
+
+
+          !> GTTRF: computes an LU factorization of a complex tridiagonal matrix A
+          !> using elimination with partial pivoting and row interchanges.
+          !> The factorization has the form
+          !> A = L * U
+          !> where L is a product of permutation and unit lower bidiagonal
+          !> matrices and U is upper triangular with nonzeros in only the main
+          !> diagonal and first two superdiagonals.
+          interface gttrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgttrf( n, dl, d, du, du2, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: d(*),dl(*),du(*)
+                    complex(sp), intent(out) :: du2(*)
+               end subroutine cgttrf
+#else 
+               module procedure stdlib_cgttrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgttrf( n, dl, d, du, du2, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*),dl(*),du(*)
+                    real(dp), intent(out) :: du2(*)
+               end subroutine dgttrf
+#else 
+               module procedure stdlib_dgttrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgttrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgttrf( n, dl, d, du, du2, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*),dl(*),du(*)
+                    real(sp), intent(out) :: du2(*)
+               end subroutine sgttrf
+#else 
+               module procedure stdlib_sgttrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgttrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgttrf( n, dl, d, du, du2, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: d(*),dl(*),du(*)
+                    complex(dp), intent(out) :: du2(*)
+               end subroutine zgttrf
+#else 
+               module procedure stdlib_zgttrf
+#endif
+          end interface gttrf
+
+
+
+          !> GTTRS: solves one of the systems of equations
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+          !> with a tridiagonal matrix A using the LU factorization computed
+          !> by CGTTRF.
+          interface gttrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(in) :: d(*),dl(*),du(*),du2(*)
+               end subroutine cgttrs
+#else 
+               module procedure stdlib_cgttrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+                    real(dp), intent(in) :: d(*),dl(*),du(*),du2(*)
+               end subroutine dgttrs
+#else 
+               module procedure stdlib_dgttrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qgttrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+                    real(sp), intent(in) :: d(*),dl(*),du(*),du2(*)
+               end subroutine sgttrs
+#else 
+               module procedure stdlib_sgttrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wgttrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(in) :: d(*),dl(*),du(*),du2(*)
+               end subroutine zgttrs
+#else 
+               module procedure stdlib_zgttrs
+#endif
+          end interface gttrs
+
+
+
+          !> HB2ST_KERNELS: is an internal routine used by the CHETRD_HB2ST
+          !> subroutine.
+          interface hb2st_kernels
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine  chb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, &
+                         lda, v, tau, ldvt, work)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: wantz
+                    integer(ilp), intent(in) :: ttype,st,ed,sweep,n,nb,ib,lda,ldvt
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: v(*),tau(*),work(*)
+               end subroutine chb2st_kernels
+#else 
+               module procedure stdlib_chb2st_kernels
+#endif
+#:if WITH_QP
+               module procedure stdlib_whb2st_kernels
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine  zhb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, &
+                         lda, v, tau, ldvt, work)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: wantz
+                    integer(ilp), intent(in) :: ttype,st,ed,sweep,n,nb,ib,lda,ldvt
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: v(*),tau(*),work(*)
+               end subroutine zhb2st_kernels
+#else 
+               module procedure stdlib_zhb2st_kernels
+#endif
+          end interface hb2st_kernels
+
+
+
+          !> HBEV: computes all the eigenvalues and, optionally, eigenvectors of
+          !> a complex Hermitian band matrix A.
+          interface hbev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldz,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chbev
+#else 
+               module procedure stdlib_chbev
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldz,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhbev
+#else 
+               module procedure stdlib_zhbev
+#endif
+          end interface hbev
+
+
+
+          !> HBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+          !> a complex Hermitian band matrix A.  If eigenvectors are desired, it
+          !> uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface hbevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldz,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chbevd
+#else 
+               module procedure stdlib_chbevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldz,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhbevd
+#else 
+               module procedure stdlib_zhbevd
+#endif
+          end interface hbevd
+
+
+
+          !> HBGST: reduces a complex Hermitian-definite banded generalized
+          !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+          !> such that C has the same bandwidth as A.
+          !> B must have been previously factorized as S**H*S by CPBSTF, using a
+          !> split Cholesky factorization. A is overwritten by C = X**H*A*X, where
+          !> X = S**(-1)*Q and Q is a unitary matrix chosen to preserve the
+          !> bandwidth of A.
+          interface hbgst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldx,n
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+                    complex(sp), intent(in) :: bb(ldbb,*)
+                    complex(sp), intent(out) :: work(*),x(ldx,*)
+               end subroutine chbgst
+#else 
+               module procedure stdlib_chbgst
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbgst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldx,n
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+                    complex(dp), intent(in) :: bb(ldbb,*)
+                    complex(dp), intent(out) :: work(*),x(ldx,*)
+               end subroutine zhbgst
+#else 
+               module procedure stdlib_zhbgst
+#endif
+          end interface hbgst
+
+
+
+          !> HBGV: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite banded eigenproblem, of
+          !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+          !> and banded, and B is also positive definite.
+          interface hbgv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chbgv
+#else 
+               module procedure stdlib_chbgv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbgv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhbgv
+#else 
+               module procedure stdlib_zhbgv
+#endif
+          end interface hbgv
+
+
+
+          !> HBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite banded eigenproblem, of
+          !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+          !> and banded, and B is also positive definite.  If eigenvectors are
+          !> desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface hbgvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+                         lwork, rwork, lrwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chbgvd
+#else 
+               module procedure stdlib_chbgvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbgvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+                         lwork, rwork, lrwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhbgvd
+#else 
+               module procedure stdlib_zhbgvd
+#endif
+          end interface hbgvd
+
+
+
+          !> HBTRD: reduces a complex Hermitian band matrix A to real symmetric
+          !> tridiagonal form T by a unitary similarity transformation:
+          !> Q**H * A * Q = T.
+          interface hbtrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldq,n
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: ab(ldab,*),q(ldq,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chbtrd
+#else 
+               module procedure stdlib_chbtrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whbtrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldq,n
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: ab(ldab,*),q(ldq,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhbtrd
+#else 
+               module procedure stdlib_zhbtrd
+#endif
+          end interface hbtrd
+
+
+
+          !> HECON: estimates the reciprocal of the condition number of a complex
+          !> Hermitian matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHETRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface hecon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine checon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine checon
+#else 
+               module procedure stdlib_checon
+#endif
+#:if WITH_QP
+               module procedure stdlib_whecon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhecon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhecon
+#else 
+               module procedure stdlib_zhecon
+#endif
+          end interface hecon
+
+
+
+          !> HECON_ROOK: estimates the reciprocal of the condition number of a complex
+          !> Hermitian matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHETRF_ROOK.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface hecon_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine checon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine checon_rook
+#else 
+               module procedure stdlib_checon_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whecon_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhecon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhecon_rook
+#else 
+               module procedure stdlib_zhecon_rook
+#endif
+          end interface hecon_rook
+
+
+
+          !> HEEQUB: computes row and column scalings intended to equilibrate a
+          !> Hermitian matrix A (with respect to the Euclidean norm) and reduce
+          !> its condition number. The scale factors S are computed by the BIN
+          !> algorithm (see references) so that the scaled matrix B with elements
+          !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+          !> the smallest possible condition number over all possible diagonal
+          !> scalings.
+          interface heequb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cheequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    character, intent(in) :: uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cheequb
+#else 
+               module procedure stdlib_cheequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wheequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zheequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    character, intent(in) :: uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zheequb
+#else 
+               module procedure stdlib_zheequb
+#endif
+          end interface heequb
+
+
+
+          !> HEEV: computes all eigenvalues and, optionally, eigenvectors of a
+          !> complex Hermitian matrix A.
+          interface heev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cheev( jobz, uplo, n, a, lda, w, work, lwork, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cheev
+#else 
+               module procedure stdlib_cheev
+#endif
+#:if WITH_QP
+               module procedure stdlib_wheev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zheev( jobz, uplo, n, a, lda, w, work, lwork, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zheev
+#else 
+               module procedure stdlib_zheev
+#endif
+          end interface heev
+
+
+
+          !> HEEVD: computes all eigenvalues and, optionally, eigenvectors of a
+          !> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
+          !> divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface heevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork,lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cheevd
+#else 
+               module procedure stdlib_cheevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wheevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork,lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zheevd
+#else 
+               module procedure stdlib_zheevd
+#endif
+          end interface heevd
+
+
+
+          !> HEEVR: computes selected eigenvalues and, optionally, eigenvectors
+          !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+          !> be selected by specifying either a range of values or a range of
+          !> indices for the desired eigenvalues.
+          !> HEEVR first reduces the matrix A to tridiagonal form T with a call
+          !> to CHETRD.  Then, whenever possible, HEEVR calls CSTEMR to compute
+          !> the eigenspectrum using Relatively Robust Representations.  CSTEMR
+          !> computes eigenvalues by the dqds algorithm, while orthogonal
+          !> eigenvectors are computed from various "good" L D L^T representations
+          !> (also known as Relatively Robust Representations). Gram-Schmidt
+          !> orthogonalization is avoided as far as possible. More specifically,
+          !> the various steps of the algorithm are as follows.
+          !> For each unreduced block (submatrix) of T,
+          !> (a) Compute T - sigma I  = L D L^T, so that L and D
+          !> define all the wanted eigenvalues to high relative accuracy.
+          !> This means that small relative changes in the entries of D and L
+          !> cause only small relative changes in the eigenvalues and
+          !> eigenvectors. The standard (unfactored) representation of the
+          !> tridiagonal matrix T does not have this property in general.
+          !> (b) Compute the eigenvalues to suitable accuracy.
+          !> If the eigenvectors are desired, the algorithm attains full
+          !> accuracy of the computed eigenvalues only right before
+          !> the corresponding vectors have to be computed, see steps c) and d).
+          !> (c) For each cluster of close eigenvalues, select a new
+          !> shift close to the cluster, find a new factorization, and refine
+          !> the shifted eigenvalues to suitable accuracy.
+          !> (d) For each eigenvalue with a large enough relative separation compute
+          !> the corresponding eigenvector by forming a rank revealing twisted
+          !> factorization. Go back to (c) for any clusters that remain.
+          !> The desired accuracy of the output can be specified by the input
+          !> parameter ABSTOL.
+          !> For more details, see CSTEMR's documentation and:
+          !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+          !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+          !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+          !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+          !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+          !> 2004.  Also LAPACK Working Note 154.
+          !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+          !> tridiagonal eigenvalue/eigenvector problem",
+          !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+          !> UC Berkeley, May 1997.
+          !> Note 1 : HEEVR calls CSTEMR when the full spectrum is requested
+          !> on machines which conform to the ieee-754 floating point standard.
+          !> HEEVR calls SSTEBZ and CSTEIN on non-ieee machines and
+          !> when partial spectrum requests are made.
+          !> Normal execution of CSTEMR may create NaNs and infinities and
+          !> hence may abort due to a floating point exception in environments
+          !> which do not handle NaNs and infinities in the ieee standard default
+          !> manner.
+          interface heevr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cheevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, &
+                         ldz, isuppz, work, lwork,rwork, lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range,uplo
+                    integer(ilp), intent(in) :: il,iu,lda,ldz,liwork,lrwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: abstol,vl,vu
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine cheevr
+#else 
+               module procedure stdlib_cheevr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wheevr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zheevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, &
+                         ldz, isuppz, work, lwork,rwork, lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range,uplo
+                    integer(ilp), intent(in) :: il,iu,lda,ldz,liwork,lrwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: abstol,vl,vu
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zheevr
+#else 
+               module procedure stdlib_zheevr
+#endif
+          end interface heevr
+
+
+
+          !> HEGST: reduces a complex Hermitian-definite generalized
+          !> eigenproblem to standard form.
+          !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+          !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+          !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+          !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+          !> B must have been previously factorized as U**H*U or L*L**H by CPOTRF.
+          interface hegst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chegst( itype, uplo, n, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,n
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine chegst
+#else 
+               module procedure stdlib_chegst
+#endif
+#:if WITH_QP
+               module procedure stdlib_whegst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhegst( itype, uplo, n, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,n
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine zhegst
+#else 
+               module procedure stdlib_zhegst
+#endif
+          end interface hegst
+
+
+
+          !> HEGV: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+          !> Here A and B are assumed to be Hermitian and B is also
+          !> positive definite.
+          interface hegv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chegv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chegv
+#else 
+               module procedure stdlib_chegv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whegv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhegv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhegv
+#else 
+               module procedure stdlib_zhegv
+#endif
+          end interface hegv
+
+
+
+          !> HEGVD: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+          !> B are assumed to be Hermitian and B is also positive definite.
+          !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface hegvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chegvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,lda,ldb,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chegvd
+#else 
+               module procedure stdlib_chegvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whegvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhegvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,lda,ldb,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhegvd
+#else 
+               module procedure stdlib_zhegvd
+#endif
+          end interface hegvd
+
+
+
+          !> HERFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian indefinite, and
+          !> provides error bounds and backward error estimates for the solution.
+          interface herfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cherfs
+#else 
+               module procedure stdlib_cherfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wherfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zherfs
+#else 
+               module procedure stdlib_zherfs
+#endif
+          end interface herfs
+
+
+
+          !> HESV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> The diagonal pivoting method is used to factor A as
+          !> A = U * D * U**H,  if UPLO = 'U', or
+          !> A = L * D * L**H,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is Hermitian and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+          !> used to solve the system of equations A * X = B.
+          interface hesv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chesv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chesv
+#else 
+               module procedure stdlib_chesv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whesv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhesv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhesv
+#else 
+               module procedure stdlib_zhesv
+#endif
+          end interface hesv
+
+
+
+          !> HESV_AA: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> Aasen's algorithm is used to factor A as
+          !> A = U**H * T * U,  if UPLO = 'U', or
+          !> A = L * T * L**H,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and T is Hermitian and tridiagonal. The factored form
+          !> of A is then used to solve the system of equations A * X = B.
+          interface hesv_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chesv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chesv_aa
+#else 
+               module procedure stdlib_chesv_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_whesv_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhesv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhesv_aa
+#else 
+               module procedure stdlib_zhesv_aa
+#endif
+          end interface hesv_aa
+
+
+
+          !> HESV_RK: computes the solution to a complex system of linear
+          !> equations A * X = B, where A is an N-by-N Hermitian matrix
+          !> and X and B are N-by-NRHS matrices.
+          !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+          !> to factor A as
+          !> A = P*U*D*(U**H)*(P**T),  if UPLO = 'U', or
+          !> A = P*L*D*(L**H)*(P**T),  if UPLO = 'L',
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is Hermitian and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> CHETRF_RK is called to compute the factorization of a complex
+          !> Hermitian matrix.  The factored form of A is then used to solve
+          !> the system of equations A * X = B by calling BLAS3 routine CHETRS_3.
+          interface hesv_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chesv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: e(*),work(*)
+               end subroutine chesv_rk
+#else 
+               module procedure stdlib_chesv_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_whesv_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhesv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: e(*),work(*)
+               end subroutine zhesv_rk
+#else 
+               module procedure stdlib_zhesv_rk
+#endif
+          end interface hesv_rk
+
+
+
+          !> HESV_ROOK: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> The bounded Bunch-Kaufman ("rook") diagonal pivoting method is used
+          !> to factor A as
+          !> A = U * D * U**T,  if UPLO = 'U', or
+          !> A = L * D * L**T,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is Hermitian and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> CHETRF_ROOK is called to compute the factorization of a complex
+          !> Hermition matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+          !> pivoting method.
+          !> The factored form of A is then used to solve the system
+          !> of equations A * X = B by calling CHETRS_ROOK (uses BLAS 2).
+          interface hesv_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chesv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chesv_rook
+#else 
+               module procedure stdlib_chesv_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whesv_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhesv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhesv_rook
+#else 
+               module procedure stdlib_zhesv_rook
+#endif
+          end interface hesv_rook
+
+
+
+          !> HESWAPR: applies an elementary permutation on the rows and the columns of
+          !> a hermitian matrix.
+          interface heswapr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cheswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    complex(sp), intent(inout) :: a(lda,n)
+               end subroutine cheswapr
+#else 
+               module procedure stdlib_cheswapr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wheswapr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zheswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    complex(dp), intent(inout) :: a(lda,n)
+               end subroutine zheswapr
+#else 
+               module procedure stdlib_zheswapr
+#endif
+          end interface heswapr
+
+
+
+          !> HETF2_RK: computes the factorization of a complex Hermitian matrix A
+          !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+          !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is Hermitian and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+          !> For more information see Further Details section.
+          interface hetf2_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*)
+               end subroutine chetf2_rk
+#else 
+               module procedure stdlib_chetf2_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetf2_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*)
+               end subroutine zhetf2_rk
+#else 
+               module procedure stdlib_zhetf2_rk
+#endif
+          end interface hetf2_rk
+
+
+
+          !> HETF2_ROOK: computes the factorization of a complex Hermitian matrix A
+          !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+          !> A = U*D*U**H  or  A = L*D*L**H
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, U**H is the conjugate transpose of U, and D is
+          !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+          interface hetf2_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine chetf2_rook
+#else 
+               module procedure stdlib_chetf2_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetf2_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zhetf2_rook
+#else 
+               module procedure stdlib_zhetf2_rook
+#endif
+          end interface hetf2_rook
+
+
+
+          !> HETRD: reduces a complex Hermitian matrix A to real symmetric
+          !> tridiagonal form T by a unitary similarity transformation:
+          !> Q**H * A * Q = T.
+          interface hetrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine chetrd
+#else 
+               module procedure stdlib_chetrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zhetrd
+#else 
+               module procedure stdlib_zhetrd
+#endif
+          end interface hetrd
+
+
+
+          !> HETRD_HB2ST: reduces a complex Hermitian band matrix A to real symmetric
+          !> tridiagonal form T by a unitary similarity transformation:
+          !> Q**H * A * Q = T.
+          interface hetrd_hb2st
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrd_hb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, &
+                         lhous, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: stage1,uplo,vect
+                    integer(ilp), intent(in) :: n,kd,ldab,lhous,lwork
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+                    complex(sp), intent(out) :: hous(*),work(*)
+               end subroutine chetrd_hb2st
+#else 
+               module procedure stdlib_chetrd_hb2st
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrd_hb2st
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrd_hb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, &
+                         lhous, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: stage1,uplo,vect
+                    integer(ilp), intent(in) :: n,kd,ldab,lhous,lwork
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+                    complex(dp), intent(out) :: hous(*),work(*)
+               end subroutine zhetrd_hb2st
+#else 
+               module procedure stdlib_zhetrd_hb2st
+#endif
+          end interface hetrd_hb2st
+
+
+
+          !> HETRD_HE2HB: reduces a complex Hermitian matrix A to complex Hermitian
+          !> band-diagonal form AB by a unitary similarity transformation:
+          !> Q**H * A * Q = AB.
+          interface hetrd_he2hb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrd_he2hb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldab,lwork,n,kd
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: ab(ldab,*),tau(*),work(*)
+               end subroutine chetrd_he2hb
+#else 
+               module procedure stdlib_chetrd_he2hb
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrd_he2hb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrd_he2hb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldab,lwork,n,kd
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: ab(ldab,*),tau(*),work(*)
+               end subroutine zhetrd_he2hb
+#else 
+               module procedure stdlib_zhetrd_he2hb
+#endif
+          end interface hetrd_he2hb
+
+
+
+          !> HETRF: computes the factorization of a complex Hermitian matrix A
+          !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+          !> factorization is
+          !> A = U*D*U**H  or  A = L*D*L**H
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is Hermitian and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface hetrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetrf
+#else 
+               module procedure stdlib_chetrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetrf
+#else 
+               module procedure stdlib_zhetrf
+#endif
+          end interface hetrf
+
+
+
+          !> HETRF_AA: computes the factorization of a complex hermitian matrix A
+          !> using the Aasen's algorithm.  The form of the factorization is
+          !> A = U**H*T*U  or  A = L*T*L**H
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and T is a hermitian tridiagonal matrix.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface hetrf_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetrf_aa
+#else 
+               module procedure stdlib_chetrf_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrf_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetrf_aa
+#else 
+               module procedure stdlib_zhetrf_aa
+#endif
+          end interface hetrf_aa
+
+
+
+          !> HETRF_RK: computes the factorization of a complex Hermitian matrix A
+          !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+          !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is Hermitian and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          !> For more information see Further Details section.
+          interface hetrf_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*),work(*)
+               end subroutine chetrf_rk
+#else 
+               module procedure stdlib_chetrf_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrf_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*),work(*)
+               end subroutine zhetrf_rk
+#else 
+               module procedure stdlib_zhetrf_rk
+#endif
+          end interface hetrf_rk
+
+
+
+          !> HETRF_ROOK: computes the factorization of a complex Hermitian matrix A
+          !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+          !> The form of the factorization is
+          !> A = U*D*U**T  or  A = L*D*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is Hermitian and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface hetrf_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetrf_rook
+#else 
+               module procedure stdlib_chetrf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetrf_rook
+#else 
+               module procedure stdlib_zhetrf_rook
+#endif
+          end interface hetrf_rook
+
+
+
+          !> HETRI: computes the inverse of a complex Hermitian indefinite matrix
+          !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+          !> CHETRF.
+          interface hetri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetri
+#else 
+               module procedure stdlib_chetri
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetri
+#else 
+               module procedure stdlib_zhetri
+#endif
+          end interface hetri
+
+
+
+          !> HETRI_ROOK: computes the inverse of a complex Hermitian indefinite matrix
+          !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+          !> CHETRF_ROOK.
+          interface hetri_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetri_rook
+#else 
+               module procedure stdlib_chetri_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetri_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetri_rook
+#else 
+               module procedure stdlib_zhetri_rook
+#endif
+          end interface hetri_rook
+
+
+
+          !> HETRS: solves a system of linear equations A*X = B with a complex
+          !> Hermitian matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHETRF.
+          interface hetrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine chetrs
+#else 
+               module procedure stdlib_chetrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zhetrs
+#else 
+               module procedure stdlib_zhetrs
+#endif
+          end interface hetrs
+
+
+
+          !> HETRS2: solves a system of linear equations A*X = B with a complex
+          !> Hermitian matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHETRF and converted by CSYCONV.
+          interface hetrs2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetrs2
+#else 
+               module procedure stdlib_chetrs2
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrs2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetrs2
+#else 
+               module procedure stdlib_zhetrs2
+#endif
+          end interface hetrs2
+
+
+
+          !> HETRS_3: solves a system of linear equations A * X = B with a complex
+          !> Hermitian matrix A using the factorization computed
+          !> by CHETRF_RK or CHETRF_BK:
+          !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is Hermitian and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This algorithm is using Level 3 BLAS.
+          interface hetrs_3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*),e(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine chetrs_3
+#else 
+               module procedure stdlib_chetrs_3
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrs_3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*),e(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zhetrs_3
+#else 
+               module procedure stdlib_zhetrs_3
+#endif
+          end interface hetrs_3
+
+
+
+          !> HETRS_AA: solves a system of linear equations A*X = B with a complex
+          !> hermitian matrix A using the factorization A = U**H*T*U or
+          !> A = L*T*L**H computed by CHETRF_AA.
+          interface hetrs_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chetrs_aa
+#else 
+               module procedure stdlib_chetrs_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrs_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhetrs_aa
+#else 
+               module procedure stdlib_zhetrs_aa
+#endif
+          end interface hetrs_aa
+
+
+
+          !> HETRS_ROOK: solves a system of linear equations A*X = B with a complex
+          !> Hermitian matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHETRF_ROOK.
+          interface hetrs_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine chetrs_rook
+#else 
+               module procedure stdlib_chetrs_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_whetrs_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zhetrs_rook
+#else 
+               module procedure stdlib_zhetrs_rook
+#endif
+          end interface hetrs_rook
+
+
+
+          !> Level 3 BLAS like routine for C in RFP Format.
+          !> HFRK: performs one of the Hermitian rank--k operations
+          !> C := alpha*A*A**H + beta*C,
+          !> or
+          !> C := alpha*A**H*A + beta*C,
+          !> where alpha and beta are real scalars, C is an n--by--n Hermitian
+          !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+          !> matrix in the second case.
+          interface hfrk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: k,lda,n
+                    character, intent(in) :: trans,transr,uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: c(*)
+               end subroutine chfrk
+#else 
+               module procedure stdlib_chfrk
+#endif
+#:if WITH_QP
+               module procedure stdlib_whfrk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: k,lda,n
+                    character, intent(in) :: trans,transr,uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: c(*)
+               end subroutine zhfrk
+#else 
+               module procedure stdlib_zhfrk
+#endif
+          end interface hfrk
+
+
+
+          !> HGEQZ: computes the eigenvalues of a complex matrix pair (H,T),
+          !> where H is an upper Hessenberg matrix and T is upper triangular,
+          !> using the single-shift QZ method.
+          !> Matrix pairs of this type are produced by the reduction to
+          !> generalized upper Hessenberg form of a complex matrix pair (A,B):
+          !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+          !> as computed by CGGHRD.
+          !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+          !> also reduced to generalized Schur form,
+          !> H = Q*S*Z**H,  T = Q*P*Z**H,
+          !> where Q and Z are unitary matrices and S and P are upper triangular.
+          !> Optionally, the unitary matrix Q from the generalized Schur
+          !> factorization may be postmultiplied into an input matrix Q1, and the
+          !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+          !> If Q1 and Z1 are the unitary matrices from CGGHRD that reduced
+          !> the matrix pair (A,B) to generalized Hessenberg form, then the output
+          !> matrices Q1*Q and Z1*Z are the unitary factors from the generalized
+          !> Schur factorization of (A,B):
+          !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+          !> To avoid overflow, eigenvalues of the matrix pair (H,T)
+          !> (equivalently, of (A,B)) are computed as a pair of complex values
+          !> (alpha,beta).  If beta is nonzero, lambda = alpha / beta is an
+          !> eigenvalue of the generalized nonsymmetric eigenvalue problem (GNEP)
+          !> A*x = lambda*B*x
+          !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+          !> alternate form of the GNEP
+          !> mu*A*y = B*y.
+          !> The values of alpha and beta for the i-th eigenvalue can be read
+          !> directly from the generalized Schur form:  alpha = S(i,i),
+          !> beta = P(i,i).
+          !> Ref: C.B. Moler
+          !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+          !> pp. 241--256.
+          interface hgeqz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alpha, beta, q, &
+                         ldq, z, ldz, work, lwork,rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz,job
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldq,ldt,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(out) :: alpha(*),beta(*),work(*)
+                    complex(sp), intent(inout) :: h(ldh,*),q(ldq,*),t(ldt,*),z(ldz,*)
+               end subroutine chgeqz
+#else 
+               module procedure stdlib_chgeqz
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dhgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alphar, alphai, &
+                         beta, q, ldq, z, ldz, work,lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz,job
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldq,ldt,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: alphai(*),alphar(*),beta(*),work(*)
+                    real(dp), intent(inout) :: h(ldh,*),q(ldq,*),t(ldt,*),z(ldz,*)
+               end subroutine dhgeqz
+#else 
+               module procedure stdlib_dhgeqz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qhgeqz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine shgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alphar, alphai, &
+                         beta, q, ldq, z, ldz, work,lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz,job
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldq,ldt,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: alphai(*),alphar(*),beta(*),work(*)
+                    real(sp), intent(inout) :: h(ldh,*),q(ldq,*),t(ldt,*),z(ldz,*)
+               end subroutine shgeqz
+#else 
+               module procedure stdlib_shgeqz
+#endif
+#:if WITH_QP
+               module procedure stdlib_whgeqz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alpha, beta, q, &
+                         ldq, z, ldz, work, lwork,rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,compz,job
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldq,ldt,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(out) :: alpha(*),beta(*),work(*)
+                    complex(dp), intent(inout) :: h(ldh,*),q(ldq,*),t(ldt,*),z(ldz,*)
+               end subroutine zhgeqz
+#else 
+               module procedure stdlib_zhgeqz
+#endif
+          end interface hgeqz
+
+
+
+          !> HPCON: estimates the reciprocal of the condition number of a complex
+          !> Hermitian packed matrix A using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHPTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface hpcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chpcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chpcon
+#else 
+               module procedure stdlib_chpcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhpcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhpcon
+#else 
+               module procedure stdlib_zhpcon
+#endif
+          end interface hpcon
+
+
+
+          !> HPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+          !> complex Hermitian matrix in packed storage.
+          interface hpev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chpev( jobz, uplo, n, ap, w, z, ldz, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chpev
+#else 
+               module procedure stdlib_chpev
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhpev( jobz, uplo, n, ap, w, z, ldz, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhpev
+#else 
+               module procedure stdlib_zhpev
+#endif
+          end interface hpev
+
+
+
+          !> HPEVD: computes all the eigenvalues and, optionally, eigenvectors of
+          !> a complex Hermitian matrix A in packed storage.  If eigenvectors are
+          !> desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface hpevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,rwork, lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chpevd
+#else 
+               module procedure stdlib_chpevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,rwork, lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhpevd
+#else 
+               module procedure stdlib_zhpevd
+#endif
+          end interface hpevd
+
+
+
+          !> HPGST: reduces a complex Hermitian-definite generalized
+          !> eigenproblem to standard form, using packed storage.
+          !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+          !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+          !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+          !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+          !> B must have been previously factorized as U**H*U or L*L**H by CPPTRF.
+          interface hpgst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chpgst( itype, uplo, n, ap, bp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,n
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(in) :: bp(*)
+               end subroutine chpgst
+#else 
+               module procedure stdlib_chpgst
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpgst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhpgst( itype, uplo, n, ap, bp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,n
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(in) :: bp(*)
+               end subroutine zhpgst
+#else 
+               module procedure stdlib_zhpgst
+#endif
+          end interface hpgst
+
+
+
+          !> HPGV: computes all the eigenvalues and, optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+          !> Here A and B are assumed to be Hermitian, stored in packed format,
+          !> and B is also positive definite.
+          interface hpgv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chpgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,ldz,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ap(*),bp(*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chpgv
+#else 
+               module procedure stdlib_chpgv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpgv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhpgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,ldz,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ap(*),bp(*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhpgv
+#else 
+               module procedure stdlib_zhpgv
+#endif
+          end interface hpgv
+
+
+
+          !> HPGVD: computes all the eigenvalues and, optionally, the eigenvectors
+          !> of a complex generalized Hermitian-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+          !> B are assumed to be Hermitian, stored in packed format, and B is also
+          !> positive definite.
+          !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface hpgvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chpgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,ldz,liwork,lrwork,lwork,n
+                    real(sp), intent(out) :: rwork(*),w(*)
+                    complex(sp), intent(inout) :: ap(*),bp(*)
+                    complex(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine chpgvd
+#else 
+               module procedure stdlib_chpgvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpgvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhpgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, rwork, &
+                         lrwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,ldz,liwork,lrwork,lwork,n
+                    real(dp), intent(out) :: rwork(*),w(*)
+                    complex(dp), intent(inout) :: ap(*),bp(*)
+                    complex(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine zhpgvd
+#else 
+               module procedure stdlib_zhpgvd
+#endif
+          end interface hpgvd
+
+
+
+          !> HPRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian indefinite
+          !> and packed, and provides error bounds and backward error estimates
+          !> for the solution.
+          interface hprfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine chprfs
+#else 
+               module procedure stdlib_chprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_whprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zhprfs
+#else 
+               module procedure stdlib_zhprfs
+#endif
+          end interface hprfs
+
+
+
+          !> HPSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian matrix stored in packed format and X
+          !> and B are N-by-NRHS matrices.
+          !> The diagonal pivoting method is used to factor A as
+          !> A = U * D * U**H,  if UPLO = 'U', or
+          !> A = L * D * L**H,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, D is Hermitian and block diagonal with 1-by-1
+          !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+          !> solve the system of equations A * X = B.
+          interface hpsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chpsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine chpsv
+#else 
+               module procedure stdlib_chpsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_whpsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhpsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine zhpsv
+#else 
+               module procedure stdlib_zhpsv
+#endif
+          end interface hpsv
+
+
+
+          !> HPTRD: reduces a complex Hermitian matrix A stored in packed form to
+          !> real symmetric tridiagonal form T by a unitary similarity
+          !> transformation: Q**H * A * Q = T.
+          interface hptrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chptrd( uplo, n, ap, d, e, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(out) :: tau(*)
+               end subroutine chptrd
+#else 
+               module procedure stdlib_chptrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_whptrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhptrd( uplo, n, ap, d, e, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(out) :: tau(*)
+               end subroutine zhptrd
+#else 
+               module procedure stdlib_zhptrd
+#endif
+          end interface hptrd
+
+
+
+          !> HPTRF: computes the factorization of a complex Hermitian packed
+          !> matrix A using the Bunch-Kaufman diagonal pivoting method:
+          !> A = U*D*U**H  or  A = L*D*L**H
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is Hermitian and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          interface hptrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine chptrf
+#else 
+               module procedure stdlib_chptrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_whptrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zhptrf
+#else 
+               module procedure stdlib_zhptrf
+#endif
+          end interface hptrf
+
+
+
+          !> HPTRI: computes the inverse of a complex Hermitian indefinite matrix
+          !> A in packed storage using the factorization A = U*D*U**H or
+          !> A = L*D*L**H computed by CHPTRF.
+          interface hptri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chptri
+#else 
+               module procedure stdlib_chptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_whptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhptri
+#else 
+               module procedure stdlib_zhptri
+#endif
+          end interface hptri
+
+
+
+          !> HPTRS: solves a system of linear equations A*X = B with a complex
+          !> Hermitian matrix A stored in packed format using the factorization
+          !> A = U*D*U**H or A = L*D*L**H computed by CHPTRF.
+          interface hptrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine chptrs
+#else 
+               module procedure stdlib_chptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_whptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zhptrs
+#else 
+               module procedure stdlib_zhptrs
+#endif
+          end interface hptrs
+
+
+
+          !> HSEIN: uses inverse iteration to find specified right and/or left
+          !> eigenvectors of a complex upper Hessenberg matrix H.
+          !> The right eigenvector x and the left eigenvector y of the matrix H
+          !> corresponding to an eigenvalue w are defined by:
+          !> H * x = w * x,     y**h * H = w * y**h
+          !> where y**h denotes the conjugate transpose of the vector y.
+          interface hsein
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine chsein( side, eigsrc, initv, select, n, h, ldh, w, vl,ldvl, vr, ldvr, &
+                         mm, m, work, rwork, ifaill,ifailr, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: eigsrc,initv,side
+                    integer(ilp), intent(out) :: info,m,ifaill(*),ifailr(*)
+                    integer(ilp), intent(in) :: ldh,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(in) :: h(ldh,*)
+                    complex(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*),w(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine chsein
+#else 
+               module procedure stdlib_chsein
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dhsein( side, eigsrc, initv, select, n, h, ldh, wr, wi,vl, ldvl, vr, &
+                         ldvr, mm, m, work, ifaill,ifailr, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: eigsrc,initv,side
+                    integer(ilp), intent(out) :: info,m,ifaill(*),ifailr(*)
+                    integer(ilp), intent(in) :: ldh,ldvl,ldvr,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(dp), intent(in) :: h(ldh,*),wi(*)
+                    real(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*),wr(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dhsein
+#else 
+               module procedure stdlib_dhsein
+#endif
+#:if WITH_QP
+               module procedure stdlib_qhsein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine shsein( side, eigsrc, initv, select, n, h, ldh, wr, wi,vl, ldvl, vr, &
+                         ldvr, mm, m, work, ifaill,ifailr, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: eigsrc,initv,side
+                    integer(ilp), intent(out) :: info,m,ifaill(*),ifailr(*)
+                    integer(ilp), intent(in) :: ldh,ldvl,ldvr,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(sp), intent(in) :: h(ldh,*),wi(*)
+                    real(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*),wr(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine shsein
+#else 
+               module procedure stdlib_shsein
+#endif
+#:if WITH_QP
+               module procedure stdlib_whsein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zhsein( side, eigsrc, initv, select, n, h, ldh, w, vl,ldvl, vr, ldvr, &
+                         mm, m, work, rwork, ifaill,ifailr, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: eigsrc,initv,side
+                    integer(ilp), intent(out) :: info,m,ifaill(*),ifailr(*)
+                    integer(ilp), intent(in) :: ldh,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(in) :: h(ldh,*)
+                    complex(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*),w(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zhsein
+#else 
+               module procedure stdlib_zhsein
+#endif
+          end interface hsein
+
+
+
+          !> HSEQR: computes the eigenvalues of a Hessenberg matrix H
+          !> and, optionally, the matrices T and Z from the Schur decomposition
+          !> H = Z T Z**H, where T is an upper triangular matrix (the
+          !> Schur form), and Z is the unitary matrix of Schur vectors.
+          !> Optionally Z may be postmultiplied into an input unitary
+          !> matrix Q so that this routine can give the Schur factorization
+          !> of a matrix A which has been reduced to the Hessenberg form H
+          !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*T*(QZ)**H.
+          interface hseqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine chseqr( job, compz, n, ilo, ihi, h, ldh, w, z, ldz,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    character, intent(in) :: compz,job
+                    complex(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(sp), intent(out) :: w(*),work(*)
+               end subroutine chseqr
+#else 
+               module procedure stdlib_chseqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dhseqr( job, compz, n, ilo, ihi, h, ldh, wr, wi, z,ldz, work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    character, intent(in) :: compz,job
+                    real(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(dp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine dhseqr
+#else 
+               module procedure stdlib_dhseqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qhseqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine shseqr( job, compz, n, ilo, ihi, h, ldh, wr, wi, z,ldz, work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    character, intent(in) :: compz,job
+                    real(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(sp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine shseqr
+#else 
+               module procedure stdlib_shseqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_whseqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zhseqr( job, compz, n, ilo, ihi, h, ldh, w, z, ldz,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    character, intent(in) :: compz,job
+                    complex(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(dp), intent(out) :: w(*),work(*)
+               end subroutine zhseqr
+#else 
+               module procedure stdlib_zhseqr
+#endif
+          end interface hseqr
+
+
+
+          !> ISNAN: returns .TRUE. if its argument is NaN, and .FALSE.
+          !> otherwise.  To be replaced by the Fortran 2003 intrinsic in the
+          !> future.
+          interface isnan
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure logical(lk) function disnan( din )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: din
+               end function disnan
+#else 
+               module procedure stdlib_disnan
+#endif
+#:if WITH_QP
+               module procedure stdlib_qisnan
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure logical(lk) function sisnan( sin )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: sin
+               end function sisnan
+#else 
+               module procedure stdlib_sisnan
+#endif
+          end interface isnan
+
+
+
+          !> LA_GBAMV:  performs one of the matrix-vector operations
+          !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+          !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> m by n matrix.
+          !> This function is primarily used in calculating error bounds.
+          !> To protect against underflow during evaluation, components in
+          !> the resulting vector are perturbed away from zero by (N+1)
+          !> times the underflow threshold.  To prevent unnecessarily large
+          !> errors for block-structure embedded in general matrices,
+          !> "symbolically" zero components are not perturbed.  A zero
+          !> entry is considered "symbolic" if all multiplications involved
+          !> in computing that entry have at least one zero multiplicand.
+          interface la_gbamv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,ldab,m,n,kl,ku,trans
+                    complex(sp), intent(in) :: ab(ldab,*),x(*)
+                    real(sp), intent(inout) :: y(*)
+               end subroutine cla_gbamv
+#else 
+               module procedure stdlib_cla_gbamv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,ab(ldab,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,ldab,m,n,kl,ku,trans
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dla_gbamv
+#else 
+               module procedure stdlib_dla_gbamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_gbamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,ab(ldab,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,ldab,m,n,kl,ku,trans
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sla_gbamv
+#else 
+               module procedure stdlib_sla_gbamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_gbamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,ldab,m,n,kl,ku,trans
+                    complex(dp), intent(in) :: ab(ldab,*),x(*)
+                    real(dp), intent(inout) :: y(*)
+               end subroutine zla_gbamv
+#else 
+               module procedure stdlib_zla_gbamv
+#endif
+          end interface la_gbamv
+
+
+
+          !> LA_GBRCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+          !> where op2 is determined by CMODE as follows
+          !> CMODE =  1    op2(C) = C
+          !> CMODE =  0    op2(C) = I
+          !> CMODE = -1    op2(C) = inv(C)
+          !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+          !> is computed by computing scaling factors R such that
+          !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+          !> infinity-norm condition number.
+          interface la_gbrcond
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_gbrcond( trans, n, kl, ku, ab, ldab,afb, ldafb, ipiv, cmode, &
+                         c,info, work, iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: n,ldab,ldafb,kl,ku,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: ab(ldab,*),afb(ldafb,*),c(*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_gbrcond
+#else 
+               module procedure stdlib_dla_gbrcond
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_gbrcond
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_gbrcond( trans, n, kl, ku, ab, ldab, afb, ldafb,ipiv, cmode, &
+                         c, info, work, iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: n,ldab,ldafb,kl,ku,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: ab(ldab,*),afb(ldafb,*),c(*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_gbrcond
+#else 
+               module procedure stdlib_sla_gbrcond
+#endif
+          end interface la_gbrcond
+
+
+
+          !> LA_GBRCOND_C: Computes the infinity norm condition number of
+          !> op(A) * inv(diag(C)) where C is a REAL vector.
+          interface la_gbrcond_c
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_gbrcond_c( trans, n, kl, ku, ab, ldab, afb,ldafb, ipiv, c, &
+                         capply, info, work,rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,kl,ku,ldab,ldafb,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+                    complex(sp), intent(out) :: work(*)
+                    real(sp), intent(in) :: c(*)
+                    real(sp), intent(out) :: rwork(*)
+               end function cla_gbrcond_c
+#else 
+               module procedure stdlib_cla_gbrcond_c
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_gbrcond_c
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_gbrcond_c( trans, n, kl, ku, ab,ldab, afb, ldafb, ipiv,c, &
+                         capply, info, work,rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,kl,ku,ldab,ldafb,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+                    complex(dp), intent(out) :: work(*)
+                    real(dp), intent(in) :: c(*)
+                    real(dp), intent(out) :: rwork(*)
+               end function zla_gbrcond_c
+#else 
+               module procedure stdlib_zla_gbrcond_c
+#endif
+          end interface la_gbrcond_c
+
+
+
+          !> LA_GBRPVGRW: computes the reciprocal pivot growth factor
+          !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+          !> much less than 1, the stability of the LU factorization of the
+          !> (equilibrated) matrix A could be poor. This also means that the
+          !> solution X, estimated condition numbers, and error bounds could be
+          !> unreliable.
+          interface la_gbrpvgrw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp) function cla_gbrpvgrw( n, kl, ku, ncols, ab, ldab, afb,ldafb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,kl,ku,ncols,ldab,ldafb
+                    complex(sp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+               end function cla_gbrpvgrw
+#else 
+               module procedure stdlib_cla_gbrpvgrw
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function dla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,kl,ku,ncols,ldab,ldafb
+                    real(dp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+               end function dla_gbrpvgrw
+#else 
+               module procedure stdlib_dla_gbrpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_gbrpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp) function sla_gbrpvgrw( n, kl, ku, ncols, ab, ldab, afb,ldafb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,kl,ku,ncols,ldab,ldafb
+                    real(sp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+               end function sla_gbrpvgrw
+#else 
+               module procedure stdlib_sla_gbrpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_gbrpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function zla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,kl,ku,ncols,ldab,ldafb
+                    complex(dp), intent(in) :: ab(ldab,*),afb(ldafb,*)
+               end function zla_gbrpvgrw
+#else 
+               module procedure stdlib_zla_gbrpvgrw
+#endif
+          end interface la_gbrpvgrw
+
+
+
+          !> LA_GEAMV:  performs one of the matrix-vector operations
+          !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+          !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> m by n matrix.
+          !> This function is primarily used in calculating error bounds.
+          !> To protect against underflow during evaluation, components in
+          !> the resulting vector are perturbed away from zero by (N+1)
+          !> times the underflow threshold.  To prevent unnecessarily large
+          !> errors for block-structure embedded in general matrices,
+          !> "symbolically" zero components are not perturbed.  A zero
+          !> entry is considered "symbolic" if all multiplications involved
+          !> in computing that entry have at least one zero multiplicand.
+          interface la_geamv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n,trans
+                    complex(sp), intent(in) :: a(lda,*),x(*)
+                    real(sp), intent(inout) :: y(*)
+               end subroutine cla_geamv
+#else 
+               module procedure stdlib_cla_geamv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dla_geamv ( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n,trans
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dla_geamv
+#else 
+               module procedure stdlib_dla_geamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_geamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n,trans
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sla_geamv
+#else 
+               module procedure stdlib_sla_geamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_geamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,m,n,trans
+                    complex(dp), intent(in) :: a(lda,*),x(*)
+                    real(dp), intent(inout) :: y(*)
+               end subroutine zla_geamv
+#else 
+               module procedure stdlib_zla_geamv
+#endif
+          end interface la_geamv
+
+
+
+          !> LA_GERCOND: estimates the Skeel condition number of op(A) * op2(C)
+          !> where op2 is determined by CMODE as follows
+          !> CMODE =  1    op2(C) = C
+          !> CMODE =  0    op2(C) = I
+          !> CMODE = -1    op2(C) = inv(C)
+          !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+          !> is computed by computing scaling factors R such that
+          !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+          !> infinity-norm condition number.
+          interface la_gercond
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_gercond( trans, n, a, lda, af,ldaf, ipiv, cmode, c,info, &
+                         work, iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_gercond
+#else 
+               module procedure stdlib_dla_gercond
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_gercond
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_gercond( trans, n, a, lda, af, ldaf, ipiv,cmode, c, info, &
+                         work, iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_gercond
+#else 
+               module procedure stdlib_sla_gercond
+#endif
+          end interface la_gercond
+
+
+
+          !> LA_GERCOND_C: computes the infinity norm condition number of
+          !> op(A) * inv(diag(C)) where C is a REAL vector.
+          interface la_gercond_c
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_gercond_c( trans, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(sp), intent(out) :: work(*)
+                    real(sp), intent(in) :: c(*)
+                    real(sp), intent(out) :: rwork(*)
+               end function cla_gercond_c
+#else 
+               module procedure stdlib_cla_gercond_c
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_gercond_c
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_gercond_c( trans, n, a, lda, af,ldaf, ipiv, c, capply,info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(dp), intent(out) :: work(*)
+                    real(dp), intent(in) :: c(*)
+                    real(dp), intent(out) :: rwork(*)
+               end function zla_gercond_c
+#else 
+               module procedure stdlib_zla_gercond_c
+#endif
+          end interface la_gercond_c
+
+
+
+          !> LA_GERPVGRW: computes the reciprocal pivot growth factor
+          !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+          !> much less than 1, the stability of the LU factorization of the
+          !> (equilibrated) matrix A could be poor. This also means that the
+          !> solution X, estimated condition numbers, and error bounds could be
+          !> unreliable.
+          interface la_gerpvgrw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp) function cla_gerpvgrw( n, ncols, a, lda, af, ldaf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,ncols,lda,ldaf
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+               end function cla_gerpvgrw
+#else 
+               module procedure stdlib_cla_gerpvgrw
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function dla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,ncols,lda,ldaf
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*)
+               end function dla_gerpvgrw
+#else 
+               module procedure stdlib_dla_gerpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_gerpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp) function sla_gerpvgrw( n, ncols, a, lda, af, ldaf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,ncols,lda,ldaf
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*)
+               end function sla_gerpvgrw
+#else 
+               module procedure stdlib_sla_gerpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_gerpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function zla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,ncols,lda,ldaf
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+               end function zla_gerpvgrw
+#else 
+               module procedure stdlib_zla_gerpvgrw
+#endif
+          end interface la_gerpvgrw
+
+
+
+          !> CLA_SYAMV  performs the matrix-vector operation
+          !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> n by n symmetric matrix.
+          !> This function is primarily used in calculating error bounds.
+          !> To protect against underflow during evaluation, components in
+          !> the resulting vector are perturbed away from zero by (N+1)
+          !> times the underflow threshold.  To prevent unnecessarily large
+          !> errors for block-structure embedded in general matrices,
+          !> "symbolically" zero components are not perturbed.  A zero
+          !> entry is considered "symbolic" if all multiplications involved
+          !> in computing that entry have at least one zero multiplicand.
+          interface la_heamv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cla_heamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    complex(sp), intent(in) :: a(lda,*),x(*)
+                    real(sp), intent(inout) :: y(*)
+               end subroutine cla_heamv
+#else 
+               module procedure stdlib_cla_heamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_heamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zla_heamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    complex(dp), intent(in) :: a(lda,*),x(*)
+                    real(dp), intent(inout) :: y(*)
+               end subroutine zla_heamv
+#else 
+               module procedure stdlib_zla_heamv
+#endif
+          end interface la_heamv
+
+
+
+          !> LA_HERCOND_C: computes the infinity norm condition number of
+          !> op(A) * inv(diag(C)) where C is a REAL vector.
+          interface la_hercond_c
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_hercond_c( uplo, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(sp), intent(out) :: work(*)
+                    real(sp), intent(in) :: c(*)
+                    real(sp), intent(out) :: rwork(*)
+               end function cla_hercond_c
+#else 
+               module procedure stdlib_cla_hercond_c
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_hercond_c
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_hercond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(dp), intent(out) :: work(*)
+                    real(dp), intent(in) :: c(*)
+                    real(dp), intent(out) :: rwork(*)
+               end function zla_hercond_c
+#else 
+               module procedure stdlib_zla_hercond_c
+#endif
+          end interface la_hercond_c
+
+
+
+          !> LA_HERPVGRW: computes the reciprocal pivot growth factor
+          !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+          !> much less than 1, the stability of the LU factorization of the
+          !> (equilibrated) matrix A could be poor. This also means that the
+          !> solution X, estimated condition numbers, and error bounds could be
+          !> unreliable.
+          interface la_herpvgrw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_herpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(sp), intent(out) :: work(*)
+               end function cla_herpvgrw
+#else 
+               module procedure stdlib_cla_herpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_herpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_herpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(dp), intent(out) :: work(*)
+               end function zla_herpvgrw
+#else 
+               module procedure stdlib_zla_herpvgrw
+#endif
+          end interface la_herpvgrw
+
+
+
+          !> LA_LIN_BERR: computes componentwise relative backward error from
+          !> the formula
+          !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+          !> where abs(Z) is the componentwise absolute value of the matrix
+          !> or vector Z.
+          interface la_lin_berr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cla_lin_berr( n, nz, nrhs, res, ayb, berr )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,nz,nrhs
+                    real(sp), intent(in) :: ayb(n,nrhs)
+                    real(sp), intent(out) :: berr(nrhs)
+                    complex(sp), intent(in) :: res(n,nrhs)
+               end subroutine cla_lin_berr
+#else 
+               module procedure stdlib_cla_lin_berr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dla_lin_berr ( n, nz, nrhs, res, ayb, berr )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,nz,nrhs
+                    real(dp), intent(in) :: ayb(n,nrhs),res(n,nrhs)
+                    real(dp), intent(out) :: berr(nrhs)
+               end subroutine dla_lin_berr
+#else 
+               module procedure stdlib_dla_lin_berr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_lin_berr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sla_lin_berr( n, nz, nrhs, res, ayb, berr )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,nz,nrhs
+                    real(sp), intent(in) :: ayb(n,nrhs),res(n,nrhs)
+                    real(sp), intent(out) :: berr(nrhs)
+               end subroutine sla_lin_berr
+#else 
+               module procedure stdlib_sla_lin_berr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_lin_berr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zla_lin_berr( n, nz, nrhs, res, ayb, berr )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,nz,nrhs
+                    real(dp), intent(in) :: ayb(n,nrhs)
+                    real(dp), intent(out) :: berr(nrhs)
+                    complex(dp), intent(in) :: res(n,nrhs)
+               end subroutine zla_lin_berr
+#else 
+               module procedure stdlib_zla_lin_berr
+#endif
+          end interface la_lin_berr
+
+
+
+          !> LA_PORCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+          !> where op2 is determined by CMODE as follows
+          !> CMODE =  1    op2(C) = C
+          !> CMODE =  0    op2(C) = I
+          !> CMODE = -1    op2(C) = inv(C)
+          !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+          !> is computed by computing scaling factors R such that
+          !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+          !> infinity-norm condition number.
+          interface la_porcond
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_porcond( uplo, n, a, lda, af, ldaf,cmode, c, info, work,&
+                         iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_porcond
+#else 
+               module procedure stdlib_dla_porcond
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_porcond
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_porcond( uplo, n, a, lda, af, ldaf, cmode, c,info, work, &
+                         iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_porcond
+#else 
+               module procedure stdlib_sla_porcond
+#endif
+          end interface la_porcond
+
+
+
+          !> LA_PORCOND_C: Computes the infinity norm condition number of
+          !> op(A) * inv(diag(C)) where C is a REAL vector
+          interface la_porcond_c
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_porcond_c( uplo, n, a, lda, af, ldaf, c, capply,info, work, &
+                         rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(sp), intent(out) :: work(*)
+                    real(sp), intent(in) :: c(*)
+                    real(sp), intent(out) :: rwork(*)
+               end function cla_porcond_c
+#else 
+               module procedure stdlib_cla_porcond_c
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_porcond_c
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_porcond_c( uplo, n, a, lda, af,ldaf, c, capply, info,work, &
+                         rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(dp), intent(out) :: work(*)
+                    real(dp), intent(in) :: c(*)
+                    real(dp), intent(out) :: rwork(*)
+               end function zla_porcond_c
+#else 
+               module procedure stdlib_zla_porcond_c
+#endif
+          end interface la_porcond_c
+
+
+
+          !> LA_PORPVGRW: computes the reciprocal pivot growth factor
+          !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+          !> much less than 1, the stability of the LU factorization of the
+          !> (equilibrated) matrix A could be poor. This also means that the
+          !> solution X, estimated condition numbers, and error bounds could be
+          !> unreliable.
+          interface la_porpvgrw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_porpvgrw( uplo, ncols, a, lda, af, ldaf, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: ncols,lda,ldaf
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(sp), intent(out) :: work(*)
+               end function cla_porpvgrw
+#else 
+               module procedure stdlib_cla_porpvgrw
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: ncols,lda,ldaf
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_porpvgrw
+#else 
+               module procedure stdlib_dla_porpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_porpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_porpvgrw( uplo, ncols, a, lda, af, ldaf, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: ncols,lda,ldaf
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_porpvgrw
+#else 
+               module procedure stdlib_sla_porpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_porpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: ncols,lda,ldaf
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(dp), intent(out) :: work(*)
+               end function zla_porpvgrw
+#else 
+               module procedure stdlib_zla_porpvgrw
+#endif
+          end interface la_porpvgrw
+
+
+
+          !> LA_SYAMV:  performs the matrix-vector operation
+          !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+          !> where alpha and beta are scalars, x and y are vectors and A is an
+          !> n by n symmetric matrix.
+          !> This function is primarily used in calculating error bounds.
+          !> To protect against underflow during evaluation, components in
+          !> the resulting vector are perturbed away from zero by (N+1)
+          !> times the underflow threshold.  To prevent unnecessarily large
+          !> errors for block-structure embedded in general matrices,
+          !> "symbolically" zero components are not perturbed.  A zero
+          !> entry is considered "symbolic" if all multiplications involved
+          !> in computing that entry have at least one zero multiplicand.
+          interface la_syamv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    complex(sp), intent(in) :: a(lda,*),x(*)
+                    real(sp), intent(inout) :: y(*)
+               end subroutine cla_syamv
+#else 
+               module procedure stdlib_cla_syamv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    real(dp), intent(inout) :: y(*)
+               end subroutine dla_syamv
+#else 
+               module procedure stdlib_dla_syamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_syamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    real(sp), intent(inout) :: y(*)
+               end subroutine sla_syamv
+#else 
+               module procedure stdlib_sla_syamv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_syamv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta
+                    integer(ilp), intent(in) :: incx,incy,lda,n,uplo
+                    complex(dp), intent(in) :: a(lda,*),x(*)
+                    real(dp), intent(inout) :: y(*)
+               end subroutine zla_syamv
+#else 
+               module procedure stdlib_zla_syamv
+#endif
+          end interface la_syamv
+
+
+
+          !> LA_SYRCOND: estimates the Skeel condition number of  op(A) * op2(C)
+          !> where op2 is determined by CMODE as follows
+          !> CMODE =  1    op2(C) = C
+          !> CMODE =  0    op2(C) = I
+          !> CMODE = -1    op2(C) = inv(C)
+          !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+          !> is computed by computing scaling factors R such that
+          !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+          !> infinity-norm condition number.
+          interface la_syrcond
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_syrcond( uplo, n, a, lda, af, ldaf,ipiv, cmode, c, info, &
+                         work,iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_syrcond
+#else 
+               module procedure stdlib_dla_syrcond
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_syrcond
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_syrcond( uplo, n, a, lda, af, ldaf, ipiv, cmode,c, info, &
+                         work, iwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,ldaf,cmode,ipiv(*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),c(*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_syrcond
+#else 
+               module procedure stdlib_sla_syrcond
+#endif
+          end interface la_syrcond
+
+
+
+          !> LA_SYRCOND_C: Computes the infinity norm condition number of
+          !> op(A) * inv(diag(C)) where C is a REAL vector.
+          interface la_syrcond_c
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_syrcond_c( uplo, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(sp), intent(out) :: work(*)
+                    real(sp), intent(in) :: c(*)
+                    real(sp), intent(out) :: rwork(*)
+               end function cla_syrcond_c
+#else 
+               module procedure stdlib_cla_syrcond_c
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_syrcond_c
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_syrcond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, &
+                         work, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: capply
+                    integer(ilp), intent(in) :: n,lda,ldaf,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    complex(dp), intent(out) :: work(*)
+                    real(dp), intent(in) :: c(*)
+                    real(dp), intent(out) :: rwork(*)
+               end function zla_syrcond_c
+#else 
+               module procedure stdlib_zla_syrcond_c
+#endif
+          end interface la_syrcond_c
+
+
+
+          !> LA_SYRPVGRW: computes the reciprocal pivot growth factor
+          !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+          !> much less than 1, the stability of the LU factorization of the
+          !> (equilibrated) matrix A could be poor. This also means that the
+          !> solution X, estimated condition numbers, and error bounds could be
+          !> unreliable.
+          interface la_syrpvgrw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function cla_syrpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(sp), intent(out) :: work(*)
+               end function cla_syrpvgrw
+#else 
+               module procedure stdlib_cla_syrpvgrw
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(dp), intent(out) :: work(*)
+               end function dla_syrpvgrw
+#else 
+               module procedure stdlib_dla_syrpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_syrpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function sla_syrpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(sp), intent(out) :: work(*)
+               end function sla_syrpvgrw
+#else 
+               module procedure stdlib_sla_syrpvgrw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_syrpvgrw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,info,lda,ldaf,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*)
+                    real(dp), intent(out) :: work(*)
+               end function zla_syrpvgrw
+#else 
+               module procedure stdlib_zla_syrpvgrw
+#endif
+          end interface la_syrpvgrw
+
+
+
+          !> LA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+          !> This works for all extant IBM's hex and binary floating point
+          !> arithmetic, but not for decimal.
+          interface la_wwaddw
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cla_wwaddw( n, x, y, w )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: x(*),y(*)
+                    complex(sp), intent(in) :: w(*)
+               end subroutine cla_wwaddw
+#else 
+               module procedure stdlib_cla_wwaddw
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dla_wwaddw( n, x, y, w )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: x(*),y(*)
+                    real(dp), intent(in) :: w(*)
+               end subroutine dla_wwaddw
+#else 
+               module procedure stdlib_dla_wwaddw
+#endif
+#:if WITH_QP
+               module procedure stdlib_qla_wwaddw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sla_wwaddw( n, x, y, w )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: x(*),y(*)
+                    real(sp), intent(in) :: w(*)
+               end subroutine sla_wwaddw
+#else 
+               module procedure stdlib_sla_wwaddw
+#endif
+#:if WITH_QP
+               module procedure stdlib_wla_wwaddw
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zla_wwaddw( n, x, y, w )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: x(*),y(*)
+                    complex(dp), intent(in) :: w(*)
+               end subroutine zla_wwaddw
+#else 
+               module procedure stdlib_zla_wwaddw
+#endif
+          end interface la_wwaddw
+
+
+
+          !> LABAD: takes as input the values computed by DLAMCH for underflow and
+          !> overflow, and returns the square root of each of these values if the
+          !> log of LARGE is sufficiently large.  This subroutine is intended to
+          !> identify machines with a large exponent range, such as the Crays, and
+          !> redefine the underflow and overflow limits to be the square roots of
+          !> the values computed by DLAMCH.  This subroutine is needed because
+          !> DLAMCH does not compensate for poor arithmetic in the upper half of
+          !> the exponent range, as is found on a Cray.
+          interface labad
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlabad( small, large )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(inout) :: large,small
+               end subroutine dlabad
+#else 
+               module procedure stdlib_dlabad
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlabad
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slabad( small, large )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(inout) :: large,small
+               end subroutine slabad
+#else 
+               module procedure stdlib_slabad
+#endif
+          end interface labad
+
+
+
+          !> LABRD: reduces the first NB rows and columns of a complex general
+          !> m by n matrix A to upper or lower real bidiagonal form by a unitary
+          !> transformation Q**H * A * P, and returns the matrices X and Y which
+          !> are needed to apply the transformation to the unreduced part of A.
+          !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+          !> bidiagonal form.
+          !> This is an auxiliary routine called by CGEBRD
+          interface labrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldx,ldy,m,n,nb
+                    real(sp), intent(out) :: d(*),e(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: taup(*),tauq(*),x(ldx,*),y(ldy,*)
+               end subroutine clabrd
+#else 
+               module procedure stdlib_clabrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldx,ldy,m,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*),e(*),taup(*),tauq(*),x(ldx,*),y(ldy,*)
+               end subroutine dlabrd
+#else 
+               module procedure stdlib_dlabrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlabrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldx,ldy,m,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*),e(*),taup(*),tauq(*),x(ldx,*),y(ldy,*)
+               end subroutine slabrd
+#else 
+               module procedure stdlib_slabrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlabrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldx,ldy,m,n,nb
+                    real(dp), intent(out) :: d(*),e(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: taup(*),tauq(*),x(ldx,*),y(ldy,*)
+               end subroutine zlabrd
+#else 
+               module procedure stdlib_zlabrd
+#endif
+          end interface labrd
+
+
+
+          !> LACGV: conjugates a complex vector of length N.
+          interface lacgv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clacgv( n, x, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine clacgv
+#else 
+               module procedure stdlib_clacgv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlacgv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlacgv( n, x, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine zlacgv
+#else 
+               module procedure stdlib_zlacgv
+#endif
+          end interface lacgv
+
+
+
+          !> LACON: estimates the 1-norm of a square, complex matrix A.
+          !> Reverse communication is used for evaluating matrix-vector products.
+          interface lacon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine clacon( n, v, x, est, kase )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(inout) :: kase
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: est
+                    complex(sp), intent(out) :: v(n)
+                    complex(sp), intent(inout) :: x(n)
+               end subroutine clacon
+#else 
+               module procedure stdlib_clacon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlacon( n, v, x, isgn, est, kase )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(inout) :: kase
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: est,x(*)
+                    integer(ilp), intent(out) :: isgn(*)
+                    real(dp), intent(out) :: v(*)
+               end subroutine dlacon
+#else 
+               module procedure stdlib_dlacon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlacon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slacon( n, v, x, isgn, est, kase )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(inout) :: kase
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: est,x(*)
+                    integer(ilp), intent(out) :: isgn(*)
+                    real(sp), intent(out) :: v(*)
+               end subroutine slacon
+#else 
+               module procedure stdlib_slacon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlacon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zlacon( n, v, x, est, kase )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(inout) :: kase
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: est
+                    complex(dp), intent(out) :: v(n)
+                    complex(dp), intent(inout) :: x(n)
+               end subroutine zlacon
+#else 
+               module procedure stdlib_zlacon
+#endif
+          end interface lacon
+
+
+
+          !> LACPY: copies all or part of a two-dimensional matrix A to another
+          !> matrix B.
+          interface lacpy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clacpy( uplo, m, n, a, lda, b, ldb )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: b(ldb,*)
+               end subroutine clacpy
+#else 
+               module procedure stdlib_clacpy
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlacpy( uplo, m, n, a, lda, b, ldb )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: b(ldb,*)
+               end subroutine dlacpy
+#else 
+               module procedure stdlib_dlacpy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlacpy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slacpy( uplo, m, n, a, lda, b, ldb )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: b(ldb,*)
+               end subroutine slacpy
+#else 
+               module procedure stdlib_slacpy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlacpy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlacpy( uplo, m, n, a, lda, b, ldb )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldb,m,n
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: b(ldb,*)
+               end subroutine zlacpy
+#else 
+               module procedure stdlib_zlacpy
+#endif
+          end interface lacpy
+
+
+
+          !> LACRM: performs a very simple matrix-matrix multiplication:
+          !> C := A * B,
+          !> where A is M by N and complex; B is N by N and real;
+          !> C is M by N and complex.
+          interface lacrm
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clacrm( m, n, a, lda, b, ldb, c, ldc, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    real(sp), intent(in) :: b(ldb,*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: c(ldc,*)
+               end subroutine clacrm
+#else 
+               module procedure stdlib_clacrm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlacrm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlacrm( m, n, a, lda, b, ldb, c, ldc, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    real(dp), intent(in) :: b(ldb,*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: c(ldc,*)
+               end subroutine zlacrm
+#else 
+               module procedure stdlib_zlacrm
+#endif
+          end interface lacrm
+
+
+
+          !> LACRT: performs the operation
+          !> (  c  s )( x )  ==> ( x )
+          !> ( -s  c )( y )      ( y )
+          !> where c and s are complex and the vectors x and y are complex.
+          interface lacrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clacrt( n, cx, incx, cy, incy, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(in) :: c,s
+                    complex(sp), intent(inout) :: cx(*),cy(*)
+               end subroutine clacrt
+#else 
+               module procedure stdlib_clacrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlacrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlacrt( n, cx, incx, cy, incy, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(in) :: c,s
+                    complex(dp), intent(inout) :: cx(*),cy(*)
+               end subroutine zlacrt
+#else 
+               module procedure stdlib_zlacrt
+#endif
+          end interface lacrt
+
+
+
+          !> LADIV_F: := X / Y, where X and Y are complex.  The computation of X / Y
+          !> will not overflow on an intermediary step unless the results
+          !> overflows.
+          interface ladiv_f
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure complex(sp) function cladiv( x, y )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: x,y
+               end function cladiv
+#else 
+               module procedure stdlib_cladiv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wladiv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure complex(dp)     function zladiv( x, y )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: x,y
+               end function zladiv
+#else 
+               module procedure stdlib_zladiv
+#endif
+          end interface ladiv_f
+
+
+
+          !> LADIV_S: performs complex division in  real arithmetic
+          !> a + i*b
+          !> p + i*q = ---------
+          !> c + i*d
+          !> The algorithm is due to Michael Baudin and Robert L. Smith
+          !> and can be found in the paper
+          !> "A Robust Complex Division in Scilab"
+          interface ladiv_s
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dladiv( a, b, c, d, p, q )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: a,b,c,d
+                    real(dp), intent(out) :: p,q
+               end subroutine dladiv
+#else 
+               module procedure stdlib_dladiv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qladiv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sladiv( a, b, c, d, p, q )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: a,b,c,d
+                    real(sp), intent(out) :: p,q
+               end subroutine sladiv
+#else 
+               module procedure stdlib_sladiv
+#endif
+          end interface ladiv_s
+
+
+
+          interface ladiv1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dladiv1( a, b, c, d, p, q )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(inout) :: a
+                    real(dp), intent(in) :: b,c,d
+                    real(dp), intent(out) :: p,q
+               end subroutine dladiv1
+#else 
+               module procedure stdlib_dladiv1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qladiv1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sladiv1( a, b, c, d, p, q )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(inout) :: a
+                    real(sp), intent(in) :: b,c,d
+                    real(sp), intent(out) :: p,q
+               end subroutine sladiv1
+#else 
+               module procedure stdlib_sladiv1
+#endif
+          end interface ladiv1
+
+
+
+          interface ladiv2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function dladiv2( a, b, c, d, r, t )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: a,b,c,d,r,t
+               end function dladiv2
+#else 
+               module procedure stdlib_dladiv2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qladiv2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp) function sladiv2( a, b, c, d, r, t )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: a,b,c,d,r,t
+               end function sladiv2
+#else 
+               module procedure stdlib_sladiv2
+#endif
+          end interface ladiv2
+
+
+
+          !> LAEBZ: contains the iteration loops which compute and use the
+          !> function N(w), which is the count of eigenvalues of a symmetric
+          !> tridiagonal matrix T less than or equal to its argument  w.  It
+          !> performs a choice of two types of loops:
+          !> IJOB=1, followed by
+          !> IJOB=2: It takes as input a list of intervals and returns a list of
+          !> sufficiently small intervals whose union contains the same
+          !> eigenvalues as the union of the original intervals.
+          !> The input intervals are (AB(j,1),AB(j,2)], j=1,...,MINP.
+          !> The output interval (AB(j,1),AB(j,2)] will contain
+          !> eigenvalues NAB(j,1)+1,...,NAB(j,2), where 1 <= j <= MOUT.
+          !> IJOB=3: It performs a binary search in each input interval
+          !> (AB(j,1),AB(j,2)] for a point  w(j)  such that
+          !> N(w(j))=NVAL(j), and uses  C(j)  as the starting point of
+          !> the search.  If such a w(j) is found, then on output
+          !> AB(j,1)=AB(j,2)=w.  If no such w(j) is found, then on output
+          !> (AB(j,1),AB(j,2)] will be a small interval containing the
+          !> point where N(w) jumps through NVAL(j), unless that point
+          !> lies outside the initial interval.
+          !> Note that the intervals are in all cases half-open intervals,
+          !> i.e., of the form  (a,b] , which includes  b  but not  a .
+          !> To avoid underflow, the matrix should be scaled so that its largest
+          !> element is no greater than  overflow**(1/2) * underflow**(1/4)
+          !> in absolute value.  To assure the most accurate computation
+          !> of small eigenvalues, the matrix should be scaled to be
+          !> not much smaller than that, either.
+          !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+          !> Matrix", Report CS41, Computer Science Dept., Stanford
+          !> University, July 21, 1966
+          !> Note: the arguments are, in general, *not* checked for unreasonable
+          !> values.
+          interface laebz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaebz( ijob, nitmax, n, mmax, minp, nbmin, abstol,reltol, pivmin, &
+                         d, e, e2, nval, ab, c, mout,nab, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,minp,mmax,n,nbmin,nitmax
+                    integer(ilp), intent(out) :: info,mout,iwork(*)
+                    real(dp), intent(in) :: abstol,pivmin,reltol,d(*),e(*),e2(*)
+                    integer(ilp), intent(inout) :: nab(mmax,*),nval(*)
+                    real(dp), intent(inout) :: ab(mmax,*),c(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlaebz
+#else 
+               module procedure stdlib_dlaebz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaebz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaebz( ijob, nitmax, n, mmax, minp, nbmin, abstol,reltol, pivmin, &
+                         d, e, e2, nval, ab, c, mout,nab, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,minp,mmax,n,nbmin,nitmax
+                    integer(ilp), intent(out) :: info,mout,iwork(*)
+                    real(sp), intent(in) :: abstol,pivmin,reltol,d(*),e(*),e2(*)
+                    integer(ilp), intent(inout) :: nab(mmax,*),nval(*)
+                    real(sp), intent(inout) :: ab(mmax,*),c(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slaebz
+#else 
+               module procedure stdlib_slaebz
+#endif
+          end interface laebz
+
+
+
+          !> Using the divide and conquer method, LAED0: computes all eigenvalues
+          !> of a symmetric tridiagonal matrix which is one diagonal block of
+          !> those from reducing a dense or band Hermitian matrix and
+          !> corresponding eigenvectors of the dense or band matrix.
+          interface laed0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claed0( qsiz, n, d, e, q, ldq, qstore, ldqs, rwork,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldq,ldqs,n,qsiz
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: q(ldq,*)
+                    complex(sp), intent(out) :: qstore(ldqs,*)
+               end subroutine claed0
+#else 
+               module procedure stdlib_claed0
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed0( icompq, qsiz, n, d, e, q, ldq, qstore, ldqs,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldq,ldqs,n,qsiz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(inout) :: d(*),e(*),q(ldq,*)
+                    real(dp), intent(out) :: qstore(ldqs,*),work(*)
+               end subroutine dlaed0
+#else 
+               module procedure stdlib_dlaed0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed0( icompq, qsiz, n, d, e, q, ldq, qstore, ldqs,work, iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldq,ldqs,n,qsiz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(inout) :: d(*),e(*),q(ldq,*)
+                    real(sp), intent(out) :: qstore(ldqs,*),work(*)
+               end subroutine slaed0
+#else 
+               module procedure stdlib_slaed0
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaed0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaed0( qsiz, n, d, e, q, ldq, qstore, ldqs, rwork,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldq,ldqs,n,qsiz
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: q(ldq,*)
+                    complex(dp), intent(out) :: qstore(ldqs,*)
+               end subroutine zlaed0
+#else 
+               module procedure stdlib_zlaed0
+#endif
+          end interface laed0
+
+
+
+          !> LAED1: computes the updated eigensystem of a diagonal
+          !> matrix after modification by a rank-one symmetric matrix.  This
+          !> routine is used only for the eigenproblem which requires all
+          !> eigenvalues and eigenvectors of a tridiagonal matrix.  DLAED7 handles
+          !> the case in which eigenvalues only or eigenvalues and eigenvectors
+          !> of a full symmetric matrix (which was reduced to tridiagonal form)
+          !> are desired.
+          !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+          !> where Z = Q**T*u, u is a vector of length N with ones in the
+          !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+          !> The eigenvectors of the original matrix are stored in Q, and the
+          !> eigenvalues are in D.  The algorithm consists of three stages:
+          !> The first stage consists of deflating the size of the problem
+          !> when there are multiple eigenvalues or if there is a zero in
+          !> the Z vector.  For each such occurrence the dimension of the
+          !> secular equation problem is reduced by one.  This stage is
+          !> performed by the routine DLAED2.
+          !> The second stage consists of calculating the updated
+          !> eigenvalues. This is done by finding the roots of the secular
+          !> equation via the routine DLAED4 (as called by DLAED3).
+          !> This routine also calculates the eigenvectors of the current
+          !> problem.
+          !> The final stage consists of computing the updated eigenvectors
+          !> directly using the updated eigenvalues.  The eigenvectors for
+          !> the current problem are multiplied with the eigenvectors from
+          !> the overall problem.
+          interface laed1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed1( n, d, q, ldq, indxq, rho, cutpnt, work, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,ldq,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(inout) :: rho,d(*),q(ldq,*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlaed1
+#else 
+               module procedure stdlib_dlaed1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed1( n, d, q, ldq, indxq, rho, cutpnt, work, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,ldq,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(inout) :: rho,d(*),q(ldq,*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slaed1
+#else 
+               module procedure stdlib_slaed1
+#endif
+          end interface laed1
+
+
+
+          !> This subroutine computes the I-th updated eigenvalue of a symmetric
+          !> rank-one modification to a diagonal matrix whose elements are
+          !> given in the array d, and that
+          !> D(i) < D(j)  for  i < j
+          !> and that RHO > 0.  This is arranged by the calling routine, and is
+          !> no loss in generality.  The rank-one modified system is thus
+          !> diag( D )  +  RHO * Z * Z_transpose.
+          !> where we assume the Euclidean norm of Z is 1.
+          !> The method consists of approximating the rational functions in the
+          !> secular equation by simpler interpolating rational functions.
+          interface laed4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed4( n, i, d, z, delta, rho, dlam, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: dlam,delta(*)
+                    real(dp), intent(in) :: rho,d(*),z(*)
+               end subroutine dlaed4
+#else 
+               module procedure stdlib_dlaed4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed4( n, i, d, z, delta, rho, dlam, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: dlam,delta(*)
+                    real(sp), intent(in) :: rho,d(*),z(*)
+               end subroutine slaed4
+#else 
+               module procedure stdlib_slaed4
+#endif
+          end interface laed4
+
+
+
+          !> This subroutine computes the I-th eigenvalue of a symmetric rank-one
+          !> modification of a 2-by-2 diagonal matrix
+          !> diag( D )  +  RHO * Z * transpose(Z) .
+          !> The diagonal elements in the array D are assumed to satisfy
+          !> D(i) < D(j)  for  i < j .
+          !> We also assume RHO > 0 and that the Euclidean norm of the vector
+          !> Z is one.
+          interface laed5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed5( i, d, z, delta, rho, dlam )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i
+                    real(dp), intent(out) :: dlam,delta(2)
+                    real(dp), intent(in) :: rho,d(2),z(2)
+               end subroutine dlaed5
+#else 
+               module procedure stdlib_dlaed5
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed5( i, d, z, delta, rho, dlam )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i
+                    real(sp), intent(out) :: dlam,delta(2)
+                    real(sp), intent(in) :: rho,d(2),z(2)
+               end subroutine slaed5
+#else 
+               module procedure stdlib_slaed5
+#endif
+          end interface laed5
+
+
+
+          !> LAED6: computes the positive or negative root (closest to the origin)
+          !> of
+          !> z(1)        z(2)        z(3)
+          !> f(x) =   rho + --------- + ---------- + ---------
+          !> d(1)-x      d(2)-x      d(3)-x
+          !> It is assumed that
+          !> if ORGATI = .true. the root is between d(2) and d(3);
+          !> otherwise it is between d(1) and d(2)
+          !> This routine will be called by DLAED4 when necessary. In most cases,
+          !> the root sought is the smallest in magnitude, though it might not be
+          !> in some extremely rare situations.
+          interface laed6
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed6( kniter, orgati, rho, d, z, finit, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: orgati
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kniter
+                    real(dp), intent(in) :: finit,rho,d(3),z(3)
+                    real(dp), intent(out) :: tau
+               end subroutine dlaed6
+#else 
+               module procedure stdlib_dlaed6
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed6
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed6( kniter, orgati, rho, d, z, finit, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: orgati
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kniter
+                    real(sp), intent(in) :: finit,rho,d(3),z(3)
+                    real(sp), intent(out) :: tau
+               end subroutine slaed6
+#else 
+               module procedure stdlib_slaed6
+#endif
+          end interface laed6
+
+
+
+          !> LAED7: computes the updated eigensystem of a diagonal
+          !> matrix after modification by a rank-one symmetric matrix. This
+          !> routine is used only for the eigenproblem which requires all
+          !> eigenvalues and optionally eigenvectors of a dense or banded
+          !> Hermitian matrix that has been reduced to tridiagonal form.
+          !> T = Q(in) ( D(in) + RHO * Z*Z**H ) Q**H(in) = Q(out) * D(out) * Q**H(out)
+          !> where Z = Q**Hu, u is a vector of length N with ones in the
+          !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+          !> The eigenvectors of the original matrix are stored in Q, and the
+          !> eigenvalues are in D.  The algorithm consists of three stages:
+          !> The first stage consists of deflating the size of the problem
+          !> when there are multiple eigenvalues or if there is a zero in
+          !> the Z vector.  For each such occurrence the dimension of the
+          !> secular equation problem is reduced by one.  This stage is
+          !> performed by the routine SLAED2.
+          !> The second stage consists of calculating the updated
+          !> eigenvalues. This is done by finding the roots of the secular
+          !> equation via the routine SLAED4 (as called by SLAED3).
+          !> This routine also calculates the eigenvectors of the current
+          !> problem.
+          !> The final stage consists of computing the updated eigenvectors
+          !> directly using the updated eigenvalues.  The eigenvectors for
+          !> the current problem are multiplied with the eigenvectors from
+          !> the overall problem.
+          interface laed7
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claed7( n, cutpnt, qsiz, tlvls, curlvl, curpbm, d, q,ldq, rho, &
+               indxq, qstore, qptr, prmptr, perm,givptr, givcol, givnum, work, rwork, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,cutpnt,ldq,n,qsiz,tlvls
+                    integer(ilp), intent(out) :: info,indxq(*),iwork(*)
+                    real(sp), intent(inout) :: rho,d(*),givnum(2,*),qstore(*)
+                    integer(ilp), intent(inout) :: givcol(2,*),givptr(*),perm(*),prmptr(*),qptr(*)
+                              
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: q(ldq,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine claed7
+#else 
+               module procedure stdlib_claed7
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed7( icompq, n, qsiz, tlvls, curlvl, curpbm, d, q,ldq, indxq, &
+               rho, cutpnt, qstore, qptr, prmptr,perm, givptr, givcol, givnum, work, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,cutpnt,icompq,ldq,n,qsiz,&
+                              tlvls
+                    integer(ilp), intent(out) :: info,indxq(*),iwork(*)
+                    real(dp), intent(inout) :: rho,d(*),givnum(2,*),q(ldq,*),qstore(*)
+                    integer(ilp), intent(inout) :: givcol(2,*),givptr(*),perm(*),prmptr(*),qptr(*)
+                              
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlaed7
+#else 
+               module procedure stdlib_dlaed7
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed7
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed7( icompq, n, qsiz, tlvls, curlvl, curpbm, d, q,ldq, indxq, &
+               rho, cutpnt, qstore, qptr, prmptr,perm, givptr, givcol, givnum, work, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,cutpnt,icompq,ldq,n,qsiz,&
+                              tlvls
+                    integer(ilp), intent(out) :: info,indxq(*),iwork(*)
+                    real(sp), intent(inout) :: rho,d(*),givnum(2,*),q(ldq,*),qstore(*)
+                    integer(ilp), intent(inout) :: givcol(2,*),givptr(*),perm(*),prmptr(*),qptr(*)
+                              
+                    real(sp), intent(out) :: work(*)
+               end subroutine slaed7
+#else 
+               module procedure stdlib_slaed7
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaed7
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaed7( n, cutpnt, qsiz, tlvls, curlvl, curpbm, d, q,ldq, rho, &
+               indxq, qstore, qptr, prmptr, perm,givptr, givcol, givnum, work, rwork, iwork,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,cutpnt,ldq,n,qsiz,tlvls
+                    integer(ilp), intent(out) :: info,indxq(*),iwork(*)
+                    real(dp), intent(inout) :: rho,d(*),givnum(2,*),qstore(*)
+                    integer(ilp), intent(inout) :: givcol(2,*),givptr(*),perm(*),prmptr(*),qptr(*)
+                              
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: q(ldq,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlaed7
+#else 
+               module procedure stdlib_zlaed7
+#endif
+          end interface laed7
+
+
+
+          !> LAED8: merges the two sets of eigenvalues together into a single
+          !> sorted set.  Then it tries to deflate the size of the problem.
+          !> There are two ways in which deflation can occur:  when two or more
+          !> eigenvalues are close together or if there is a tiny element in the
+          !> Z vector.  For each such occurrence the order of the related secular
+          !> equation problem is reduced by one.
+          interface laed8
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claed8( k, n, qsiz, q, ldq, d, rho, cutpnt, z, dlamda,q2, ldq2, w, &
+                         indxp, indx, indxq, perm, givptr,givcol, givnum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,ldq,ldq2,n,qsiz
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(2,*),indx(*),indxp(*),perm(&
+                              *)
+                    real(sp), intent(inout) :: rho,d(*),z(*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(sp), intent(out) :: dlamda(*),givnum(2,*),w(*)
+                    complex(sp), intent(inout) :: q(ldq,*)
+                    complex(sp), intent(out) :: q2(ldq2,*)
+               end subroutine claed8
+#else 
+               module procedure stdlib_claed8
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho,cutpnt, z, &
+                         dlamda, q2, ldq2, w, perm, givptr,givcol, givnum, indxp, indx, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,icompq,ldq,ldq2,n,qsiz
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(2,*),indx(*),indxp(*),perm(&
+                              *)
+                    real(dp), intent(inout) :: rho,d(*),q(ldq,*),z(*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(dp), intent(out) :: dlamda(*),givnum(2,*),q2(ldq2,*),w(*)
+               end subroutine dlaed8
+#else 
+               module procedure stdlib_dlaed8
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed8
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho,cutpnt, z, &
+                         dlamda, q2, ldq2, w, perm, givptr,givcol, givnum, indxp, indx, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,icompq,ldq,ldq2,n,qsiz
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(2,*),indx(*),indxp(*),perm(&
+                              *)
+                    real(sp), intent(inout) :: rho,d(*),q(ldq,*),z(*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(sp), intent(out) :: dlamda(*),givnum(2,*),q2(ldq2,*),w(*)
+               end subroutine slaed8
+#else 
+               module procedure stdlib_slaed8
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaed8
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaed8( k, n, qsiz, q, ldq, d, rho, cutpnt, z, dlamda,q2, ldq2, w, &
+                         indxp, indx, indxq, perm, givptr,givcol, givnum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: cutpnt,ldq,ldq2,n,qsiz
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(2,*),indx(*),indxp(*),perm(&
+                              *)
+                    real(dp), intent(inout) :: rho,d(*),z(*)
+                    integer(ilp), intent(inout) :: indxq(*)
+                    real(dp), intent(out) :: dlamda(*),givnum(2,*),w(*)
+                    complex(dp), intent(inout) :: q(ldq,*)
+                    complex(dp), intent(out) :: q2(ldq2,*)
+               end subroutine zlaed8
+#else 
+               module procedure stdlib_zlaed8
+#endif
+          end interface laed8
+
+
+
+          !> LAED9: finds the roots of the secular equation, as defined by the
+          !> values in D, Z, and RHO, between KSTART and KSTOP.  It makes the
+          !> appropriate calls to DLAED4 and then stores the new matrix of
+          !> eigenvectors for use in calculating the next level of Z vectors.
+          interface laed9
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaed9( k, kstart, kstop, n, d, q, ldq, rho, dlamda, w,s, lds, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,kstart,kstop,ldq,lds,n
+                    real(dp), intent(in) :: rho
+                    real(dp), intent(out) :: d(*),q(ldq,*),s(lds,*)
+                    real(dp), intent(inout) :: dlamda(*),w(*)
+               end subroutine dlaed9
+#else 
+               module procedure stdlib_dlaed9
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaed9
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaed9( k, kstart, kstop, n, d, q, ldq, rho, dlamda, w,s, lds, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,kstart,kstop,ldq,lds,n
+                    real(sp), intent(in) :: rho
+                    real(sp), intent(out) :: d(*),q(ldq,*),s(lds,*)
+                    real(sp), intent(inout) :: dlamda(*),w(*)
+               end subroutine slaed9
+#else 
+               module procedure stdlib_slaed9
+#endif
+          end interface laed9
+
+
+
+          !> LAEDA: computes the Z vector corresponding to the merge step in the
+          !> CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+          !> problem.
+          interface laeda
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, &
+                         givnum, q, qptr, z, ztemp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,n,tlvls,givcol(2,*),givptr(*),perm(&
+                              *),prmptr(*),qptr(*)
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: givnum(2,*),q(*)
+                    real(dp), intent(out) :: z(*),ztemp(*)
+               end subroutine dlaeda
+#else 
+               module procedure stdlib_dlaeda
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaeda
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, &
+                         givnum, q, qptr, z, ztemp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: curlvl,curpbm,n,tlvls,givcol(2,*),givptr(*),perm(&
+                              *),prmptr(*),qptr(*)
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: givnum(2,*),q(*)
+                    real(sp), intent(out) :: z(*),ztemp(*)
+               end subroutine slaeda
+#else 
+               module procedure stdlib_slaeda
+#endif
+          end interface laeda
+
+
+
+          !> LAEIN: uses inverse iteration to find a right or left eigenvector
+          !> corresponding to the eigenvalue W of a complex upper Hessenberg
+          !> matrix H.
+          interface laein
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claein( rightv, noinit, n, h, ldh, w, v, b, ldb, rwork,eps3, &
+                         smlnum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: noinit,rightv
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldh,n
+                    real(sp), intent(in) :: eps3,smlnum
+                    complex(sp), intent(in) :: w,h(ldh,*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(out) :: b(ldb,*)
+                    complex(sp), intent(inout) :: v(*)
+               end subroutine claein
+#else 
+               module procedure stdlib_claein
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaein( rightv, noinit, n, h, ldh, wr, wi, vr, vi, b,ldb, work, &
+                         eps3, smlnum, bignum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: noinit,rightv
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldh,n
+                    real(dp), intent(in) :: bignum,eps3,smlnum,wi,wr,h(ldh,*)
+                    real(dp), intent(out) :: b(ldb,*),work(*)
+                    real(dp), intent(inout) :: vi(*),vr(*)
+               end subroutine dlaein
+#else 
+               module procedure stdlib_dlaein
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaein( rightv, noinit, n, h, ldh, wr, wi, vr, vi, b,ldb, work, &
+                         eps3, smlnum, bignum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: noinit,rightv
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldh,n
+                    real(sp), intent(in) :: bignum,eps3,smlnum,wi,wr,h(ldh,*)
+                    real(sp), intent(out) :: b(ldb,*),work(*)
+                    real(sp), intent(inout) :: vi(*),vr(*)
+               end subroutine slaein
+#else 
+               module procedure stdlib_slaein
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaein( rightv, noinit, n, h, ldh, w, v, b, ldb, rwork,eps3, &
+                         smlnum, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: noinit,rightv
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldh,n
+                    real(dp), intent(in) :: eps3,smlnum
+                    complex(dp), intent(in) :: w,h(ldh,*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(out) :: b(ldb,*)
+                    complex(dp), intent(inout) :: v(*)
+               end subroutine zlaein
+#else 
+               module procedure stdlib_zlaein
+#endif
+          end interface laein
+
+
+
+          !> LAESY: computes the eigendecomposition of a 2-by-2 symmetric matrix
+          !> ( ( A, B );( B, C ) )
+          !> provided the norm of the matrix of eigenvectors is larger than
+          !> some threshold value.
+          !> RT1 is the eigenvalue of larger absolute value, and RT2 of
+          !> smaller absolute value.  If the eigenvectors are computed, then
+          !> on return ( CS1, SN1 ) is the unit eigenvector for RT1, hence
+          !> [  CS1     SN1   ] . [ A  B ] . [ CS1    -SN1   ] = [ RT1  0  ]
+          !> [ -SN1     CS1   ]   [ B  C ]   [ SN1     CS1   ]   [  0  RT2 ]
+          interface laesy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claesy( a, b, c, rt1, rt2, evscal, cs1, sn1 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: a,b,c
+                    complex(sp), intent(out) :: cs1,evscal,rt1,rt2,sn1
+               end subroutine claesy
+#else 
+               module procedure stdlib_claesy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaesy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaesy( a, b, c, rt1, rt2, evscal, cs1, sn1 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: a,b,c
+                    complex(dp), intent(out) :: cs1,evscal,rt1,rt2,sn1
+               end subroutine zlaesy
+#else 
+               module procedure stdlib_zlaesy
+#endif
+          end interface laesy
+
+
+
+          !> LAEXC: swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+          !> an upper quasi-triangular matrix T by an orthogonal similarity
+          !> transformation.
+          !> T must be in Schur canonical form, that is, block upper triangular
+          !> with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+          !> has its diagonal elements equal and its off-diagonal elements of
+          !> opposite sign.
+          interface laexc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlaexc( wantq, n, t, ldt, q, ldq, j1, n1, n2, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: j1,ldq,ldt,n,n1,n2
+                    real(dp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlaexc
+#else 
+               module procedure stdlib_dlaexc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaexc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slaexc( wantq, n, t, ldt, q, ldq, j1, n1, n2, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: j1,ldq,ldt,n,n1,n2
+                    real(sp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slaexc
+#else 
+               module procedure stdlib_slaexc
+#endif
+          end interface laexc
+
+
+
+          !> LAGTF: factorizes the matrix (T - lambda*I), where T is an n by n
+          !> tridiagonal matrix and lambda is a scalar, as
+          !> T - lambda*I = PLU,
+          !> where P is a permutation matrix, L is a unit lower tridiagonal matrix
+          !> with at most one non-zero sub-diagonal elements per column and U is
+          !> an upper triangular matrix with at most two non-zero super-diagonal
+          !> elements per column.
+          !> The factorization is obtained by Gaussian elimination with partial
+          !> pivoting and implicit row scaling.
+          !> The parameter LAMBDA is included in the routine so that LAGTF may
+          !> be used, in conjunction with DLAGTS, to obtain eigenvectors of T by
+          !> inverse iteration.
+          interface lagtf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlagtf( n, a, lambda, b, c, tol, d, in, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,in(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: lambda,tol
+                    real(dp), intent(inout) :: a(*),b(*),c(*)
+                    real(dp), intent(out) :: d(*)
+               end subroutine dlagtf
+#else 
+               module procedure stdlib_dlagtf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlagtf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slagtf( n, a, lambda, b, c, tol, d, in, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,in(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: lambda,tol
+                    real(sp), intent(inout) :: a(*),b(*),c(*)
+                    real(sp), intent(out) :: d(*)
+               end subroutine slagtf
+#else 
+               module procedure stdlib_slagtf
+#endif
+          end interface lagtf
+
+
+
+          !> LAGTM: performs a matrix-vector product of the form
+          !> B := alpha * A * X + beta * B
+          !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+          !> matrices, and alpha and beta are real scalars, each of which may be
+          !> 0., 1., or -1.
+          interface lagtm
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: alpha,beta
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(in) :: d(*),dl(*),du(*),x(ldx,*)
+               end subroutine clagtm
+#else 
+               module procedure stdlib_clagtm
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: alpha,beta,d(*),dl(*),du(*),x(ldx,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dlagtm
+#else 
+               module procedure stdlib_dlagtm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlagtm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: alpha,beta,d(*),dl(*),du(*),x(ldx,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine slagtm
+#else 
+               module procedure stdlib_slagtm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlagtm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: alpha,beta
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(in) :: d(*),dl(*),du(*),x(ldx,*)
+               end subroutine zlagtm
+#else 
+               module procedure stdlib_zlagtm
+#endif
+          end interface lagtm
+
+
+
+          !> LAGTS: may be used to solve one of the systems of equations
+          !> (T - lambda*I)*x = y   or   (T - lambda*I)**T*x = y,
+          !> where T is an n by n tridiagonal matrix, for x, following the
+          !> factorization of (T - lambda*I) as
+          !> (T - lambda*I) = P*L*U ,
+          !> by routine DLAGTF. The choice of equation to be solved is
+          !> controlled by the argument JOB, and in each case there is an option
+          !> to perturb zero or very small diagonal elements of U, this option
+          !> being intended for use in applications such as inverse iteration.
+          interface lagts
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlagts( job, n, a, b, c, d, in, y, tol, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: job,n,in(*)
+                    real(dp), intent(inout) :: tol,y(*)
+                    real(dp), intent(in) :: a(*),b(*),c(*),d(*)
+               end subroutine dlagts
+#else 
+               module procedure stdlib_dlagts
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlagts
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slagts( job, n, a, b, c, d, in, y, tol, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: job,n,in(*)
+                    real(sp), intent(inout) :: tol,y(*)
+                    real(sp), intent(in) :: a(*),b(*),c(*),d(*)
+               end subroutine slagts
+#else 
+               module procedure stdlib_slagts
+#endif
+          end interface lagts
+
+
+
+          !> LAHEF: computes a partial factorization of a complex Hermitian
+          !> matrix A using the Bunch-Kaufman diagonal pivoting method. The
+          !> partial factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+          !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+          !> ( L21  I ) (  0  A22 ) (  0      I     )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> Note that U**H denotes the conjugate transpose of U.
+          !> LAHEF is an auxiliary routine called by CHETRF. It uses blocked code
+          !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+          !> A22 (if UPLO = 'L').
+          interface lahef
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clahef( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: w(ldw,*)
+               end subroutine clahef
+#else 
+               module procedure stdlib_clahef
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlahef
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlahef( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: w(ldw,*)
+               end subroutine zlahef
+#else 
+               module procedure stdlib_zlahef
+#endif
+          end interface lahef
+
+
+
+          !> LAHEF_AA: factorizes a panel of a complex hermitian matrix A using
+          !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+          !> when UPLO is U, or a set of NB columns when UPLO is L.
+          !> In order to factorize the panel, the Aasen's algorithm requires the
+          !> last row, or column, of the previous panel. The first row, or column,
+          !> of A is set to be the first row, or column, of an identity matrix,
+          !> which is used to factorize the first panel.
+          !> The resulting J-th row of U, or J-th column of L, is stored in the
+          !> (J-1)-th row, or column, of A (without the unit diagonals), while
+          !> the diagonal and subdiagonal of A are overwritten by those of T.
+          interface lahef_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clahef_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),h(ldh,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clahef_aa
+#else 
+               module procedure stdlib_clahef_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlahef_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlahef_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),h(ldh,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlahef_aa
+#else 
+               module procedure stdlib_zlahef_aa
+#endif
+          end interface lahef_aa
+
+
+
+          !> LAHEF_RK: computes a partial factorization of a complex Hermitian
+          !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+          !> pivoting method. The partial factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+          !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L',
+          !> ( L21  I ) (  0  A22 ) (  0       I    )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> LAHEF_RK is an auxiliary routine called by CHETRF_RK. It uses
+          !> blocked code (calling Level 3 BLAS) to update the submatrix
+          !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+          interface lahef_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clahef_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: w(ldw,*),e(*)
+               end subroutine clahef_rk
+#else 
+               module procedure stdlib_clahef_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlahef_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlahef_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: w(ldw,*),e(*)
+               end subroutine zlahef_rk
+#else 
+               module procedure stdlib_zlahef_rk
+#endif
+          end interface lahef_rk
+
+
+
+          !> LAHEF_ROOK: computes a partial factorization of a complex Hermitian
+          !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting
+          !> method. The partial factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+          !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+          !> ( L21  I ) (  0  A22 ) (  0      I     )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> Note that U**H denotes the conjugate transpose of U.
+          !> LAHEF_ROOK is an auxiliary routine called by CHETRF_ROOK. It uses
+          !> blocked code (calling Level 3 BLAS) to update the submatrix
+          !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+          interface lahef_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clahef_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: w(ldw,*)
+               end subroutine clahef_rook
+#else 
+               module procedure stdlib_clahef_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlahef_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlahef_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: w(ldw,*)
+               end subroutine zlahef_rook
+#else 
+               module procedure stdlib_zlahef_rook
+#endif
+          end interface lahef_rook
+
+
+
+          !> LAHQR: is an auxiliary routine called by CHSEQR to update the
+          !> eigenvalues and Schur decomposition already computed by CHSEQR, by
+          !> dealing with the Hessenberg submatrix in rows and columns ILO to
+          !> IHI.
+          interface lahqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(sp), intent(out) :: w(*)
+               end subroutine clahqr
+#else 
+               module procedure stdlib_clahqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, &
+                         ldz, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(dp), intent(out) :: wi(*),wr(*)
+               end subroutine dlahqr
+#else 
+               module procedure stdlib_dlahqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlahqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, &
+                         ldz, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(sp), intent(out) :: wi(*),wr(*)
+               end subroutine slahqr
+#else 
+               module procedure stdlib_slahqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlahqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(dp), intent(out) :: w(*)
+               end subroutine zlahqr
+#else 
+               module procedure stdlib_zlahqr
+#endif
+          end interface lahqr
+
+
+
+          !> LAIC1: applies one step of incremental condition estimation in
+          !> its simplest version:
+          !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+          !> lower triangular matrix L, such that
+          !> twonorm(L*x) = sest
+          !> Then LAIC1 computes sestpr, s, c such that
+          !> the vector
+          !> [ s*x ]
+          !> xhat = [  c  ]
+          !> is an approximate singular vector of
+          !> [ L      0  ]
+          !> Lhat = [ w**H gamma ]
+          !> in the sense that
+          !> twonorm(Lhat*xhat) = sestpr.
+          !> Depending on JOB, an estimate for the largest or smallest singular
+          !> value is computed.
+          !> Note that [s c]**H and sestpr**2 is an eigenpair of the system
+          !> diag(sest*sest, 0) + [alpha  gamma] * [ conjg(alpha) ]
+          !> [ conjg(gamma) ]
+          !> where  alpha =  x**H*w.
+          interface laic1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claic1( job, j, x, sest, w, gamma, sestpr, s, c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: j,job
+                    real(sp), intent(in) :: sest
+                    real(sp), intent(out) :: sestpr
+                    complex(sp), intent(out) :: c,s
+                    complex(sp), intent(in) :: gamma,w(j),x(j)
+               end subroutine claic1
+#else 
+               module procedure stdlib_claic1
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: j,job
+                    real(dp), intent(out) :: c,s,sestpr
+                    real(dp), intent(in) :: gamma,sest,w(j),x(j)
+               end subroutine dlaic1
+#else 
+               module procedure stdlib_dlaic1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaic1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: j,job
+                    real(sp), intent(out) :: c,s,sestpr
+                    real(sp), intent(in) :: gamma,sest,w(j),x(j)
+               end subroutine slaic1
+#else 
+               module procedure stdlib_slaic1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaic1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: j,job
+                    real(dp), intent(in) :: sest
+                    real(dp), intent(out) :: sestpr
+                    complex(dp), intent(out) :: c,s
+                    complex(dp), intent(in) :: gamma,w(j),x(j)
+               end subroutine zlaic1
+#else 
+               module procedure stdlib_zlaic1
+#endif
+          end interface laic1
+
+
+
+          !> This routine is not for general use.  It exists solely to avoid
+          !> over-optimization in DISNAN.
+          !> LAISNAN: checks for NaNs by comparing its two arguments for
+          !> inequality.  NaN is the only floating-point value where NaN != NaN
+          !> returns .TRUE.  To check for NaNs, pass the same variable as both
+          !> arguments.
+          !> A compiler must assume that the two arguments are
+          !> not the same variable, and the test will not be optimized away.
+          !> Interprocedural or whole-program optimization may delete this
+          !> test.  The ISNAN functions will be replaced by the correct
+          !> Fortran 03 intrinsic once the intrinsic is widely available.
+          interface laisnan
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure logical(lk) function dlaisnan( din1, din2 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: din1,din2
+               end function dlaisnan
+#else 
+               module procedure stdlib_dlaisnan
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaisnan
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure logical(lk) function slaisnan( sin1, sin2 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: sin1,sin2
+               end function slaisnan
+#else 
+               module procedure stdlib_slaisnan
+#endif
+          end interface laisnan
+
+
+
+          !> LALS0: applies back the multiplying factors of either the left or the
+          !> right singular vector matrix of a diagonal matrix appended by a row
+          !> to the right hand side matrix B in solving the least squares problem
+          !> using the divide-and-conquer SVD approach.
+          !> For the left singular vector matrix, three types of orthogonal
+          !> matrices are involved:
+          !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+          !> pairs of columns/rows they were applied to are stored in GIVCOL;
+          !> and the C- and S-values of these rotations are stored in GIVNUM.
+          !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+          !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+          !> J-th row.
+          !> (3L) The left singular vector matrix of the remaining matrix.
+          !> For the right singular vector matrix, four types of orthogonal
+          !> matrices are involved:
+          !> (1R) The right singular vector matrix of the remaining matrix.
+          !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+          !> null space.
+          !> (3R) The inverse transformation of (2L).
+          !> (4R) The inverse transformation of (1L).
+          interface lals0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+                         givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: givptr,icompq,k,ldb,ldbx,ldgcol,ldgnum,nl,nr,nrhs,&
+                              sqre,givcol(ldgcol,*),perm(*)
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: c,s,difl(*),difr(ldgnum,*),givnum(ldgnum,*),poles(&
+                              ldgnum,*),z(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(out) :: bx(ldbx,*)
+               end subroutine clals0
+#else 
+               module procedure stdlib_clals0
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+                         givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: givptr,icompq,k,ldb,ldbx,ldgcol,ldgnum,nl,nr,nrhs,&
+                              sqre,givcol(ldgcol,*),perm(*)
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: c,s,difl(*),difr(ldgnum,*),givnum(ldgnum,*),poles(&
+                              ldgnum,*),z(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+                    real(dp), intent(out) :: bx(ldbx,*),work(*)
+               end subroutine dlals0
+#else 
+               module procedure stdlib_dlals0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlals0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+                         givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: givptr,icompq,k,ldb,ldbx,ldgcol,ldgnum,nl,nr,nrhs,&
+                              sqre,givcol(ldgcol,*),perm(*)
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: c,s,difl(*),difr(ldgnum,*),givnum(ldgnum,*),poles(&
+                              ldgnum,*),z(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+                    real(sp), intent(out) :: bx(ldbx,*),work(*)
+               end subroutine slals0
+#else 
+               module procedure stdlib_slals0
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlals0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+                         givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: givptr,icompq,k,ldb,ldbx,ldgcol,ldgnum,nl,nr,nrhs,&
+                              sqre,givcol(ldgcol,*),perm(*)
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: c,s,difl(*),difr(ldgnum,*),givnum(ldgnum,*),poles(&
+                              ldgnum,*),z(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(out) :: bx(ldbx,*)
+               end subroutine zlals0
+#else 
+               module procedure stdlib_zlals0
+#endif
+          end interface lals0
+
+
+
+          !> LALSA: is an itermediate step in solving the least squares problem
+          !> by computing the SVD of the coefficient matrix in compact form (The
+          !> singular vectors are computed as products of simple orthorgonal
+          !> matrices.).
+          !> If ICOMPQ = 0, LALSA applies the inverse of the left singular vector
+          !> matrix of an upper bidiagonal matrix to the right hand side; and if
+          !> ICOMPQ = 1, LALSA applies the right singular vector matrix to the
+          !> right hand side. The singular vector matrices were generated in
+          !> compact form by LALSA.
+          interface lalsa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, &
+               difl, difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, rwork,iwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldb,ldbx,ldgcol,ldu,n,nrhs,smlsiz,givcol(&
+                              ldgcol,*),givptr(*),k(*),perm(ldgcol,*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),z(ldu,*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(out) :: bx(ldbx,*)
+               end subroutine clalsa
+#else 
+               module procedure stdlib_clalsa
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, &
+               difl, difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldb,ldbx,ldgcol,ldu,n,nrhs,smlsiz,givcol(&
+                              ldgcol,*),givptr(*),k(*),perm(ldgcol,*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+                    real(dp), intent(out) :: bx(ldbx,*),work(*)
+                    real(dp), intent(in) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),z(ldu,*)
+               end subroutine dlalsa
+#else 
+               module procedure stdlib_dlalsa
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlalsa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, &
+               difl, difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldb,ldbx,ldgcol,ldu,n,nrhs,smlsiz,givcol(&
+                              ldgcol,*),givptr(*),k(*),perm(ldgcol,*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+                    real(sp), intent(out) :: bx(ldbx,*),work(*)
+                    real(sp), intent(in) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),z(ldu,*)
+               end subroutine slalsa
+#else 
+               module procedure stdlib_slalsa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlalsa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, &
+               difl, difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, rwork,iwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldb,ldbx,ldgcol,ldu,n,nrhs,smlsiz,givcol(&
+                              ldgcol,*),givptr(*),k(*),perm(ldgcol,*)
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),z(ldu,*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(out) :: bx(ldbx,*)
+               end subroutine zlalsa
+#else 
+               module procedure stdlib_zlalsa
+#endif
+          end interface lalsa
+
+
+
+          !> LALSD: uses the singular value decomposition of A to solve the least
+          !> squares problem of finding X to minimize the Euclidean norm of each
+          !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+          !> are N-by-NRHS. The solution X overwrites B.
+          !> The singular values of A smaller than RCOND times the largest
+          !> singular value are treated as zero in solving the least squares
+          !> problem; in this case a minimum norm solution is returned.
+          !> The actual singular values are returned in D in ascending order.
+          !> This code makes very mild assumptions about floating point
+          !> arithmetic. It will work on machines with a guard digit in
+          !> add/subtract, or on those binary machines without guard digits
+          !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+          !> It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface lalsd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, &
+                         rwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs,smlsiz
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clalsd
+#else 
+               module procedure stdlib_clalsd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs,smlsiz
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(inout) :: b(ldb,*),d(*),e(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlalsd
+#else 
+               module procedure stdlib_dlalsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlalsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs,smlsiz
+                    real(sp), intent(in) :: rcond
+                    real(sp), intent(inout) :: b(ldb,*),d(*),e(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slalsd
+#else 
+               module procedure stdlib_slalsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlalsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, &
+                         rwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,rank,iwork(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs,smlsiz
+                    real(dp), intent(in) :: rcond
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlalsd
+#else 
+               module procedure stdlib_zlalsd
+#endif
+          end interface lalsd
+
+
+
+          !> LAMRG: will create a permutation list which will merge the elements
+          !> of A (which is composed of two independently sorted sets) into a
+          !> single set which is sorted in ascending order.
+          interface lamrg
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlamrg( n1, n2, a, dtrd1, dtrd2, index )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: dtrd1,dtrd2,n1,n2
+                    integer(ilp), intent(out) :: index(*)
+                    real(dp), intent(in) :: a(*)
+               end subroutine dlamrg
+#else 
+               module procedure stdlib_dlamrg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlamrg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slamrg( n1, n2, a, strd1, strd2, index )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n1,n2,strd1,strd2
+                    integer(ilp), intent(out) :: index(*)
+                    real(sp), intent(in) :: a(*)
+               end subroutine slamrg
+#else 
+               module procedure stdlib_slamrg
+#endif
+          end interface lamrg
+
+
+
+          !> LAMSWLQ: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of blocked
+          !> elementary reflectors computed by short wide LQ
+          !> factorization (CLASWLQ)
+          interface lamswlq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    complex(sp), intent(in) :: a(lda,*),t(ldt,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine clamswlq
+#else 
+               module procedure stdlib_clamswlq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    real(dp), intent(in) :: a(lda,*),t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dlamswlq
+#else 
+               module procedure stdlib_dlamswlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlamswlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    real(sp), intent(in) :: a(lda,*),t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine slamswlq
+#else 
+               module procedure stdlib_slamswlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlamswlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    complex(dp), intent(in) :: a(lda,*),t(ldt,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zlamswlq
+#else 
+               module procedure stdlib_zlamswlq
+#endif
+          end interface lamswlq
+
+
+
+          !> LAMTSQR: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product
+          !> of blocked elementary reflectors computed by tall skinny
+          !> QR factorization (CLATSQR)
+          interface lamtsqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    complex(sp), intent(in) :: a(lda,*),t(ldt,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine clamtsqr
+#else 
+               module procedure stdlib_clamtsqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    real(dp), intent(in) :: a(lda,*),t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dlamtsqr
+#else 
+               module procedure stdlib_dlamtsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlamtsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    real(sp), intent(in) :: a(lda,*),t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine slamtsqr
+#else 
+               module procedure stdlib_slamtsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlamtsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,k,mb,nb,ldt,lwork,ldc
+                    complex(dp), intent(in) :: a(lda,*),t(ldt,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine zlamtsqr
+#else 
+               module procedure stdlib_zlamtsqr
+#endif
+          end interface lamtsqr
+
+
+
+          !> LANEG: computes the Sturm count, the number of negative pivots
+          !> encountered while factoring tridiagonal T - sigma I = L D L^T.
+          !> This implementation works directly on the factors without forming
+          !> the tridiagonal matrix T.  The Sturm count is also the number of
+          !> eigenvalues of T less than sigma.
+          !> This routine is called from DLARRB.
+          !> The current routine does not use the PIVMIN parameter but rather
+          !> requires IEEE-754 propagation of Infinities and NaNs.  This
+          !> routine also has no input range restrictions but does require
+          !> default exception handling such that x/0 produces Inf when x is
+          !> non-zero, and Inf/Inf produces NaN.  For more information, see:
+          !> Marques, Riedy, and Voemel, "Benefits of IEEE-754 Features in
+          !> Modern Symmetric Tridiagonal Eigensolvers," SIAM Journal on
+          !> Scientific Computing, v28, n5, 2006.  DOI 10.1137/050641624
+          !> (Tech report version in LAWN 172 with the same title.)
+          interface laneg
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure integer(ilp) function dlaneg( n, d, lld, sigma, pivmin, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,r
+                    real(dp), intent(in) :: pivmin,sigma,d(*),lld(*)
+               end function dlaneg
+#else 
+               module procedure stdlib_dlaneg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaneg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure integer(ilp) function slaneg( n, d, lld, sigma, pivmin, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n,r
+                    real(sp), intent(in) :: pivmin,sigma,d(*),lld(*)
+               end function slaneg
+#else 
+               module procedure stdlib_slaneg
+#endif
+          end interface laneg
+
+
+
+          !> LANGB:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the element of  largest absolute value  of an
+          !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+          interface langb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clangb( norm, n, kl, ku, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: kl,ku,ldab,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end function clangb
+#else 
+               module procedure stdlib_clangb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlangb( norm, n, kl, ku, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: kl,ku,ldab,n
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlangb
+#else 
+               module procedure stdlib_dlangb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlangb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slangb( norm, n, kl, ku, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: kl,ku,ldab,n
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(out) :: work(*)
+               end function slangb
+#else 
+               module procedure stdlib_slangb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlangb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlangb( norm, n, kl, ku, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: kl,ku,ldab,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end function zlangb
+#else 
+               module procedure stdlib_zlangb
+#endif
+          end interface langb
+
+
+
+          !> LANGE:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex matrix A.
+          interface lange
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clange( norm, m, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end function clange
+#else 
+               module procedure stdlib_clange
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlange( norm, m, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlange
+#else 
+               module procedure stdlib_dlange
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlange
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slange( norm, m, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end function slange
+#else 
+               module procedure stdlib_slange
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlange
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlange( norm, m, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end function zlange
+#else 
+               module procedure stdlib_zlange
+#endif
+          end interface lange
+
+
+
+          !> LANGT:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex tridiagonal matrix A.
+          interface langt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp)             function clangt( norm, n, dl, d, du )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(in) :: d(*),dl(*),du(*)
+               end function clangt
+#else 
+               module procedure stdlib_clangt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function dlangt( norm, n, dl, d, du )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: d(*),dl(*),du(*)
+               end function dlangt
+#else 
+               module procedure stdlib_dlangt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlangt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp)             function slangt( norm, n, dl, d, du )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: d(*),dl(*),du(*)
+               end function slangt
+#else 
+               module procedure stdlib_slangt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlangt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function zlangt( norm, n, dl, d, du )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(in) :: d(*),dl(*),du(*)
+               end function zlangt
+#else 
+               module procedure stdlib_zlangt
+#endif
+          end interface langt
+
+
+
+          !> LANHB:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the element of  largest absolute value  of an
+          !> n by n hermitian band matrix A,  with k super-diagonals.
+          interface lanhb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clanhb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end function clanhb
+#else 
+               module procedure stdlib_clanhb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanhb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlanhb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end function zlanhb
+#else 
+               module procedure stdlib_zlanhb
+#endif
+          end interface lanhb
+
+
+
+          !> LANHE:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex hermitian matrix A.
+          interface lanhe
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clanhe( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end function clanhe
+#else 
+               module procedure stdlib_clanhe
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanhe
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlanhe( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end function zlanhe
+#else 
+               module procedure stdlib_zlanhe
+#endif
+          end interface lanhe
+
+
+
+          !> LANHF:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex Hermitian matrix A in RFP format.
+          interface lanhf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function clanhf( norm, transr, uplo, n, a, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,transr,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: work(0:*)
+                    complex(sp), intent(in) :: a(0:*)
+               end function clanhf
+#else 
+               module procedure stdlib_clanhf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanhf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlanhf( norm, transr, uplo, n, a, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,transr,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: work(0:*)
+                    complex(dp), intent(in) :: a(0:*)
+               end function zlanhf
+#else 
+               module procedure stdlib_zlanhf
+#endif
+          end interface lanhf
+
+
+
+          !> LANHP:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex hermitian matrix A,  supplied in packed form.
+          interface lanhp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clanhp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ap(*)
+               end function clanhp
+#else 
+               module procedure stdlib_clanhp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanhp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlanhp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ap(*)
+               end function zlanhp
+#else 
+               module procedure stdlib_zlanhp
+#endif
+          end interface lanhp
+
+
+
+          !> LANHS:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> Hessenberg matrix A.
+          interface lanhs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clanhs( norm, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end function clanhs
+#else 
+               module procedure stdlib_clanhs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlanhs( norm, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlanhs
+#else 
+               module procedure stdlib_dlanhs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlanhs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slanhs( norm, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end function slanhs
+#else 
+               module procedure stdlib_slanhs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanhs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlanhs( norm, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end function zlanhs
+#else 
+               module procedure stdlib_zlanhs
+#endif
+          end interface lanhs
+
+
+
+          !> LANHT:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex Hermitian tridiagonal matrix A.
+          interface lanht
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp)             function clanht( norm, n, d, e )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: d(*)
+                    complex(sp), intent(in) :: e(*)
+               end function clanht
+#else 
+               module procedure stdlib_clanht
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlanht
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function zlanht( norm, n, d, e )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: d(*)
+                    complex(dp), intent(in) :: e(*)
+               end function zlanht
+#else 
+               module procedure stdlib_zlanht
+#endif
+          end interface lanht
+
+
+
+          !> LANSB:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the element of  largest absolute value  of an
+          !> n by n symmetric band matrix A,  with k super-diagonals.
+          interface lansb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clansb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end function clansb
+#else 
+               module procedure stdlib_clansb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlansb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlansb
+#else 
+               module procedure stdlib_dlansb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlansb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slansb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(out) :: work(*)
+               end function slansb
+#else 
+               module procedure stdlib_slansb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlansb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlansb( norm, uplo, n, k, ab, ldab,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end function zlansb
+#else 
+               module procedure stdlib_zlansb
+#endif
+          end interface lansb
+
+
+
+          !> LANSF: returns the value of the one norm, or the Frobenius norm, or
+          !> the infinity norm, or the element of largest absolute value of a
+          !> real symmetric matrix A in RFP format.
+          interface lansf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlansf( norm, transr, uplo, n, a, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,transr,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: a(0:*)
+                    real(dp), intent(out) :: work(0:*)
+               end function dlansf
+#else 
+               module procedure stdlib_dlansf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlansf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp) function slansf( norm, transr, uplo, n, a, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,transr,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: a(0:*)
+                    real(sp), intent(out) :: work(0:*)
+               end function slansf
+#else 
+               module procedure stdlib_slansf
+#endif
+          end interface lansf
+
+
+
+          !> LANSP:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex symmetric matrix A,  supplied in packed form.
+          interface lansp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clansp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ap(*)
+               end function clansp
+#else 
+               module procedure stdlib_clansp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlansp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(out) :: work(*)
+               end function dlansp
+#else 
+               module procedure stdlib_dlansp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlansp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slansp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(out) :: work(*)
+               end function slansp
+#else 
+               module procedure stdlib_slansp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlansp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlansp( norm, uplo, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ap(*)
+               end function zlansp
+#else 
+               module procedure stdlib_zlansp
+#endif
+          end interface lansp
+
+
+
+          !> LANST:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> real symmetric tridiagonal matrix A.
+          interface lanst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(dp) function dlanst( norm, n, d, e )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: d(*),e(*)
+               end function dlanst
+#else 
+               module procedure stdlib_dlanst
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlanst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure real(sp)             function slanst( norm, n, d, e )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: d(*),e(*)
+               end function slanst
+#else 
+               module procedure stdlib_slanst
+#endif
+          end interface lanst
+
+
+
+          !> LANSY:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> complex symmetric matrix A.
+          interface lansy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clansy( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end function clansy
+#else 
+               module procedure stdlib_clansy
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlansy( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlansy
+#else 
+               module procedure stdlib_dlansy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlansy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slansy( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end function slansy
+#else 
+               module procedure stdlib_slansy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlansy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlansy( norm, uplo, n, a, lda, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: norm,uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end function zlansy
+#else 
+               module procedure stdlib_zlansy
+#endif
+          end interface lansy
+
+
+
+          !> LANTB:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the element of  largest absolute value  of an
+          !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+          interface lantb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clantb( norm, uplo, diag, n, k, ab,ldab, work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end function clantb
+#else 
+               module procedure stdlib_clantb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlantb( norm, uplo, diag, n, k, ab,ldab, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlantb
+#else 
+               module procedure stdlib_dlantb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlantb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slantb( norm, uplo, diag, n, k, ab,ldab, work )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(out) :: work(*)
+               end function slantb
+#else 
+               module procedure stdlib_slantb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlantb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlantb( norm, uplo, diag, n, k, ab,ldab, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: k,ldab,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end function zlantb
+#else 
+               module procedure stdlib_zlantb
+#endif
+          end interface lantb
+
+
+
+          !> LANTP:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> triangular matrix A, supplied in packed form.
+          interface lantp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clantp( norm, uplo, diag, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: ap(*)
+               end function clantp
+#else 
+               module procedure stdlib_clantp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlantp( norm, uplo, diag, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(out) :: work(*)
+               end function dlantp
+#else 
+               module procedure stdlib_dlantp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlantp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slantp( norm, uplo, diag, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(out) :: work(*)
+               end function slantp
+#else 
+               module procedure stdlib_slantp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlantp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlantp( norm, uplo, diag, n, ap, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: ap(*)
+               end function zlantp
+#else 
+               module procedure stdlib_zlantp
+#endif
+          end interface lantp
+
+
+
+          !> LANTR:  returns the value of the one norm,  or the Frobenius norm, or
+          !> the  infinity norm,  or the  element of  largest absolute value  of a
+          !> trapezoidal or triangular matrix A.
+          interface lantr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function clantr( norm, uplo, diag, m, n, a, lda,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end function clantr
+#else 
+               module procedure stdlib_clantr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function dlantr( norm, uplo, diag, m, n, a, lda,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end function dlantr
+#else 
+               module procedure stdlib_dlantr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlantr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(sp)             function slantr( norm, uplo, diag, m, n, a, lda,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end function slantr
+#else 
+               module procedure stdlib_slantr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlantr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               real(dp) function zlantr( norm, uplo, diag, m, n, a, lda,work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end function zlantr
+#else 
+               module procedure stdlib_zlantr
+#endif
+          end interface lantr
+
+
+
+          !> LAORHR_COL_GETRFNP: computes the modified LU factorization without
+          !> pivoting of a real general M-by-N matrix A. The factorization has
+          !> the form:
+          !> A - S = L * U,
+          !> where:
+          !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+          !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+          !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+          !> i-1 steps of Gaussian elimination. This means that the diagonal
+          !> element at each step of "modified" Gaussian elimination is
+          !> at least one in absolute value (so that division-by-zero not
+          !> not possible during the division by the diagonal element);
+          !> L is a M-by-N lower triangular matrix with unit diagonal elements
+          !> (lower trapezoidal if M > N);
+          !> and U is a M-by-N upper triangular matrix
+          !> (upper trapezoidal if M < N).
+          !> This routine is an auxiliary routine used in the Householder
+          !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+          !> applied to an M-by-N matrix A with orthonormal columns, where each
+          !> element is bounded by one in absolute value. With the choice of
+          !> the matrix S above, one can show that the diagonal element at each
+          !> step of Gaussian elimination is the largest (in absolute value) in
+          !> the column on or below the diagonal, so that no pivoting is required
+          !> for numerical stability [1].
+          !> For more details on the Householder reconstruction algorithm,
+          !> including the modified LU factorization, see [1].
+          !> This is the blocked right-looking version of the algorithm,
+          !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+          !> this routine calls the recursive routine LAORHR_COL_GETRFNP2.
+          !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+          !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+          !> E. Solomonik, J. Parallel Distrib. Comput.,
+          !> vol. 85, pp. 3-31, 2015.
+          interface laorhr_col_getrfnp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaorhr_col_getrfnp( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*)
+               end subroutine dlaorhr_col_getrfnp
+#else 
+               module procedure stdlib_dlaorhr_col_getrfnp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaorhr_col_getrfnp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaorhr_col_getrfnp( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*)
+               end subroutine slaorhr_col_getrfnp
+#else 
+               module procedure stdlib_slaorhr_col_getrfnp
+#endif
+          end interface laorhr_col_getrfnp
+
+
+
+          !> LAORHR_COL_GETRFNP2: computes the modified LU factorization without
+          !> pivoting of a real general M-by-N matrix A. The factorization has
+          !> the form:
+          !> A - S = L * U,
+          !> where:
+          !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+          !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+          !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+          !> i-1 steps of Gaussian elimination. This means that the diagonal
+          !> element at each step of "modified" Gaussian elimination is at
+          !> least one in absolute value (so that division-by-zero not
+          !> possible during the division by the diagonal element);
+          !> L is a M-by-N lower triangular matrix with unit diagonal elements
+          !> (lower trapezoidal if M > N);
+          !> and U is a M-by-N upper triangular matrix
+          !> (upper trapezoidal if M < N).
+          !> This routine is an auxiliary routine used in the Householder
+          !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+          !> applied to an M-by-N matrix A with orthonormal columns, where each
+          !> element is bounded by one in absolute value. With the choice of
+          !> the matrix S above, one can show that the diagonal element at each
+          !> step of Gaussian elimination is the largest (in absolute value) in
+          !> the column on or below the diagonal, so that no pivoting is required
+          !> for numerical stability [1].
+          !> For more details on the Householder reconstruction algorithm,
+          !> including the modified LU factorization, see [1].
+          !> This is the recursive version of the LU factorization algorithm.
+          !> Denote A - S by B. The algorithm divides the matrix B into four
+          !> submatrices:
+          !> [  B11 | B12  ]  where B11 is n1 by n1,
+          !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+          !> [  B21 | B22  ]        B12 is n1 by n2,
+          !> B22 is (m-n1) by n2,
+          !> with n1 = min(m,n)/2, n2 = n-n1.
+          !> The subroutine calls itself to factor B11, solves for B21,
+          !> solves for B12, updates B22, then calls itself to factor B22.
+          !> For more details on the recursive LU algorithm, see [2].
+          !> LAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+          !> routine DLAORHR_COL_GETRFNP, which uses blocked code calling
+          !> Level 3 BLAS to update the submatrix. However, LAORHR_COL_GETRFNP2
+          !> is self-sufficient and can be used without DLAORHR_COL_GETRFNP.
+          !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+          !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+          !> E. Solomonik, J. Parallel Distrib. Comput.,
+          !> vol. 85, pp. 3-31, 2015.
+          !> [2] "Recursion leads to automatic variable blocking for dense linear
+          !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+          !> vol. 41, no. 6, pp. 737-755, 1997.
+          interface laorhr_col_getrfnp2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine dlaorhr_col_getrfnp2( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*)
+               end subroutine dlaorhr_col_getrfnp2
+#else 
+               module procedure stdlib_dlaorhr_col_getrfnp2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaorhr_col_getrfnp2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine slaorhr_col_getrfnp2( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*)
+               end subroutine slaorhr_col_getrfnp2
+#else 
+               module procedure stdlib_slaorhr_col_getrfnp2
+#endif
+          end interface laorhr_col_getrfnp2
+
+
+
+          !> Given two column vectors X and Y, let
+          !> A = ( X Y ).
+          !> The subroutine first computes the QR factorization of A = Q*R,
+          !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+          !> The smaller singular value of R is returned in SSMIN, which is used
+          !> as the measurement of the linear dependency of the vectors X and Y.
+          interface lapll
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clapll( n, x, incx, y, incy, ssmin )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(out) :: ssmin
+                    complex(sp), intent(inout) :: x(*),y(*)
+               end subroutine clapll
+#else 
+               module procedure stdlib_clapll
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlapll( n, x, incx, y, incy, ssmin )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(out) :: ssmin
+                    real(dp), intent(inout) :: x(*),y(*)
+               end subroutine dlapll
+#else 
+               module procedure stdlib_dlapll
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlapll
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slapll( n, x, incx, y, incy, ssmin )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(out) :: ssmin
+                    real(sp), intent(inout) :: x(*),y(*)
+               end subroutine slapll
+#else 
+               module procedure stdlib_slapll
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlapll
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlapll( n, x, incx, y, incy, ssmin )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(out) :: ssmin
+                    complex(dp), intent(inout) :: x(*),y(*)
+               end subroutine zlapll
+#else 
+               module procedure stdlib_zlapll
+#endif
+          end interface lapll
+
+
+
+          !> LAPMR: rearranges the rows of the M by N matrix X as specified
+          !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+          !> If FORWRD = .TRUE.,  forward permutation:
+          !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+          !> If FORWRD = .FALSE., backward permutation:
+          !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+          interface lapmr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clapmr( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine clapmr
+#else 
+               module procedure stdlib_clapmr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlapmr( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dlapmr
+#else 
+               module procedure stdlib_dlapmr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlapmr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slapmr( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine slapmr
+#else 
+               module procedure stdlib_slapmr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlapmr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlapmr( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zlapmr
+#else 
+               module procedure stdlib_zlapmr
+#endif
+          end interface lapmr
+
+
+
+          !> LAPMT: rearranges the columns of the M by N matrix X as specified
+          !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+          !> If FORWRD = .TRUE.,  forward permutation:
+          !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+          !> If FORWRD = .FALSE., backward permutation:
+          !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+          interface lapmt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clapmt( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine clapmt
+#else 
+               module procedure stdlib_clapmt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlapmt( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dlapmt
+#else 
+               module procedure stdlib_dlapmt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlapmt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slapmt( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine slapmt
+#else 
+               module procedure stdlib_slapmt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlapmt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlapmt( forwrd, m, n, x, ldx, k )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: forwrd
+                    integer(ilp), intent(in) :: ldx,m,n
+                    integer(ilp), intent(inout) :: k(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zlapmt
+#else 
+               module procedure stdlib_zlapmt
+#endif
+          end interface lapmt
+
+
+
+          !> LAQGB: equilibrates a general M by N band matrix A with KL
+          !> subdiagonals and KU superdiagonals using the row and scaling factors
+          !> in the vectors R and C.
+          interface laqgb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine claqgb
+#else 
+               module procedure stdlib_claqgb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dlaqgb
+#else 
+               module procedure stdlib_dlaqgb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqgb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(sp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine slaqgb
+#else 
+               module procedure stdlib_slaqgb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqgb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: kl,ku,ldab,m,n
+                    real(dp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zlaqgb
+#else 
+               module procedure stdlib_zlaqgb
+#endif
+          end interface laqgb
+
+
+
+          !> LAQGE: equilibrates a general M by N matrix A using the row and
+          !> column scaling factors in the vectors R and C.
+          interface laqge
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine claqge
+#else 
+               module procedure stdlib_claqge
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dlaqge
+#else 
+               module procedure stdlib_dlaqge
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqge
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine slaqge
+#else 
+               module procedure stdlib_slaqge
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqge
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: amax,colcnd,rowcnd,c(*),r(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlaqge
+#else 
+               module procedure stdlib_zlaqge
+#endif
+          end interface laqge
+
+
+
+          !> LAQHB: equilibrates an Hermitian band matrix A using the scaling
+          !> factors in the vector S.
+          interface laqhb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(in) :: amax,scond
+                    real(sp), intent(out) :: s(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine claqhb
+#else 
+               module procedure stdlib_claqhb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqhb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(in) :: amax,scond
+                    real(dp), intent(out) :: s(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zlaqhb
+#else 
+               module procedure stdlib_zlaqhb
+#endif
+          end interface laqhb
+
+
+
+          !> LAQHE: equilibrates a Hermitian matrix A using the scaling factors
+          !> in the vector S.
+          interface laqhe
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqhe( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine claqhe
+#else 
+               module procedure stdlib_claqhe
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqhe
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqhe( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlaqhe
+#else 
+               module procedure stdlib_zlaqhe
+#endif
+          end interface laqhe
+
+
+
+          !> LAQHP: equilibrates a Hermitian matrix A using the scaling factors
+          !> in the vector S.
+          interface laqhp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqhp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine claqhp
+#else 
+               module procedure stdlib_claqhp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqhp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqhp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zlaqhp
+#else 
+               module procedure stdlib_zlaqhp
+#endif
+          end interface laqhp
+
+
+
+          !> LAQPS: computes a step of QR factorization with column pivoting
+          !> of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
+          !> NB columns from A starting from the row OFFSET+1, and updates all
+          !> of the matrix with Blas-3 xGEMM.
+          !> In some cases, due to catastrophic cancellations, it cannot
+          !> factorize NB columns.  Hence, the actual number of factorized
+          !> columns is returned in KB.
+          !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+          interface laqps
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+                         ldf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: kb
+                    integer(ilp), intent(in) :: lda,ldf,m,n,nb,offset
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(sp), intent(inout) :: vn1(*),vn2(*)
+                    complex(sp), intent(inout) :: a(lda,*),auxv(*),f(ldf,*)
+                    complex(sp), intent(out) :: tau(*)
+               end subroutine claqps
+#else 
+               module procedure stdlib_claqps
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+                         ldf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: kb
+                    integer(ilp), intent(in) :: lda,ldf,m,n,nb,offset
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(dp), intent(inout) :: a(lda,*),auxv(*),f(ldf,*),vn1(*),vn2(*)
+                    real(dp), intent(out) :: tau(*)
+               end subroutine dlaqps
+#else 
+               module procedure stdlib_dlaqps
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqps
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+                         ldf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: kb
+                    integer(ilp), intent(in) :: lda,ldf,m,n,nb,offset
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(sp), intent(inout) :: a(lda,*),auxv(*),f(ldf,*),vn1(*),vn2(*)
+                    real(sp), intent(out) :: tau(*)
+               end subroutine slaqps
+#else 
+               module procedure stdlib_slaqps
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqps
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+                         ldf )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: kb
+                    integer(ilp), intent(in) :: lda,ldf,m,n,nb,offset
+                    integer(ilp), intent(inout) :: jpvt(*)
+                    real(dp), intent(inout) :: vn1(*),vn2(*)
+                    complex(dp), intent(inout) :: a(lda,*),auxv(*),f(ldf,*)
+                    complex(dp), intent(out) :: tau(*)
+               end subroutine zlaqps
+#else 
+               module procedure stdlib_zlaqps
+#endif
+          end interface laqps
+
+
+
+          !> LAQR0: computes the eigenvalues of a Hessenberg matrix H
+          !> and, optionally, the matrices T and Z from the Schur decomposition
+          !> H = Z T Z**H, where T is an upper triangular matrix (the
+          !> Schur form), and Z is the unitary matrix of Schur vectors.
+          !> Optionally Z may be postmultiplied into an input unitary
+          !> matrix Q so that this routine can give the Schur factorization
+          !> of a matrix A which has been reduced to the Hessenberg form H
+          !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+          interface laqr0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(sp), intent(out) :: w(*),work(*)
+               end subroutine claqr0
+#else 
+               module procedure stdlib_claqr0
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(dp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine dlaqr0
+#else 
+               module procedure stdlib_dlaqr0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqr0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(sp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine slaqr0
+#else 
+               module procedure stdlib_slaqr0
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqr0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(dp), intent(out) :: w(*),work(*)
+               end subroutine zlaqr0
+#else 
+               module procedure stdlib_zlaqr0
+#endif
+          end interface laqr0
+
+
+
+          !> Given a 2-by-2 or 3-by-3 matrix H, LAQR1: sets v to a
+          !> scalar multiple of the first column of the product
+          !> (*)  K = (H - s1*I)*(H - s2*I)
+          !> scaling to avoid overflows and most underflows.
+          !> This is useful for starting double implicit shift bulges
+          !> in the QR algorithm.
+          interface laqr1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqr1( n, h, ldh, s1, s2, v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(sp), intent(in) :: s1,s2,h(ldh,*)
+                    integer(ilp), intent(in) :: ldh,n
+                    complex(sp), intent(out) :: v(*)
+               end subroutine claqr1
+#else 
+               module procedure stdlib_claqr1
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqr1( n, h, ldh, sr1, si1, sr2, si2, v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: si1,si2,sr1,sr2,h(ldh,*)
+                    integer(ilp), intent(in) :: ldh,n
+                    real(dp), intent(out) :: v(*)
+               end subroutine dlaqr1
+#else 
+               module procedure stdlib_dlaqr1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqr1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqr1( n, h, ldh, sr1, si1, sr2, si2, v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: si1,si2,sr1,sr2,h(ldh,*)
+                    integer(ilp), intent(in) :: ldh,n
+                    real(sp), intent(out) :: v(*)
+               end subroutine slaqr1
+#else 
+               module procedure stdlib_slaqr1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqr1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqr1( n, h, ldh, s1, s2, v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    complex(dp), intent(in) :: s1,s2,h(ldh,*)
+                    integer(ilp), intent(in) :: ldh,n
+                    complex(dp), intent(out) :: v(*)
+               end subroutine zlaqr1
+#else 
+               module procedure stdlib_zlaqr1
+#endif
+          end interface laqr1
+
+
+
+          !> LAQR4: implements one level of recursion for CLAQR0.
+          !> It is a complete implementation of the small bulge multi-shift
+          !> QR algorithm.  It may be called by CLAQR0 and, for large enough
+          !> deflation window size, it may be called by CLAQR3.  This
+          !> subroutine is identical to CLAQR0 except that it calls CLAQR2
+          !> instead of CLAQR3.
+          !> LAQR4 computes the eigenvalues of a Hessenberg matrix H
+          !> and, optionally, the matrices T and Z from the Schur decomposition
+          !> H = Z T Z**H, where T is an upper triangular matrix (the
+          !> Schur form), and Z is the unitary matrix of Schur vectors.
+          !> Optionally Z may be postmultiplied into an input unitary
+          !> matrix Q so that this routine can give the Schur factorization
+          !> of a matrix A which has been reduced to the Hessenberg form H
+          !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+          interface laqr4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqr4( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(sp), intent(out) :: w(*),work(*)
+               end subroutine claqr4
+#else 
+               module procedure stdlib_claqr4
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(dp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine dlaqr4
+#else 
+               module procedure stdlib_dlaqr4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqr4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slaqr4( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(sp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    real(sp), intent(out) :: wi(*),work(*),wr(*)
+               end subroutine slaqr4
+#else 
+               module procedure stdlib_slaqr4
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqr4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, &
+                         work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ihiz,ilo,iloz,ldh,ldz,lwork,n
+                    integer(ilp), intent(out) :: info
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(dp), intent(inout) :: h(ldh,*),z(ldz,*)
+                    complex(dp), intent(out) :: w(*),work(*)
+               end subroutine zlaqr4
+#else 
+               module procedure stdlib_zlaqr4
+#endif
+          end interface laqr4
+
+
+
+          !> LAQR5: called by CLAQR0 performs a
+          !> single small-bulge multi-shift QR sweep.
+          interface laqr5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts, s,h, ldh, &
+                         iloz, ihiz, z, ldz, v, ldv, u, ldu, nv,wv, ldwv, nh, wh, ldwh )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihiz,iloz,kacc22,kbot,ktop,ldh,ldu,ldv,ldwh,ldwv,&
+                              ldz,n,nh,nshfts,nv
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(sp), intent(inout) :: h(ldh,*),s(*),z(ldz,*)
+                    complex(sp), intent(out) :: u(ldu,*),v(ldv,*),wh(ldwh,*),wv(ldwv,*)
+               end subroutine claqr5
+#else 
+               module procedure stdlib_claqr5
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts,sr, si, h, ldh,&
+                          iloz, ihiz, z, ldz, v, ldv, u,ldu, nv, wv, ldwv, nh, wh, ldwh )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihiz,iloz,kacc22,kbot,ktop,ldh,ldu,ldv,ldwh,ldwv,&
+                              ldz,n,nh,nshfts,nv
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(dp), intent(inout) :: h(ldh,*),si(*),sr(*),z(ldz,*)
+                    real(dp), intent(out) :: u(ldu,*),v(ldv,*),wh(ldwh,*),wv(ldwv,*)
+               end subroutine dlaqr5
+#else 
+               module procedure stdlib_dlaqr5
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqr5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts,sr, si, h, ldh,&
+                          iloz, ihiz, z, ldz, v, ldv, u,ldu, nv, wv, ldwv, nh, wh, ldwh )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihiz,iloz,kacc22,kbot,ktop,ldh,ldu,ldv,ldwh,ldwv,&
+                              ldz,n,nh,nshfts,nv
+                    logical(lk), intent(in) :: wantt,wantz
+                    real(sp), intent(inout) :: h(ldh,*),si(*),sr(*),z(ldz,*)
+                    real(sp), intent(out) :: u(ldu,*),v(ldv,*),wh(ldwh,*),wv(ldwv,*)
+               end subroutine slaqr5
+#else 
+               module procedure stdlib_slaqr5
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqr5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts, s,h, ldh, &
+                         iloz, ihiz, z, ldz, v, ldv, u, ldu, nv,wv, ldwv, nh, wh, ldwh )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihiz,iloz,kacc22,kbot,ktop,ldh,ldu,ldv,ldwh,ldwv,&
+                              ldz,n,nh,nshfts,nv
+                    logical(lk), intent(in) :: wantt,wantz
+                    complex(dp), intent(inout) :: h(ldh,*),s(*),z(ldz,*)
+                    complex(dp), intent(out) :: u(ldu,*),v(ldv,*),wh(ldwh,*),wv(ldwv,*)
+               end subroutine zlaqr5
+#else 
+               module procedure stdlib_zlaqr5
+#endif
+          end interface laqr5
+
+
+
+          !> LAQSB: equilibrates a symmetric band matrix A using the scaling
+          !> factors in the vector S.
+          interface laqsb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine claqsb
+#else 
+               module procedure stdlib_claqsb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dlaqsb
+#else 
+               module procedure stdlib_dlaqsb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqsb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine slaqsb
+#else 
+               module procedure stdlib_slaqsb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqsb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zlaqsb
+#else 
+               module procedure stdlib_zlaqsb
+#endif
+          end interface laqsb
+
+
+
+          !> LAQSP: equilibrates a symmetric matrix A using the scaling factors
+          !> in the vector S.
+          interface laqsp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqsp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine claqsp
+#else 
+               module procedure stdlib_claqsp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqsp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dlaqsp
+#else 
+               module procedure stdlib_dlaqsp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqsp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqsp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine slaqsp
+#else 
+               module procedure stdlib_slaqsp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqsp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqsp( uplo, n, ap, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zlaqsp
+#else 
+               module procedure stdlib_zlaqsp
+#endif
+          end interface laqsp
+
+
+
+          !> LAQSY: equilibrates a symmetric matrix A using the scaling factors
+          !> in the vector S.
+          interface laqsy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqsy( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine claqsy
+#else 
+               module procedure stdlib_claqsy
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dlaqsy
+#else 
+               module procedure stdlib_dlaqsy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqsy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: amax,scond,s(*)
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine slaqsy
+#else 
+               module procedure stdlib_slaqsy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqsy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(out) :: equed
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: amax,scond,s(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlaqsy
+#else 
+               module procedure stdlib_zlaqsy
+#endif
+          end interface laqsy
+
+
+
+          !> LAQTR: solves the real quasi-triangular system
+          !> op(T)*p = scale*c,               if LREAL = .TRUE.
+          !> or the complex quasi-triangular systems
+          !> op(T + iB)*(p+iq) = scale*(c+id),  if LREAL = .FALSE.
+          !> in real arithmetic, where T is upper quasi-triangular.
+          !> If LREAL = .FALSE., then the first diagonal block of T must be
+          !> 1 by 1, B is the specially structured matrix
+          !> B = [ b(1) b(2) ... b(n) ]
+          !> [       w            ]
+          !> [           w        ]
+          !> [              .     ]
+          !> [                 w  ]
+          !> op(A) = A or A**T, A**T denotes the transpose of
+          !> matrix A.
+          !> On input, X = [ c ].  On output, X = [ p ].
+          !> [ d ]                  [ q ]
+          !> This subroutine is designed for the condition number estimation
+          !> in routine DTRSNA.
+          interface laqtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlaqtr( ltran, lreal, n, t, ldt, b, w, scale, x, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: lreal,ltran
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldt,n
+                    real(dp), intent(out) :: scale,work(*)
+                    real(dp), intent(in) :: w,b(*),t(ldt,*)
+                    real(dp), intent(inout) :: x(*)
+               end subroutine dlaqtr
+#else 
+               module procedure stdlib_dlaqtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slaqtr( ltran, lreal, n, t, ldt, b, w, scale, x, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: lreal,ltran
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldt,n
+                    real(sp), intent(out) :: scale,work(*)
+                    real(sp), intent(in) :: w,b(*),t(ldt,*)
+                    real(sp), intent(inout) :: x(*)
+               end subroutine slaqtr
+#else 
+               module procedure stdlib_slaqtr
+#endif
+          end interface laqtr
+
+
+
+          !> LAQZ0: computes the eigenvalues of a matrix pair (H,T),
+          !> where H is an upper Hessenberg matrix and T is upper triangular,
+          !> using the double-shift QZ method.
+          !> Matrix pairs of this type are produced by the reduction to
+          !> generalized upper Hessenberg form of a matrix pair (A,B):
+          !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+          !> as computed by CGGHRD.
+          !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+          !> also reduced to generalized Schur form,
+          !> H = Q*S*Z**H,  T = Q*P*Z**H,
+          !> where Q and Z are unitary matrices, P and S are an upper triangular
+          !> matrices.
+          !> Optionally, the unitary matrix Q from the generalized Schur
+          !> factorization may be postmultiplied into an input matrix Q1, and the
+          !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+          !> If Q1 and Z1 are the unitary matrices from CGGHRD that reduced
+          !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+          !> output matrices Q1*Q and Z1*Z are the unitary factors from the
+          !> generalized Schur factorization of (A,B):
+          !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+          !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+          !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+          !> complex and beta real.
+          !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+          !> generalized nonsymmetric eigenvalue problem (GNEP)
+          !> A*x = lambda*B*x
+          !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+          !> alternate form of the GNEP
+          !> mu*A*y = B*y.
+          !> Eigenvalues can be read directly from the generalized Schur
+          !> form:
+          !> alpha = S(i,i), beta = P(i,i).
+          !> Ref: C.B. Moler
+          !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+          !> pp. 241--256.
+          !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+          !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+          !> Anal., 29(2006), pp. 199--227.
+          !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+          !> multipole rational QZ method with agressive early deflation"
+          interface laqz0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine claqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, &
+                         alpha, beta, q, ldq, z,ldz, work, lwork, rwork, rec,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: wants,wantq,wantz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,rec
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    complex(sp), intent(inout) :: alpha(*),beta(*),work(*)
+                    real(sp), intent(out) :: rwork(*)
+               end subroutine claqz0
+#else 
+               module procedure stdlib_claqz0
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine dlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, &
+                         alphar, alphai, beta,q, ldq, z, ldz, work, lwork, rec,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: wants,wantq,wantz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,rec
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    real(dp), intent(inout) :: alphar(*),alphai(*),beta(*),work(*)
+               end subroutine dlaqz0
+#else 
+               module procedure stdlib_dlaqz0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqz0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine slaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, &
+                         alphar, alphai, beta,q, ldq, z, ldz, work, lwork, rec,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: wants,wantq,wantz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,rec
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    real(sp), intent(inout) :: alphar(*),alphai(*),beta(*),work(*)
+               end subroutine slaqz0
+#else 
+               module procedure stdlib_slaqz0
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqz0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine zlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, &
+                         alpha, beta, q, ldq, z,ldz, work, lwork, rwork, rec,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: wants,wantq,wantz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,rec
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    complex(dp), intent(inout) :: alpha(*),beta(*),work(*)
+                    real(dp), intent(out) :: rwork(*)
+               end subroutine zlaqz0
+#else 
+               module procedure stdlib_zlaqz0
+#endif
+          end interface laqz0
+
+
+
+          !> LAQZ1: chases a 1x1 shift bulge in a matrix pencil down a single position
+          interface laqz1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claqz1( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, &
+                         qstart, q, ldq, nz, zstart, z, ldz )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ilq,ilz
+                    integer(ilp), intent(in) :: k,lda,ldb,ldq,ldz,istartm,istopm,nq,nz,qstart,&
+                              zstart,ihi
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine claqz1
+#else 
+               module procedure stdlib_claqz1
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqz1( a, lda, b, ldb, sr1, sr2, si, beta1, beta2,v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb
+                    real(dp), intent(in) :: a(lda,*),b(ldb,*),sr1,sr2,si,beta1,beta2
+                    real(dp), intent(out) :: v(*)
+               end subroutine dlaqz1
+#else 
+               module procedure stdlib_dlaqz1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqz1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqz1( a, lda, b, ldb, sr1, sr2, si, beta1, beta2,v )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb
+                    real(sp), intent(in) :: a(lda,*),b(ldb,*),sr1,sr2,si,beta1,beta2
+                    real(sp), intent(out) :: v(*)
+               end subroutine slaqz1
+#else 
+               module procedure stdlib_slaqz1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaqz1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaqz1( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, &
+                         qstart, q, ldq, nz, zstart, z, ldz )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ilq,ilz
+                    integer(ilp), intent(in) :: k,lda,ldb,ldq,ldz,istartm,istopm,nq,nz,qstart,&
+                              zstart,ihi
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine zlaqz1
+#else 
+               module procedure stdlib_zlaqz1
+#endif
+          end interface laqz1
+
+
+
+          !> LAQZ4: Executes a single multishift QZ sweep
+          interface laqz4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaqz4( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, sr,&
+                si, ss, a, lda, b, ldb, q,ldq, z, ldz, qc, ldqc, zc, ldzc, work, lwork,info )
+                          
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ilschur,ilq,ilz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,nshifts,&
+                              nblock_desired,ldqc,ldzc
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*),qc(ldqc,*),zc(&
+                              ldzc,*)
+                    real(dp), intent(inout) :: work(*)
+                    real(dp), intent(inout) :: sr(*),si(*),ss(*)
+                    integer(ilp), intent(out) :: info
+               end subroutine dlaqz4
+#else 
+               module procedure stdlib_dlaqz4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaqz4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaqz4( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, sr,&
+                si, ss, a, lda, b, ldb, q,ldq, z, ldz, qc, ldqc, zc, ldzc, work, lwork,info )
+                          
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ilschur,ilq,ilz
+                    integer(ilp), intent(in) :: n,ilo,ihi,lda,ldb,ldq,ldz,lwork,nshifts,&
+                              nblock_desired,ldqc,ldzc
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*),qc(ldqc,*),zc(&
+                              ldzc,*)
+                    real(sp), intent(inout) :: work(*)
+                    real(sp), intent(inout) :: sr(*),si(*),ss(*)
+                    integer(ilp), intent(out) :: info
+               end subroutine slaqz4
+#else 
+               module procedure stdlib_slaqz4
+#endif
+          end interface laqz4
+
+
+
+          !> LAR1V: computes the (scaled) r-th column of the inverse of
+          !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+          !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+          !> computed vector is an accurate eigenvector. Usually, r corresponds
+          !> to the index where the eigenvector is largest in magnitude.
+          !> The following steps accomplish this computation :
+          !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+          !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+          !> (c) Computation of the diagonal elements of the inverse of
+          !> L D L**T - sigma I by combining the above transforms, and choosing
+          !> r as the index where the diagonal of the inverse is (one of the)
+          !> largest in magnitude.
+          !> (d) Computation of the (scaled) r-th column of the inverse using the
+          !> twisted factorization obtained by combining the top part of the
+          !> the stationary and the bottom part of the progressive transform.
+          interface lar1v
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc,&
+                          negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantnc
+                    integer(ilp), intent(in) :: b1,bn,n
+                    integer(ilp), intent(out) :: negcnt,isuppz(*)
+                    integer(ilp), intent(inout) :: r
+                    real(sp), intent(in) :: gaptol,lambda,pivmin,d(*),l(*),ld(*),lld(*)
+                    real(sp), intent(out) :: mingma,nrminv,resid,rqcorr,ztz,work(*)
+                    complex(sp), intent(inout) :: z(*)
+               end subroutine clar1v
+#else 
+               module procedure stdlib_clar1v
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc,&
+                          negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantnc
+                    integer(ilp), intent(in) :: b1,bn,n
+                    integer(ilp), intent(out) :: negcnt,isuppz(*)
+                    integer(ilp), intent(inout) :: r
+                    real(dp), intent(in) :: gaptol,lambda,pivmin,d(*),l(*),ld(*),lld(*)
+                    real(dp), intent(out) :: mingma,nrminv,resid,rqcorr,ztz,work(*)
+                    real(dp), intent(inout) :: z(*)
+               end subroutine dlar1v
+#else 
+               module procedure stdlib_dlar1v
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlar1v
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc,&
+                          negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantnc
+                    integer(ilp), intent(in) :: b1,bn,n
+                    integer(ilp), intent(out) :: negcnt,isuppz(*)
+                    integer(ilp), intent(inout) :: r
+                    real(sp), intent(in) :: gaptol,lambda,pivmin,d(*),l(*),ld(*),lld(*)
+                    real(sp), intent(out) :: mingma,nrminv,resid,rqcorr,ztz,work(*)
+                    real(sp), intent(inout) :: z(*)
+               end subroutine slar1v
+#else 
+               module procedure stdlib_slar1v
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlar1v
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc,&
+                          negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantnc
+                    integer(ilp), intent(in) :: b1,bn,n
+                    integer(ilp), intent(out) :: negcnt,isuppz(*)
+                    integer(ilp), intent(inout) :: r
+                    real(dp), intent(in) :: gaptol,lambda,pivmin,d(*),l(*),ld(*),lld(*)
+                    real(dp), intent(out) :: mingma,nrminv,resid,rqcorr,ztz,work(*)
+                    complex(dp), intent(inout) :: z(*)
+               end subroutine zlar1v
+#else 
+               module procedure stdlib_zlar1v
+#endif
+          end interface lar1v
+
+
+
+          !> LAR2V: applies a vector of complex plane rotations with real cosines
+          !> from both sides to a sequence of 2-by-2 complex Hermitian matrices,
+          !> defined by the elements of the vectors x, y and z. For i = 1,2,...,n
+          !> (       x(i)  z(i) ) :=
+          !> ( conjg(z(i)) y(i) )
+          !> (  c(i) conjg(s(i)) ) (       x(i)  z(i) ) ( c(i) -conjg(s(i)) )
+          !> ( -s(i)       c(i)  ) ( conjg(z(i)) y(i) ) ( s(i)        c(i)  )
+          interface lar2v
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clar2v( n, x, y, z, incx, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,n
+                    real(sp), intent(in) :: c(*)
+                    complex(sp), intent(in) :: s(*)
+                    complex(sp), intent(inout) :: x(*),y(*),z(*)
+               end subroutine clar2v
+#else 
+               module procedure stdlib_clar2v
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlar2v( n, x, y, z, incx, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,n
+                    real(dp), intent(in) :: c(*),s(*)
+                    real(dp), intent(inout) :: x(*),y(*),z(*)
+               end subroutine dlar2v
+#else 
+               module procedure stdlib_dlar2v
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlar2v
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slar2v( n, x, y, z, incx, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,n
+                    real(sp), intent(in) :: c(*),s(*)
+                    real(sp), intent(inout) :: x(*),y(*),z(*)
+               end subroutine slar2v
+#else 
+               module procedure stdlib_slar2v
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlar2v
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlar2v( n, x, y, z, incx, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,n
+                    real(dp), intent(in) :: c(*)
+                    complex(dp), intent(in) :: s(*)
+                    complex(dp), intent(inout) :: x(*),y(*),z(*)
+               end subroutine zlar2v
+#else 
+               module procedure stdlib_zlar2v
+#endif
+          end interface lar2v
+
+
+
+          !> LARCM: performs a very simple matrix-matrix multiplication:
+          !> C := A * B,
+          !> where A is M by M and real; B is M by N and complex;
+          !> C is M by N and complex.
+          interface larcm
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarcm( m, n, a, lda, b, ldb, c, ldc, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(in) :: b(ldb,*)
+                    complex(sp), intent(out) :: c(ldc,*)
+               end subroutine clarcm
+#else 
+               module procedure stdlib_clarcm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarcm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarcm( m, n, a, lda, b, ldb, c, ldc, rwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: lda,ldb,ldc,m,n
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(in) :: b(ldb,*)
+                    complex(dp), intent(out) :: c(ldc,*)
+               end subroutine zlarcm
+#else 
+               module procedure stdlib_zlarcm
+#endif
+          end interface larcm
+
+
+
+          !> LARF: applies a complex elementary reflector H to a complex M-by-N
+          !> matrix C, from either the left or the right. H is represented in the
+          !> form
+          !> H = I - tau * v * v**H
+          !> where tau is a complex scalar and v is a complex vector.
+          !> If tau = 0, then H is taken to be the unit matrix.
+          !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+          !> tau.
+          interface larf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarf( side, m, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,ldc,m,n
+                    complex(sp), intent(in) :: tau,v(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clarf
+#else 
+               module procedure stdlib_clarf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarf( side, m, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,ldc,m,n
+                    real(dp), intent(in) :: tau,v(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlarf
+#else 
+               module procedure stdlib_dlarf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarf( side, m, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,ldc,m,n
+                    real(sp), intent(in) :: tau,v(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slarf
+#else 
+               module procedure stdlib_slarf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarf( side, m, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,ldc,m,n
+                    complex(dp), intent(in) :: tau,v(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlarf
+#else 
+               module procedure stdlib_zlarf
+#endif
+          end interface larf
+
+
+
+          !> LARFB: applies a complex block reflector H or its transpose H**H to a
+          !> complex M-by-N matrix C, from either the left or the right.
+          interface larfb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,ldc,ldt,ldv,ldwork,m,n
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(in) :: t(ldt,*),v(ldv,*)
+                    complex(sp), intent(out) :: work(ldwork,*)
+               end subroutine clarfb
+#else 
+               module procedure stdlib_clarfb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,ldc,ldt,ldv,ldwork,m,n
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(in) :: t(ldt,*),v(ldv,*)
+                    real(dp), intent(out) :: work(ldwork,*)
+               end subroutine dlarfb
+#else 
+               module procedure stdlib_dlarfb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarfb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,ldc,ldt,ldv,ldwork,m,n
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(in) :: t(ldt,*),v(ldv,*)
+                    real(sp), intent(out) :: work(ldwork,*)
+               end subroutine slarfb
+#else 
+               module procedure stdlib_slarfb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarfb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,ldc,ldt,ldv,ldwork,m,n
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(in) :: t(ldt,*),v(ldv,*)
+                    complex(dp), intent(out) :: work(ldwork,*)
+               end subroutine zlarfb
+#else 
+               module procedure stdlib_zlarfb
+#endif
+          end interface larfb
+
+
+
+          !> LARFB_GETT: applies a complex Householder block reflector H from the
+          !> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
+          !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+          !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+          !> in the array B. The block reflector H is stored in a compact
+          !> WY-representation, where the elementary reflectors are in the
+          !> arrays A, B and T. See Further Details section.
+          interface larfb_gett
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: ident
+                    integer(ilp), intent(in) :: k,lda,ldb,ldt,ldwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(in) :: t(ldt,*)
+                    complex(sp), intent(out) :: work(ldwork,*)
+               end subroutine clarfb_gett
+#else 
+               module procedure stdlib_clarfb_gett
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: ident
+                    integer(ilp), intent(in) :: k,lda,ldb,ldt,ldwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(in) :: t(ldt,*)
+                    real(dp), intent(out) :: work(ldwork,*)
+               end subroutine dlarfb_gett
+#else 
+               module procedure stdlib_dlarfb_gett
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarfb_gett
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: ident
+                    integer(ilp), intent(in) :: k,lda,ldb,ldt,ldwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(in) :: t(ldt,*)
+                    real(sp), intent(out) :: work(ldwork,*)
+               end subroutine slarfb_gett
+#else 
+               module procedure stdlib_slarfb_gett
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarfb_gett
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: ident
+                    integer(ilp), intent(in) :: k,lda,ldb,ldt,ldwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(in) :: t(ldt,*)
+                    complex(dp), intent(out) :: work(ldwork,*)
+               end subroutine zlarfb_gett
+#else 
+               module procedure stdlib_zlarfb_gett
+#endif
+          end interface larfb_gett
+
+
+
+          !> LARFG: generates a complex elementary reflector H of order n, such
+          !> that
+          !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+          !> (   x   )   (   0  )
+          !> where alpha and beta are scalars, with beta real, and x is an
+          !> (n-1)-element complex vector. H is represented in the form
+          !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+          !> ( v )
+          !> where tau is a complex scalar and v is a complex (n-1)-element
+          !> vector. Note that H is not hermitian.
+          !> If the elements of x are all zero and alpha is real, then tau = 0
+          !> and H is taken to be the unit matrix.
+          !> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+          interface larfg
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarfg( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(inout) :: alpha,x(*)
+                    complex(sp), intent(out) :: tau
+               end subroutine clarfg
+#else 
+               module procedure stdlib_clarfg
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarfg( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(inout) :: alpha,x(*)
+                    real(dp), intent(out) :: tau
+               end subroutine dlarfg
+#else 
+               module procedure stdlib_dlarfg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarfg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarfg( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(inout) :: alpha,x(*)
+                    real(sp), intent(out) :: tau
+               end subroutine slarfg
+#else 
+               module procedure stdlib_slarfg
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarfg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarfg( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(dp), intent(inout) :: alpha,x(*)
+                    complex(dp), intent(out) :: tau
+               end subroutine zlarfg
+#else 
+               module procedure stdlib_zlarfg
+#endif
+          end interface larfg
+
+
+
+          !> LARFGP: generates a complex elementary reflector H of order n, such
+          !> that
+          !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+          !> (   x   )   (   0  )
+          !> where alpha and beta are scalars, beta is real and non-negative, and
+          !> x is an (n-1)-element complex vector.  H is represented in the form
+          !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+          !> ( v )
+          !> where tau is a complex scalar and v is a complex (n-1)-element
+          !> vector. Note that H is not hermitian.
+          !> If the elements of x are all zero and alpha is real, then tau = 0
+          !> and H is taken to be the unit matrix.
+          interface larfgp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine clarfgp( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(inout) :: alpha,x(*)
+                    complex(sp), intent(out) :: tau
+               end subroutine clarfgp
+#else 
+               module procedure stdlib_clarfgp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dlarfgp( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(inout) :: alpha,x(*)
+                    real(dp), intent(out) :: tau
+               end subroutine dlarfgp
+#else 
+               module procedure stdlib_dlarfgp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarfgp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine slarfgp( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(inout) :: alpha,x(*)
+                    real(sp), intent(out) :: tau
+               end subroutine slarfgp
+#else 
+               module procedure stdlib_slarfgp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarfgp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zlarfgp( n, alpha, x, incx, tau )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    complex(dp), intent(inout) :: alpha,x(*)
+                    complex(dp), intent(out) :: tau
+               end subroutine zlarfgp
+#else 
+               module procedure stdlib_zlarfgp
+#endif
+          end interface larfgp
+
+
+
+          !> LARFT: forms the triangular factor T of a complex block reflector H
+          !> of order n, which is defined as a product of k elementary reflectors.
+          !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+          !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+          !> If STOREV = 'C', the vector which defines the elementary reflector
+          !> H(i) is stored in the i-th column of the array V, and
+          !> H  =  I - V * T * V**H
+          !> If STOREV = 'R', the vector which defines the elementary reflector
+          !> H(i) is stored in the i-th row of the array V, and
+          !> H  =  I - V**H * T * V
+          interface larft
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    complex(sp), intent(out) :: t(ldt,*)
+                    complex(sp), intent(in) :: tau(*),v(ldv,*)
+               end subroutine clarft
+#else 
+               module procedure stdlib_clarft
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    real(dp), intent(out) :: t(ldt,*)
+                    real(dp), intent(in) :: tau(*),v(ldv,*)
+               end subroutine dlarft
+#else 
+               module procedure stdlib_dlarft
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarft
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    real(sp), intent(out) :: t(ldt,*)
+                    real(sp), intent(in) :: tau(*),v(ldv,*)
+               end subroutine slarft
+#else 
+               module procedure stdlib_slarft
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarft
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    complex(dp), intent(out) :: t(ldt,*)
+                    complex(dp), intent(in) :: tau(*),v(ldv,*)
+               end subroutine zlarft
+#else 
+               module procedure stdlib_zlarft
+#endif
+          end interface larft
+
+
+
+          !> LARFY: applies an elementary reflector, or Householder matrix, H,
+          !> to an n x n Hermitian matrix C, from both the left and the right.
+          !> H is represented in the form
+          !> H = I - tau * v * v'
+          !> where  tau  is a scalar and  v  is a vector.
+          !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+          interface larfy
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarfy( uplo, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incv,ldc,n
+                    complex(sp), intent(in) :: tau,v(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clarfy
+#else 
+               module procedure stdlib_clarfy
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarfy( uplo, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incv,ldc,n
+                    real(dp), intent(in) :: tau,v(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlarfy
+#else 
+               module procedure stdlib_dlarfy
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarfy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarfy( uplo, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incv,ldc,n
+                    real(sp), intent(in) :: tau,v(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slarfy
+#else 
+               module procedure stdlib_slarfy
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarfy
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarfy( uplo, n, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incv,ldc,n
+                    complex(dp), intent(in) :: tau,v(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlarfy
+#else 
+               module procedure stdlib_zlarfy
+#endif
+          end interface larfy
+
+
+
+          !> LARGV: generates a vector of complex plane rotations with real
+          !> cosines, determined by elements of the complex vectors x and y.
+          !> For i = 1,2,...,n
+          !> (        c(i)   s(i) ) ( x(i) ) = ( r(i) )
+          !> ( -conjg(s(i))  c(i) ) ( y(i) ) = (   0  )
+          !> where c(i)**2 + ABS(s(i))**2 = 1
+          !> The following conventions are used (these are the same as in CLARTG,
+          !> but differ from the BLAS1 routine CROTG):
+          !> If y(i)=0, then c(i)=1 and s(i)=0.
+          !> If x(i)=0, then c(i)=0 and s(i) is chosen so that r(i) is real.
+          interface largv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clargv( n, x, incx, y, incy, c, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(sp), intent(out) :: c(*)
+                    complex(sp), intent(inout) :: x(*),y(*)
+               end subroutine clargv
+#else 
+               module procedure stdlib_clargv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlargv( n, x, incx, y, incy, c, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(dp), intent(out) :: c(*)
+                    real(dp), intent(inout) :: x(*),y(*)
+               end subroutine dlargv
+#else 
+               module procedure stdlib_dlargv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlargv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slargv( n, x, incx, y, incy, c, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(sp), intent(out) :: c(*)
+                    real(sp), intent(inout) :: x(*),y(*)
+               end subroutine slargv
+#else 
+               module procedure stdlib_slargv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlargv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlargv( n, x, incx, y, incy, c, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(dp), intent(out) :: c(*)
+                    complex(dp), intent(inout) :: x(*),y(*)
+               end subroutine zlargv
+#else 
+               module procedure stdlib_zlargv
+#endif
+          end interface largv
+
+
+
+          !> LARNV: returns a vector of n random complex numbers from a uniform or
+          !> normal distribution.
+          interface larnv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarnv( idist, iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: idist,n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    complex(sp), intent(out) :: x(*)
+               end subroutine clarnv
+#else 
+               module procedure stdlib_clarnv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarnv( idist, iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: idist,n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    real(dp), intent(out) :: x(*)
+               end subroutine dlarnv
+#else 
+               module procedure stdlib_dlarnv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarnv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarnv( idist, iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: idist,n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    real(sp), intent(out) :: x(*)
+               end subroutine slarnv
+#else 
+               module procedure stdlib_slarnv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarnv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarnv( idist, iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: idist,n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    complex(dp), intent(out) :: x(*)
+               end subroutine zlarnv
+#else 
+               module procedure stdlib_zlarnv
+#endif
+          end interface larnv
+
+
+
+          !> Compute the splitting points with threshold SPLTOL.
+          !> LARRA: sets any "small" off-diagonal elements to zero.
+          interface larra
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarra( n, d, e, e2, spltol, tnrm,nsplit, isplit, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,nsplit,isplit(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: spltol,tnrm,d(*)
+                    real(dp), intent(inout) :: e(*),e2(*)
+               end subroutine dlarra
+#else 
+               module procedure stdlib_dlarra
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarra
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarra( n, d, e, e2, spltol, tnrm,nsplit, isplit, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,nsplit,isplit(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: spltol,tnrm,d(*)
+                    real(sp), intent(inout) :: e(*),e2(*)
+               end subroutine slarra
+#else 
+               module procedure stdlib_slarra
+#endif
+          end interface larra
+
+
+
+          !> Given the relatively robust representation(RRR) L D L^T, LARRB:
+          !> does "limited" bisection to refine the eigenvalues of L D L^T,
+          !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+          !> guesses for these eigenvalues are input in W, the corresponding estimate
+          !> of the error in these guesses and their gaps are input in WERR
+          !> and WGAP, respectively. During bisection, intervals
+          !> [left, right] are maintained by storing their mid-points and
+          !> semi-widths in the arrays W and WERR respectively.
+          interface larrb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrb( n, d, lld, ifirst, ilast, rtol1,rtol2, offset, w, wgap, &
+                         werr, work, iwork,pivmin, spdiam, twist, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ifirst,ilast,n,offset,twist
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: pivmin,rtol1,rtol2,spdiam,d(*),lld(*)
+                    real(dp), intent(inout) :: w(*),werr(*),wgap(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlarrb
+#else 
+               module procedure stdlib_dlarrb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrb( n, d, lld, ifirst, ilast, rtol1,rtol2, offset, w, wgap, &
+                         werr, work, iwork,pivmin, spdiam, twist, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ifirst,ilast,n,offset,twist
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: pivmin,rtol1,rtol2,spdiam,d(*),lld(*)
+                    real(sp), intent(inout) :: w(*),werr(*),wgap(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slarrb
+#else 
+               module procedure stdlib_slarrb
+#endif
+          end interface larrb
+
+
+
+          !> Find the number of eigenvalues of the symmetric tridiagonal matrix T
+          !> that are in the interval (VL,VU] if JOBT = 'T', and of L D L^T
+          !> if JOBT = 'L'.
+          interface larrc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrc( jobt, n, vl, vu, d, e, pivmin,eigcnt, lcnt, rcnt, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobt
+                    integer(ilp), intent(out) :: eigcnt,info,lcnt,rcnt
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: pivmin,vl,vu,d(*),e(*)
+               end subroutine dlarrc
+#else 
+               module procedure stdlib_dlarrc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrc( jobt, n, vl, vu, d, e, pivmin,eigcnt, lcnt, rcnt, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobt
+                    integer(ilp), intent(out) :: eigcnt,info,lcnt,rcnt
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: pivmin,vl,vu,d(*),e(*)
+               end subroutine slarrc
+#else 
+               module procedure stdlib_slarrc
+#endif
+          end interface larrc
+
+
+
+          !> LARRD: computes the eigenvalues of a symmetric tridiagonal
+          !> matrix T to suitable accuracy. This is an auxiliary code to be
+          !> called from DSTEMR.
+          !> The user may ask for all eigenvalues, all eigenvalues
+          !> in the half-open interval (VL, VU], or the IL-th through IU-th
+          !> eigenvalues.
+          !> To avoid overflow, the matrix must be scaled so that its
+          !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+          !> accuracy, it should not be much smaller than that.
+          !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+          !> Matrix", Report CS41, Computer Science Dept., Stanford
+          !> University, July 21, 1966.
+          interface larrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrd( range, order, n, vl, vu, il, iu, gers,reltol, d, e, e2, &
+               pivmin, nsplit, isplit,m, w, werr, wl, wu, iblock, indexw,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: order,range
+                    integer(ilp), intent(in) :: il,iu,n,nsplit,isplit(*)
+                    integer(ilp), intent(out) :: info,m,iblock(*),indexw(*),iwork(*)
+                    real(dp), intent(in) :: pivmin,reltol,vl,vu,d(*),e(*),e2(*),gers(*)
+                    real(dp), intent(out) :: wl,wu,w(*),werr(*),work(*)
+               end subroutine dlarrd
+#else 
+               module procedure stdlib_dlarrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrd( range, order, n, vl, vu, il, iu, gers,reltol, d, e, e2, &
+               pivmin, nsplit, isplit,m, w, werr, wl, wu, iblock, indexw,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: order,range
+                    integer(ilp), intent(in) :: il,iu,n,nsplit,isplit(*)
+                    integer(ilp), intent(out) :: info,m,iblock(*),indexw(*),iwork(*)
+                    real(sp), intent(in) :: pivmin,reltol,vl,vu,d(*),e(*),e2(*),gers(*)
+                    real(sp), intent(out) :: wl,wu,w(*),werr(*),work(*)
+               end subroutine slarrd
+#else 
+               module procedure stdlib_slarrd
+#endif
+          end interface larrd
+
+
+
+          !> To find the desired eigenvalues of a given real symmetric
+          !> tridiagonal matrix T, LARRE: sets any "small" off-diagonal
+          !> elements to zero, and for each unreduced block T_i, it finds
+          !> (a) a suitable shift at one end of the block's spectrum,
+          !> (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and
+          !> (c) eigenvalues of each L_i D_i L_i^T.
+          !> The representations and eigenvalues found are then used by
+          !> DSTEMR to compute the eigenvectors of T.
+          !> The accuracy varies depending on whether bisection is used to
+          !> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to
+          !> conpute all and then discard any unwanted one.
+          !> As an added benefit, LARRE also outputs the n
+          !> Gerschgorin intervals for the matrices L_i D_i L_i^T.
+          interface larre
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarre( range, n, vl, vu, il, iu, d, e, e2,rtol1, rtol2, spltol, &
+               nsplit, isplit, m,w, werr, wgap, iblock, indexw, gers, pivmin,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: range
+                    integer(ilp), intent(in) :: il,iu,n
+                    integer(ilp), intent(out) :: info,m,nsplit,iblock(*),isplit(*),iwork(*),&
+                              indexw(*)
+                    real(dp), intent(out) :: pivmin,gers(*),w(*),werr(*),wgap(*),work(*)
+                    real(dp), intent(in) :: rtol1,rtol2,spltol
+                    real(dp), intent(inout) :: vl,vu,d(*),e(*),e2(*)
+               end subroutine dlarre
+#else 
+               module procedure stdlib_dlarre
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarre
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarre( range, n, vl, vu, il, iu, d, e, e2,rtol1, rtol2, spltol, &
+               nsplit, isplit, m,w, werr, wgap, iblock, indexw, gers, pivmin,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: range
+                    integer(ilp), intent(in) :: il,iu,n
+                    integer(ilp), intent(out) :: info,m,nsplit,iblock(*),isplit(*),iwork(*),&
+                              indexw(*)
+                    real(sp), intent(out) :: pivmin,gers(*),w(*),werr(*),wgap(*),work(*)
+                    real(sp), intent(in) :: rtol1,rtol2,spltol
+                    real(sp), intent(inout) :: vl,vu,d(*),e(*),e2(*)
+               end subroutine slarre
+#else 
+               module procedure stdlib_slarre
+#endif
+          end interface larre
+
+
+
+          !> Given the initial representation L D L^T and its cluster of close
+          !> eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ...
+          !> W( CLEND ), LARRF: finds a new relatively robust representation
+          !> L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the
+          !> eigenvalues of L(+) D(+) L(+)^T is relatively isolated.
+          interface larrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrf( n, d, l, ld, clstrt, clend,w, wgap, werr,spdiam, clgapl, &
+                         clgapr, pivmin, sigma,dplus, lplus, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: clstrt,clend,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: clgapl,clgapr,pivmin,spdiam,d(*),l(*),ld(*),w(*),werr(&
+                              *)
+                    real(dp), intent(out) :: sigma,dplus(*),lplus(*),work(*)
+                    real(dp), intent(inout) :: wgap(*)
+               end subroutine dlarrf
+#else 
+               module procedure stdlib_dlarrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrf( n, d, l, ld, clstrt, clend,w, wgap, werr,spdiam, clgapl, &
+                         clgapr, pivmin, sigma,dplus, lplus, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: clstrt,clend,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: clgapl,clgapr,pivmin,spdiam,d(*),l(*),ld(*),w(*),werr(&
+                              *)
+                    real(sp), intent(out) :: sigma,dplus(*),lplus(*),work(*)
+                    real(sp), intent(inout) :: wgap(*)
+               end subroutine slarrf
+#else 
+               module procedure stdlib_slarrf
+#endif
+          end interface larrf
+
+
+
+          !> Given the initial eigenvalue approximations of T, LARRJ:
+          !> does  bisection to refine the eigenvalues of T,
+          !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+          !> guesses for these eigenvalues are input in W, the corresponding estimate
+          !> of the error in these guesses in WERR. During bisection, intervals
+          !> [left, right] are maintained by storing their mid-points and
+          !> semi-widths in the arrays W and WERR respectively.
+          interface larrj
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrj( n, d, e2, ifirst, ilast,rtol, offset, w, werr, work, iwork,&
+                         pivmin, spdiam, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ifirst,ilast,n,offset
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(in) :: pivmin,rtol,spdiam,d(*),e2(*)
+                    real(dp), intent(inout) :: w(*),werr(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlarrj
+#else 
+               module procedure stdlib_dlarrj
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrj
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrj( n, d, e2, ifirst, ilast,rtol, offset, w, werr, work, iwork,&
+                         pivmin, spdiam, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ifirst,ilast,n,offset
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(in) :: pivmin,rtol,spdiam,d(*),e2(*)
+                    real(sp), intent(inout) :: w(*),werr(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slarrj
+#else 
+               module procedure stdlib_slarrj
+#endif
+          end interface larrj
+
+
+
+          !> LARRK: computes one eigenvalue of a symmetric tridiagonal
+          !> matrix T to suitable accuracy. This is an auxiliary code to be
+          !> called from DSTEMR.
+          !> To avoid overflow, the matrix must be scaled so that its
+          !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+          !> accuracy, it should not be much smaller than that.
+          !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+          !> Matrix", Report CS41, Computer Science Dept., Stanford
+          !> University, July 21, 1966.
+          interface larrk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrk( n, iw, gl, gu,d, e2, pivmin, reltol, w, werr, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: iw,n
+                    real(dp), intent(in) :: pivmin,reltol,gl,gu,d(*),e2(*)
+                    real(dp), intent(out) :: w,werr
+               end subroutine dlarrk
+#else 
+               module procedure stdlib_dlarrk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrk( n, iw, gl, gu,d, e2, pivmin, reltol, w, werr, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: iw,n
+                    real(sp), intent(in) :: pivmin,reltol,gl,gu,d(*),e2(*)
+                    real(sp), intent(out) :: w,werr
+               end subroutine slarrk
+#else 
+               module procedure stdlib_slarrk
+#endif
+          end interface larrk
+
+
+
+          !> Perform tests to decide whether the symmetric tridiagonal matrix T
+          !> warrants expensive computations which guarantee high relative accuracy
+          !> in the eigenvalues.
+          interface larrr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrr( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: d(*)
+                    real(dp), intent(inout) :: e(*)
+               end subroutine dlarrr
+#else 
+               module procedure stdlib_dlarrr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrr( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: d(*)
+                    real(sp), intent(inout) :: e(*)
+               end subroutine slarrr
+#else 
+               module procedure stdlib_slarrr
+#endif
+          end interface larrr
+
+
+
+          !> LARRV: computes the eigenvectors of the tridiagonal matrix
+          !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+          !> The input eigenvalues should have been computed by SLARRE.
+          interface larrv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: dol,dou,ldz,m,n,iblock(*),indexw(*),isplit(*)
+                              
+                    integer(ilp), intent(out) :: info,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: minrgp,pivmin,vl,vu,gers(*)
+                    real(sp), intent(inout) :: rtol1,rtol2,d(*),l(*),w(*),werr(*),wgap(*)
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(out) :: z(ldz,*)
+               end subroutine clarrv
+#else 
+               module procedure stdlib_clarrv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: dol,dou,ldz,m,n,iblock(*),indexw(*),isplit(*)
+                              
+                    integer(ilp), intent(out) :: info,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: minrgp,pivmin,vl,vu,gers(*)
+                    real(dp), intent(inout) :: rtol1,rtol2,d(*),l(*),w(*),werr(*),wgap(*)
+                    real(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine dlarrv
+#else 
+               module procedure stdlib_dlarrv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarrv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: dol,dou,ldz,m,n,iblock(*),indexw(*),isplit(*)
+                              
+                    integer(ilp), intent(out) :: info,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: minrgp,pivmin,vl,vu,gers(*)
+                    real(sp), intent(inout) :: rtol1,rtol2,d(*),l(*),w(*),werr(*),wgap(*)
+                    real(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine slarrv
+#else 
+               module procedure stdlib_slarrv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarrv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: dol,dou,ldz,m,n,iblock(*),indexw(*),isplit(*)
+                              
+                    integer(ilp), intent(out) :: info,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: minrgp,pivmin,vl,vu,gers(*)
+                    real(dp), intent(inout) :: rtol1,rtol2,d(*),l(*),w(*),werr(*),wgap(*)
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(out) :: z(ldz,*)
+               end subroutine zlarrv
+#else 
+               module procedure stdlib_zlarrv
+#endif
+          end interface larrv
+
+
+
+          !> !
+          !>
+          !> LARTG: generates a plane rotation so that
+          !> [  C         S  ] . [ F ]  =  [ R ]
+          !> [ -conjg(S)  C  ]   [ G ]     [ 0 ]
+          !> where C is real and C**2 + |S|**2 = 1.
+          !> The mathematical formulas used for C and S are
+          !> sgn(x) = {  x / |x|,   x != 0
+          !> {  1,         x = 0
+          !> R = sgn(F) * sqrt(|F|**2 + |G|**2)
+          !> C = |F| / sqrt(|F|**2 + |G|**2)
+          !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2)
+          !> When F and G are real, the formulas simplify to C = F/R and
+          !> S = G/R, and the returned values of C, S, and R should be
+          !> identical to those returned by LARTG.
+          !> The algorithm used to compute these quantities incorporates scaling
+          !> to avoid overflow or underflow in computing the square root of the
+          !> sum of squares.
+          !> This is a faster version of the BLAS1 routine CROTG, except for
+          !> the following differences:
+          !> F and G are unchanged on return.
+          !> If G=0, then C=1 and S=0.
+          !> If F=0, then C=0 and S is chosen so that R is real.
+          !> Below, wp=>sp stands for single precision from LA_CONSTANTS module.
+          interface lartg
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clartg( f, g, c, s, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(out) :: c
+                    complex(sp), intent(in) :: f,g
+                    complex(sp), intent(out) :: r,s
+               end subroutine clartg
+#else 
+               module procedure stdlib_clartg
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlartg( f, g, c, s, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(out) :: c,r,s
+                    real(dp), intent(in) :: f,g
+               end subroutine dlartg
+#else 
+               module procedure stdlib_dlartg
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlartg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slartg( f, g, c, s, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(out) :: c,r,s
+                    real(sp), intent(in) :: f,g
+               end subroutine slartg
+#else 
+               module procedure stdlib_slartg
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlartg
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlartg( f, g, c, s, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(out) :: c
+                    complex(dp), intent(in) :: f,g
+                    complex(dp), intent(out) :: r,s
+               end subroutine zlartg
+#else 
+               module procedure stdlib_zlartg
+#endif
+          end interface lartg
+
+
+
+          !> LARTGP: generates a plane rotation so that
+          !> [  CS  SN  ]  .  [ F ]  =  [ R ]   where CS**2 + SN**2 = 1.
+          !> [ -SN  CS  ]     [ G ]     [ 0 ]
+          !> This is a slower, more accurate version of the Level 1 BLAS routine DROTG,
+          !> with the following other differences:
+          !> F and G are unchanged on return.
+          !> If G=0, then CS=(+/-)1 and SN=0.
+          !> If F=0 and (G .ne. 0), then CS=0 and SN=(+/-)1.
+          !> The sign is chosen so that R >= 0.
+          interface lartgp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlartgp( f, g, cs, sn, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(out) :: cs,r,sn
+                    real(dp), intent(in) :: f,g
+               end subroutine dlartgp
+#else 
+               module procedure stdlib_dlartgp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlartgp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slartgp( f, g, cs, sn, r )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(out) :: cs,r,sn
+                    real(sp), intent(in) :: f,g
+               end subroutine slartgp
+#else 
+               module procedure stdlib_slartgp
+#endif
+          end interface lartgp
+
+
+
+          !> LARTGS: generates a plane rotation designed to introduce a bulge in
+          !> Golub-Reinsch-style implicit QR iteration for the bidiagonal SVD
+          !> problem. X and Y are the top-row entries, and SIGMA is the shift.
+          !> The computed CS and SN define a plane rotation satisfying
+          !> [  CS  SN  ]  .  [ X^2 - SIGMA ]  =  [ R ],
+          !> [ -SN  CS  ]     [    X * Y    ]     [ 0 ]
+          !> with R nonnegative.  If X^2 - SIGMA and X * Y are 0, then the
+          !> rotation is by PI/2.
+          interface lartgs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlartgs( x, y, sigma, cs, sn )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(out) :: cs,sn
+                    real(dp), intent(in) :: sigma,x,y
+               end subroutine dlartgs
+#else 
+               module procedure stdlib_dlartgs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlartgs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slartgs( x, y, sigma, cs, sn )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(out) :: cs,sn
+                    real(sp), intent(in) :: sigma,x,y
+               end subroutine slartgs
+#else 
+               module procedure stdlib_slartgs
+#endif
+          end interface lartgs
+
+
+
+          !> LARTV: applies a vector of complex plane rotations with real cosines
+          !> to elements of the complex vectors x and y. For i = 1,2,...,n
+          !> ( x(i) ) := (        c(i)   s(i) ) ( x(i) )
+          !> ( y(i) )    ( -conjg(s(i))  c(i) ) ( y(i) )
+          interface lartv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clartv( n, x, incx, y, incy, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(sp), intent(in) :: c(*)
+                    complex(sp), intent(in) :: s(*)
+                    complex(sp), intent(inout) :: x(*),y(*)
+               end subroutine clartv
+#else 
+               module procedure stdlib_clartv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlartv( n, x, incx, y, incy, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(dp), intent(in) :: c(*),s(*)
+                    real(dp), intent(inout) :: x(*),y(*)
+               end subroutine dlartv
+#else 
+               module procedure stdlib_dlartv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlartv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slartv( n, x, incx, y, incy, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(sp), intent(in) :: c(*),s(*)
+                    real(sp), intent(inout) :: x(*),y(*)
+               end subroutine slartv
+#else 
+               module procedure stdlib_slartv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlartv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlartv( n, x, incx, y, incy, c, s, incc )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incc,incx,incy,n
+                    real(dp), intent(in) :: c(*)
+                    complex(dp), intent(in) :: s(*)
+                    complex(dp), intent(inout) :: x(*),y(*)
+               end subroutine zlartv
+#else 
+               module procedure stdlib_zlartv
+#endif
+          end interface lartv
+
+
+
+          !> LARUV: returns a vector of n random real numbers from a uniform (0,1)
+          !> distribution (n <= 128).
+          !> This is an auxiliary routine called by DLARNV and ZLARNV.
+          interface laruv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaruv( iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    real(dp), intent(out) :: x(n)
+               end subroutine dlaruv
+#else 
+               module procedure stdlib_dlaruv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaruv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaruv( iseed, n, x )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(inout) :: iseed(4)
+                    real(sp), intent(out) :: x(n)
+               end subroutine slaruv
+#else 
+               module procedure stdlib_slaruv
+#endif
+          end interface laruv
+
+
+
+          !> LARZ: applies a complex elementary reflector H to a complex
+          !> M-by-N matrix C, from either the left or the right. H is represented
+          !> in the form
+          !> H = I - tau * v * v**H
+          !> where tau is a complex scalar and v is a complex vector.
+          !> If tau = 0, then H is taken to be the unit matrix.
+          !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+          !> tau.
+          !> H is a product of k elementary reflectors as returned by CTZRZF.
+          interface larz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarz( side, m, n, l, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,l,ldc,m,n
+                    complex(sp), intent(in) :: tau,v(*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clarz
+#else 
+               module procedure stdlib_clarz
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,l,ldc,m,n
+                    real(dp), intent(in) :: tau,v(*)
+                    real(dp), intent(inout) :: c(ldc,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlarz
+#else 
+               module procedure stdlib_dlarz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarz( side, m, n, l, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,l,ldc,m,n
+                    real(sp), intent(in) :: tau,v(*)
+                    real(sp), intent(inout) :: c(ldc,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slarz
+#else 
+               module procedure stdlib_slarz
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side
+                    integer(ilp), intent(in) :: incv,l,ldc,m,n
+                    complex(dp), intent(in) :: tau,v(*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlarz
+#else 
+               module procedure stdlib_zlarz
+#endif
+          end interface larz
+
+
+
+          !> LARZB: applies a complex block reflector H or its transpose H**H
+          !> to a complex distributed M-by-N  C from the left or the right.
+          !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+          interface larzb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,ldc,ldt,ldv,ldwork,m,n
+                    complex(sp), intent(inout) :: c(ldc,*),t(ldt,*),v(ldv,*)
+                    complex(sp), intent(out) :: work(ldwork,*)
+               end subroutine clarzb
+#else 
+               module procedure stdlib_clarzb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,ldc,ldt,ldv,ldwork,m,n
+                    real(dp), intent(inout) :: c(ldc,*),t(ldt,*),v(ldv,*)
+                    real(dp), intent(out) :: work(ldwork,*)
+               end subroutine dlarzb
+#else 
+               module procedure stdlib_dlarzb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarzb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,ldc,ldt,ldv,ldwork,m,n
+                    real(sp), intent(inout) :: c(ldc,*),t(ldt,*),v(ldv,*)
+                    real(sp), intent(out) :: work(ldwork,*)
+               end subroutine slarzb
+#else 
+               module procedure stdlib_slarzb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarzb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+                         ldc, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,ldc,ldt,ldv,ldwork,m,n
+                    complex(dp), intent(inout) :: c(ldc,*),t(ldt,*),v(ldv,*)
+                    complex(dp), intent(out) :: work(ldwork,*)
+               end subroutine zlarzb
+#else 
+               module procedure stdlib_zlarzb
+#endif
+          end interface larzb
+
+
+
+          !> LARZT: forms the triangular factor T of a complex block reflector
+          !> H of order > n, which is defined as a product of k elementary
+          !> reflectors.
+          !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+          !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+          !> If STOREV = 'C', the vector which defines the elementary reflector
+          !> H(i) is stored in the i-th column of the array V, and
+          !> H  =  I - V * T * V**H
+          !> If STOREV = 'R', the vector which defines the elementary reflector
+          !> H(i) is stored in the i-th row of the array V, and
+          !> H  =  I - V**H * T * V
+          !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+          interface larzt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    complex(sp), intent(out) :: t(ldt,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(inout) :: v(ldv,*)
+               end subroutine clarzt
+#else 
+               module procedure stdlib_clarzt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    real(dp), intent(out) :: t(ldt,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(inout) :: v(ldv,*)
+               end subroutine dlarzt
+#else 
+               module procedure stdlib_dlarzt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlarzt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    real(sp), intent(out) :: t(ldt,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(inout) :: v(ldv,*)
+               end subroutine slarzt
+#else 
+               module procedure stdlib_slarzt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlarzt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,storev
+                    integer(ilp), intent(in) :: k,ldt,ldv,n
+                    complex(dp), intent(out) :: t(ldt,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(inout) :: v(ldv,*)
+               end subroutine zlarzt
+#else 
+               module procedure stdlib_zlarzt
+#endif
+          end interface larzt
+
+
+
+          !> LASCL: multiplies the M by N complex matrix A by the real scalar
+          !> CTO/CFROM.  This is done without over/underflow as long as the final
+          !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+          !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+          !> or banded.
+          interface lascl
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: type
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,lda,m,n
+                    real(sp), intent(in) :: cfrom,cto
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine clascl
+#else 
+               module procedure stdlib_clascl
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: type
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,lda,m,n
+                    real(dp), intent(in) :: cfrom,cto
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dlascl
+#else 
+               module procedure stdlib_dlascl
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlascl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: type
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,lda,m,n
+                    real(sp), intent(in) :: cfrom,cto
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine slascl
+#else 
+               module procedure stdlib_slascl
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlascl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: type
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kl,ku,lda,m,n
+                    real(dp), intent(in) :: cfrom,cto
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlascl
+#else 
+               module procedure stdlib_zlascl
+#endif
+          end interface lascl
+
+
+
+          !> Using a divide and conquer approach, LASD0: computes the singular
+          !> value decomposition (SVD) of a real upper bidiagonal N-by-M
+          !> matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+          !> The algorithm computes orthogonal matrices U and VT such that
+          !> B = U * S * VT. The singular values S are overwritten on D.
+          !> A related subroutine, DLASDA, computes only the singular values,
+          !> and optionally, the singular vectors in compact form.
+          interface lasd0
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd0( n, sqre, d, e, u, ldu, vt, ldvt, smlsiz, iwork,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,n,smlsiz,sqre
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine dlasd0
+#else 
+               module procedure stdlib_dlasd0
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd0
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd0( n, sqre, d, e, u, ldu, vt, ldvt, smlsiz, iwork,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,n,smlsiz,sqre
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: u(ldu,*),vt(ldvt,*),work(*)
+               end subroutine slasd0
+#else 
+               module procedure stdlib_slasd0
+#endif
+          end interface lasd0
+
+
+
+          !> LASD1: computes the SVD of an upper bidiagonal N-by-M matrix B,
+          !> where N = NL + NR + 1 and M = N + SQRE. LASD1 is called from DLASD0.
+          !> A related subroutine DLASD7 handles the case in which the singular
+          !> values (and the singular vectors in factored form) are desired.
+          !> LASD1 computes the SVD as follows:
+          !> ( D1(in)    0    0       0 )
+          !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+          !> (   0       0   D2(in)   0 )
+          !> = U(out) * ( D(out) 0) * VT(out)
+          !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+          !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+          !> elsewhere; and the entry b is empty if SQRE = 0.
+          !> The left singular vectors of the original matrix are stored in U, and
+          !> the transpose of the right singular vectors are stored in VT, and the
+          !> singular values are in D.  The algorithm consists of three stages:
+          !> The first stage consists of deflating the size of the problem
+          !> when there are multiple singular values or when there are zeros in
+          !> the Z vector.  For each such occurrence the dimension of the
+          !> secular equation problem is reduced by one.  This stage is
+          !> performed by the routine DLASD2.
+          !> The second stage consists of calculating the updated
+          !> singular values. This is done by finding the square roots of the
+          !> roots of the secular equation via the routine DLASD4 (as called
+          !> by DLASD3). This routine also calculates the singular vectors of
+          !> the current problem.
+          !> The final stage consists of computing the updated singular vectors
+          !> directly using the updated singular values.  The singular vectors
+          !> for the current problem are multiplied with the singular vectors
+          !> from the overall problem.
+          interface lasd1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd1( nl, nr, sqre, d, alpha, beta, u, ldu, vt, ldvt,idxq, iwork,&
+                          work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,nl,nr,sqre
+                    real(dp), intent(inout) :: alpha,beta,d(*),u(ldu,*),vt(ldvt,*)
+                    integer(ilp), intent(inout) :: idxq(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlasd1
+#else 
+               module procedure stdlib_dlasd1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd1( nl, nr, sqre, d, alpha, beta, u, ldu, vt, ldvt,idxq, iwork,&
+                          work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu,ldvt,nl,nr,sqre
+                    real(sp), intent(inout) :: alpha,beta,d(*),u(ldu,*),vt(ldvt,*)
+                    integer(ilp), intent(inout) :: idxq(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slasd1
+#else 
+               module procedure stdlib_slasd1
+#endif
+          end interface lasd1
+
+
+
+          !> This subroutine computes the square root of the I-th updated
+          !> eigenvalue of a positive symmetric rank-one modification to
+          !> a positive diagonal matrix whose entries are given as the squares
+          !> of the corresponding entries in the array d, and that
+          !> 0 <= D(i) < D(j)  for  i < j
+          !> and that RHO > 0. This is arranged by the calling routine, and is
+          !> no loss in generality.  The rank-one modified system is thus
+          !> diag( D ) * diag( D ) +  RHO * Z * Z_transpose.
+          !> where we assume the Euclidean norm of Z is 1.
+          !> The method consists of approximating the rational functions in the
+          !> secular equation by simpler interpolating rational functions.
+          interface lasd4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd4( n, i, d, z, delta, rho, sigma, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: rho,d(*),z(*)
+                    real(dp), intent(out) :: sigma,delta(*),work(*)
+               end subroutine dlasd4
+#else 
+               module procedure stdlib_dlasd4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd4( n, i, d, z, delta, rho, sigma, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: rho,d(*),z(*)
+                    real(sp), intent(out) :: sigma,delta(*),work(*)
+               end subroutine slasd4
+#else 
+               module procedure stdlib_slasd4
+#endif
+          end interface lasd4
+
+
+
+          !> This subroutine computes the square root of the I-th eigenvalue
+          !> of a positive symmetric rank-one modification of a 2-by-2 diagonal
+          !> matrix
+          !> diag( D ) * diag( D ) +  RHO * Z * transpose(Z) .
+          !> The diagonal entries in the array D are assumed to satisfy
+          !> 0 <= D(i) < D(j)  for  i < j .
+          !> We also assume RHO > 0 and that the Euclidean norm of the vector
+          !> Z is one.
+          interface lasd5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd5( i, d, z, delta, rho, dsigma, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i
+                    real(dp), intent(out) :: dsigma,delta(2),work(2)
+                    real(dp), intent(in) :: rho,d(2),z(2)
+               end subroutine dlasd5
+#else 
+               module procedure stdlib_dlasd5
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd5( i, d, z, delta, rho, dsigma, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i
+                    real(sp), intent(out) :: dsigma,delta(2),work(2)
+                    real(sp), intent(in) :: rho,d(2),z(2)
+               end subroutine slasd5
+#else 
+               module procedure stdlib_slasd5
+#endif
+          end interface lasd5
+
+
+
+          !> LASD6: computes the SVD of an updated upper bidiagonal matrix B
+          !> obtained by merging two smaller ones by appending a row. This
+          !> routine is used only for the problem which requires all singular
+          !> values and optionally singular vector matrices in factored form.
+          !> B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+          !> A related subroutine, DLASD1, handles the case in which all singular
+          !> values and singular vectors of the bidiagonal matrix are desired.
+          !> LASD6 computes the SVD as follows:
+          !> ( D1(in)    0    0       0 )
+          !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+          !> (   0       0   D2(in)   0 )
+          !> = U(out) * ( D(out) 0) * VT(out)
+          !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+          !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+          !> elsewhere; and the entry b is empty if SQRE = 0.
+          !> The singular values of B can be computed using D1, D2, the first
+          !> components of all the right singular vectors of the lower block, and
+          !> the last components of all the right singular vectors of the upper
+          !> block. These components are stored and updated in VF and VL,
+          !> respectively, in LASD6. Hence U and VT are not explicitly
+          !> referenced.
+          !> The singular values are stored in D. The algorithm consists of two
+          !> stages:
+          !> The first stage consists of deflating the size of the problem
+          !> when there are multiple singular values or if there is a zero
+          !> in the Z vector. For each such occurrence the dimension of the
+          !> secular equation problem is reduced by one. This stage is
+          !> performed by the routine DLASD7.
+          !> The second stage consists of calculating the updated
+          !> singular values. This is done by finding the roots of the
+          !> secular equation via the routine DLASD4 (as called by DLASD8).
+          !> This routine also updates VF and VL and computes the distances
+          !> between the updated singular values and the old singular
+          !> values.
+          !> LASD6 is called from DLASDA.
+          interface lasd6
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd6( icompq, nl, nr, sqre, d, vf, vl, alpha, beta,idxq, perm, &
+               givptr, givcol, ldgcol, givnum,ldgnum, poles, difl, difr, z, k, c, s, work,iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(ldgcol,*),iwork(*),perm(*)
+                              
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldgnum,nl,nr,sqre
+                    real(dp), intent(inout) :: alpha,beta,d(*),vf(*),vl(*)
+                    real(dp), intent(out) :: c,s,difl(*),difr(*),givnum(ldgnum,*),poles(ldgnum,*),&
+                              work(*),z(*)
+                    integer(ilp), intent(inout) :: idxq(*)
+               end subroutine dlasd6
+#else 
+               module procedure stdlib_dlasd6
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd6
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd6( icompq, nl, nr, sqre, d, vf, vl, alpha, beta,idxq, perm, &
+               givptr, givcol, ldgcol, givnum,ldgnum, poles, difl, difr, z, k, c, s, work,iwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(ldgcol,*),iwork(*),perm(*)
+                              
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldgnum,nl,nr,sqre
+                    real(sp), intent(inout) :: alpha,beta,d(*),vf(*),vl(*)
+                    real(sp), intent(out) :: c,s,difl(*),difr(*),givnum(ldgnum,*),poles(ldgnum,*),&
+                              work(*),z(*)
+                    integer(ilp), intent(inout) :: idxq(*)
+               end subroutine slasd6
+#else 
+               module procedure stdlib_slasd6
+#endif
+          end interface lasd6
+
+
+
+          !> LASD7: merges the two sets of singular values together into a single
+          !> sorted set. Then it tries to deflate the size of the problem. There
+          !> are two ways in which deflation can occur:  when two or more singular
+          !> values are close together or if there is a tiny entry in the Z
+          !> vector. For each such occurrence the order of the related
+          !> secular equation problem is reduced by one.
+          !> LASD7 is called from DLASD6.
+          interface lasd7
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd7( icompq, nl, nr, sqre, k, d, z, zw, vf, vfw, vl,vlw, alpha, &
+               beta, dsigma, idx, idxp, idxq,perm, givptr, givcol, ldgcol, givnum, ldgnum,c, s, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(ldgcol,*),idx(*),idxp(*),&
+                              perm(*)
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldgnum,nl,nr,sqre
+                    real(dp), intent(in) :: alpha,beta
+                    real(dp), intent(out) :: c,s,dsigma(*),givnum(ldgnum,*),vfw(*),vlw(*),z(*),zw(&
+                              *)
+                    integer(ilp), intent(inout) :: idxq(*)
+                    real(dp), intent(inout) :: d(*),vf(*),vl(*)
+               end subroutine dlasd7
+#else 
+               module procedure stdlib_dlasd7
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd7
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd7( icompq, nl, nr, sqre, k, d, z, zw, vf, vfw, vl,vlw, alpha, &
+               beta, dsigma, idx, idxp, idxq,perm, givptr, givcol, ldgcol, givnum, ldgnum,c, s, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: givptr,info,k,givcol(ldgcol,*),idx(*),idxp(*),&
+                              perm(*)
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldgnum,nl,nr,sqre
+                    real(sp), intent(in) :: alpha,beta
+                    real(sp), intent(out) :: c,s,dsigma(*),givnum(ldgnum,*),vfw(*),vlw(*),z(*),zw(&
+                              *)
+                    integer(ilp), intent(inout) :: idxq(*)
+                    real(sp), intent(inout) :: d(*),vf(*),vl(*)
+               end subroutine slasd7
+#else 
+               module procedure stdlib_slasd7
+#endif
+          end interface lasd7
+
+
+
+          !> LASD8: finds the square roots of the roots of the secular equation,
+          !> as defined by the values in DSIGMA and Z. It makes the appropriate
+          !> calls to DLASD4, and stores, for each  element in D, the distance
+          !> to its two nearest poles (elements in DSIGMA). It also updates
+          !> the arrays VF and VL, the first and last components of all the
+          !> right singular vectors of the original bidiagonal matrix.
+          !> LASD8 is called from DLASD6.
+          interface lasd8
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasd8( icompq, k, d, z, vf, vl, difl, difr, lddifr,dsigma, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,k,lddifr
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: d(*),difl(*),difr(lddifr,*),work(*)
+                    real(dp), intent(inout) :: dsigma(*),vf(*),vl(*),z(*)
+               end subroutine dlasd8
+#else 
+               module procedure stdlib_dlasd8
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasd8
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasd8( icompq, k, d, z, vf, vl, difl, difr, lddifr,dsigma, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,k,lddifr
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: d(*),difl(*),difr(lddifr,*),work(*)
+                    real(sp), intent(inout) :: dsigma(*),vf(*),vl(*),z(*)
+               end subroutine slasd8
+#else 
+               module procedure stdlib_slasd8
+#endif
+          end interface lasd8
+
+
+
+          !> Using a divide and conquer approach, LASDA: computes the singular
+          !> value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+          !> B with diagonal D and offdiagonal E, where M = N + SQRE. The
+          !> algorithm computes the singular values in the SVD B = U * S * VT.
+          !> The orthogonal matrices U and VT are optionally computed in
+          !> compact form.
+          !> A related subroutine, DLASD0, computes the singular values and
+          !> the singular vectors in explicit form.
+          interface lasda
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasda( icompq, smlsiz, n, sqre, d, e, u, ldu, vt, k,difl, difr, z,&
+                          poles, givptr, givcol, ldgcol,perm, givnum, c, s, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldu,n,smlsiz,sqre
+                    integer(ilp), intent(out) :: info,givcol(ldgcol,*),givptr(*),iwork(*),k(*),&
+                              perm(ldgcol,*)
+                    real(dp), intent(out) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),work(*),z(ldu,*)
+                    real(dp), intent(inout) :: d(*),e(*)
+               end subroutine dlasda
+#else 
+               module procedure stdlib_dlasda
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasda
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasda( icompq, smlsiz, n, sqre, d, e, u, ldu, vt, k,difl, difr, z,&
+                          poles, givptr, givcol, ldgcol,perm, givnum, c, s, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: icompq,ldgcol,ldu,n,smlsiz,sqre
+                    integer(ilp), intent(out) :: info,givcol(ldgcol,*),givptr(*),iwork(*),k(*),&
+                              perm(ldgcol,*)
+                    real(sp), intent(out) :: c(*),difl(ldu,*),difr(ldu,*),givnum(ldu,*),poles(ldu,&
+                              *),s(*),u(ldu,*),vt(ldu,*),work(*),z(ldu,*)
+                    real(sp), intent(inout) :: d(*),e(*)
+               end subroutine slasda
+#else 
+               module procedure stdlib_slasda
+#endif
+          end interface lasda
+
+
+
+          !> LASDQ: computes the singular value decomposition (SVD) of a real
+          !> (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+          !> E, accumulating the transformations if desired. Letting B denote
+          !> the input bidiagonal matrix, the algorithm computes orthogonal
+          !> matrices Q and P such that B = Q * S * P**T (P**T denotes the transpose
+          !> of P). The singular values S are overwritten on D.
+          !> The input matrix U  is changed to U  * Q  if desired.
+          !> The input matrix VT is changed to P**T * VT if desired.
+          !> The input matrix C  is changed to Q**T * C  if desired.
+          !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+          !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+          !> LAPACK Working Note #3, for a detailed description of the algorithm.
+          interface lasdq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasdq( uplo, sqre, n, ncvt, nru, ncc, d, e, vt, ldvt,u, ldu, c, &
+                         ldc, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru,sqre
+                    real(dp), intent(inout) :: c(ldc,*),d(*),e(*),u(ldu,*),vt(ldvt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlasdq
+#else 
+               module procedure stdlib_dlasdq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasdq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasdq( uplo, sqre, n, ncvt, nru, ncc, d, e, vt, ldvt,u, ldu, c, &
+                         ldc, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,ldu,ldvt,n,ncc,ncvt,nru,sqre
+                    real(sp), intent(inout) :: c(ldc,*),d(*),e(*),u(ldu,*),vt(ldvt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slasdq
+#else 
+               module procedure stdlib_slasdq
+#endif
+          end interface lasdq
+
+
+
+          !> LASET: initializes a 2-D array A to BETA on the diagonal and
+          !> ALPHA on the offdiagonals.
+          interface laset
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claset( uplo, m, n, alpha, beta, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(in) :: alpha,beta
+                    complex(sp), intent(out) :: a(lda,*)
+               end subroutine claset
+#else 
+               module procedure stdlib_claset
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaset( uplo, m, n, alpha, beta, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: alpha,beta
+                    real(dp), intent(out) :: a(lda,*)
+               end subroutine dlaset
+#else 
+               module procedure stdlib_dlaset
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaset
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaset( uplo, m, n, alpha, beta, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: alpha,beta
+                    real(sp), intent(out) :: a(lda,*)
+               end subroutine slaset
+#else 
+               module procedure stdlib_slaset
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaset
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaset( uplo, m, n, alpha, beta, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(in) :: alpha,beta
+                    complex(dp), intent(out) :: a(lda,*)
+               end subroutine zlaset
+#else 
+               module procedure stdlib_zlaset
+#endif
+          end interface laset
+
+
+
+          !> LASQ1: computes the singular values of a real N-by-N bidiagonal
+          !> matrix with diagonal D and off-diagonal E. The singular values
+          !> are computed to high relative accuracy, in the absence of
+          !> denormalization, underflow and overflow. The algorithm was first
+          !> presented in
+          !> "Accurate singular values and differential qd algorithms" by K. V.
+          !> Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+          !> 1994,
+          !> and the present implementation is described in "An implementation of
+          !> the dqds Algorithm (Positive Case)", LAPACK Working Note.
+          interface lasq1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasq1( n, d, e, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlasq1
+#else 
+               module procedure stdlib_dlasq1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasq1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasq1( n, d, e, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slasq1
+#else 
+               module procedure stdlib_slasq1
+#endif
+          end interface lasq1
+
+
+
+          !> LASQ4: computes an approximation TAU to the smallest eigenvalue
+          !> using values of d from the previous transform.
+          interface lasq4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn,dn1, dn2, tau, &
+                         ttype, g )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i0,n0,n0in,pp
+                    integer(ilp), intent(out) :: ttype
+                    real(dp), intent(in) :: dmin,dmin1,dmin2,dn,dn1,dn2,z(*)
+                    real(dp), intent(inout) :: g
+                    real(dp), intent(out) :: tau
+               end subroutine dlasq4
+#else 
+               module procedure stdlib_dlasq4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasq4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn,dn1, dn2, tau, &
+                         ttype, g )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i0,n0,n0in,pp
+                    integer(ilp), intent(out) :: ttype
+                    real(sp), intent(in) :: dmin,dmin1,dmin2,dn,dn1,dn2,z(*)
+                    real(sp), intent(inout) :: g
+                    real(sp), intent(out) :: tau
+               end subroutine slasq4
+#else 
+               module procedure stdlib_slasq4
+#endif
+          end interface lasq4
+
+
+
+          !> LASQ5: computes one dqds transform in ping-pong form, one
+          !> version for IEEE machines another for non IEEE machines.
+          interface lasq5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2,dn, dnm1, &
+                         dnm2, ieee, eps )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ieee
+                    integer(ilp), intent(in) :: i0,n0,pp
+                    real(dp), intent(out) :: dmin,dmin1,dmin2,dn,dnm1,dnm2
+                    real(dp), intent(inout) :: tau,z(*)
+                    real(dp), intent(in) :: sigma,eps
+               end subroutine dlasq5
+#else 
+               module procedure stdlib_dlasq5
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasq5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2,dn, dnm1, &
+                         dnm2, ieee, eps )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: ieee
+                    integer(ilp), intent(in) :: i0,n0,pp
+                    real(sp), intent(out) :: dmin,dmin1,dmin2,dn,dnm1,dnm2
+                    real(sp), intent(inout) :: tau,z(*)
+                    real(sp), intent(in) :: sigma,eps
+               end subroutine slasq5
+#else 
+               module procedure stdlib_slasq5
+#endif
+          end interface lasq5
+
+
+
+          !> LASQ6: computes one dqd (shift equal to zero) transform in
+          !> ping-pong form, with protection against underflow and overflow.
+          interface lasq6
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn,dnm1, dnm2 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i0,n0,pp
+                    real(dp), intent(out) :: dmin,dmin1,dmin2,dn,dnm1,dnm2
+                    real(dp), intent(inout) :: z(*)
+               end subroutine dlasq6
+#else 
+               module procedure stdlib_dlasq6
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasq6
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn,dnm1, dnm2 )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: i0,n0,pp
+                    real(sp), intent(out) :: dmin,dmin1,dmin2,dn,dnm1,dnm2
+                    real(sp), intent(inout) :: z(*)
+               end subroutine slasq6
+#else 
+               module procedure stdlib_slasq6
+#endif
+          end interface lasq6
+
+
+
+          !> LASR: applies a sequence of real plane rotations to a complex matrix
+          !> A, from either the left or the right.
+          !> When SIDE = 'L', the transformation takes the form
+          !> A := P*A
+          !> and when SIDE = 'R', the transformation takes the form
+          !> A := A*P**T
+          !> where P is an orthogonal matrix consisting of a sequence of z plane
+          !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+          !> and P**T is the transpose of P.
+          !> When DIRECT = 'F' (Forward sequence), then
+          !> P = P(z-1) * ... * P(2) * P(1)
+          !> and when DIRECT = 'B' (Backward sequence), then
+          !> P = P(1) * P(2) * ... * P(z-1)
+          !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+          !> R(k) = (  c(k)  s(k) )
+          !> = ( -s(k)  c(k) ).
+          !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+          !> for the plane (k,k+1), i.e., P(k) has the form
+          !> P(k) = (  1                                            )
+          !> (       ...                                     )
+          !> (              1                                )
+          !> (                   c(k)  s(k)                  )
+          !> (                  -s(k)  c(k)                  )
+          !> (                                1              )
+          !> (                                     ...       )
+          !> (                                            1  )
+          !> where R(k) appears as a rank-2 modification to the identity matrix in
+          !> rows and columns k and k+1.
+          !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+          !> plane (1,k+1), so P(k) has the form
+          !> P(k) = (  c(k)                    s(k)                 )
+          !> (         1                                     )
+          !> (              ...                              )
+          !> (                     1                         )
+          !> ( -s(k)                    c(k)                 )
+          !> (                                 1             )
+          !> (                                      ...      )
+          !> (                                             1 )
+          !> where R(k) appears in rows and columns 1 and k+1.
+          !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+          !> performed for the plane (k,z), giving P(k) the form
+          !> P(k) = ( 1                                             )
+          !> (      ...                                      )
+          !> (             1                                 )
+          !> (                  c(k)                    s(k) )
+          !> (                         1                     )
+          !> (                              ...              )
+          !> (                                     1         )
+          !> (                 -s(k)                    c(k) )
+          !> where R(k) appears in rows and columns k and z.  The rotations are
+          !> performed without ever forming P(k) explicitly.
+          interface lasr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clasr( side, pivot, direct, m, n, c, s, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,pivot,side
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(in) :: c(*),s(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine clasr
+#else 
+               module procedure stdlib_clasr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasr( side, pivot, direct, m, n, c, s, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,pivot,side
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: c(*),s(*)
+               end subroutine dlasr
+#else 
+               module procedure stdlib_dlasr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasr( side, pivot, direct, m, n, c, s, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,pivot,side
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: c(*),s(*)
+               end subroutine slasr
+#else 
+               module procedure stdlib_slasr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlasr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlasr( side, pivot, direct, m, n, c, s, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,pivot,side
+                    integer(ilp), intent(in) :: lda,m,n
+                    real(dp), intent(in) :: c(*),s(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlasr
+#else 
+               module procedure stdlib_zlasr
+#endif
+          end interface lasr
+
+
+
+          !> Sort the numbers in D in increasing order (if ID = 'I') or
+          !> in decreasing order (if ID = 'D' ).
+          !> Use Quick Sort, reverting to Insertion sort on arrays of
+          !> size <= 20. Dimension of STACK limits N to about 2**32.
+          interface lasrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasrt( id, n, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: id
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*)
+               end subroutine dlasrt
+#else 
+               module procedure stdlib_dlasrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasrt( id, n, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: id
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*)
+               end subroutine slasrt
+#else 
+               module procedure stdlib_slasrt
+#endif
+          end interface lasrt
+
+
+
+          !> !
+          !>
+          !> LASSQ:  returns the values  scl  and  smsq  such that
+          !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+          !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+          !> assumed to be non-negative.
+          !> scale and sumsq must be supplied in SCALE and SUMSQ and
+          !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+          !> If scale * sqrt( sumsq ) > tbig then
+          !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+          !> and if 0 < scale * sqrt( sumsq ) < tsml then
+          !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+          !> where
+          !> tbig -- upper threshold for values whose square is representable;
+          !> sbig -- scaling constant for big numbers; \see la_constants.f90
+          !> tsml -- lower threshold for values whose square is representable;
+          !> ssml -- scaling constant for small numbers; \see la_constants.f90
+          !> and
+          !> TINY*EPS -- tiniest representable number;
+          !> HUGE     -- biggest representable number.
+          interface lassq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine classq( n, x, incx, scl, sumsq )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(inout) :: scl,sumsq
+                    complex(sp), intent(in) :: x(*)
+               end subroutine classq
+#else 
+               module procedure stdlib_classq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlassq( n, x, incx, scl, sumsq )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(inout) :: scl,sumsq
+                    real(dp), intent(in) :: x(*)
+               end subroutine dlassq
+#else 
+               module procedure stdlib_dlassq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlassq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slassq( n, x, incx, scl, sumsq )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(inout) :: scl,sumsq
+                    real(sp), intent(in) :: x(*)
+               end subroutine slassq
+#else 
+               module procedure stdlib_slassq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlassq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlassq( n, x, incx, scl, sumsq )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(inout) :: scl,sumsq
+                    complex(dp), intent(in) :: x(*)
+               end subroutine zlassq
+#else 
+               module procedure stdlib_zlassq
+#endif
+          end interface lassq
+
+
+
+          !> LASWLQ: computes a blocked Tall-Skinny LQ factorization of
+          !> a complex M-by-N matrix A for M <= N:
+          !> A = ( L 0 ) *  Q,
+          !> where:
+          !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+          !> form in the elements above the diagonal of the array A and in
+          !> the elements of the array T;
+          !> L is a lower-triangular M-by-M matrix stored on exit in
+          !> the elements on and below the diagonal of the array A.
+          !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+          interface laswlq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,lwork,ldt
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*),t(ldt,*)
+               end subroutine claswlq
+#else 
+               module procedure stdlib_claswlq
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,lwork,ldt
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*),t(ldt,*)
+               end subroutine dlaswlq
+#else 
+               module procedure stdlib_dlaswlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaswlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,lwork,ldt
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*),t(ldt,*)
+               end subroutine slaswlq
+#else 
+               module procedure stdlib_slaswlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaswlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,lwork,ldt
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*),t(ldt,*)
+               end subroutine zlaswlq
+#else 
+               module procedure stdlib_zlaswlq
+#endif
+          end interface laswlq
+
+
+
+          !> LASWP: performs a series of row interchanges on the matrix A.
+          !> One row interchange is initiated for each of rows K1 through K2 of A.
+          interface laswp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claswp( n, a, lda, k1, k2, ipiv, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k1,k2,lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine claswp
+#else 
+               module procedure stdlib_claswp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlaswp( n, a, lda, k1, k2, ipiv, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k1,k2,lda,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dlaswp
+#else 
+               module procedure stdlib_dlaswp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlaswp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slaswp( n, a, lda, k1, k2, ipiv, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k1,k2,lda,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine slaswp
+#else 
+               module procedure stdlib_slaswp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaswp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaswp( n, a, lda, k1, k2, ipiv, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,k1,k2,lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlaswp
+#else 
+               module procedure stdlib_zlaswp
+#endif
+          end interface laswp
+
+
+
+          !> LASYF: computes a partial factorization of a complex symmetric matrix
+          !> A using the Bunch-Kaufman diagonal pivoting method. The partial
+          !> factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+          !> A  =  ( L11  0 ) ( D    0  ) ( L11**T L21**T )  if UPLO = 'L'
+          !> ( L21  I ) ( 0   A22 ) (  0       I    )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> Note that U**T denotes the transpose of U.
+          !> LASYF is an auxiliary routine called by CSYTRF. It uses blocked code
+          !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+          !> A22 (if UPLO = 'L').
+          interface lasyf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: w(ldw,*)
+               end subroutine clasyf
+#else 
+               module procedure stdlib_clasyf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: w(ldw,*)
+               end subroutine dlasyf
+#else 
+               module procedure stdlib_dlasyf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasyf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: w(ldw,*)
+               end subroutine slasyf
+#else 
+               module procedure stdlib_slasyf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlasyf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: w(ldw,*)
+               end subroutine zlasyf
+#else 
+               module procedure stdlib_zlasyf
+#endif
+          end interface lasyf
+
+
+
+          !> DLATRF_AA factorizes a panel of a complex symmetric matrix A using
+          !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+          !> when UPLO is U, or a set of NB columns when UPLO is L.
+          !> In order to factorize the panel, the Aasen's algorithm requires the
+          !> last row, or column, of the previous panel. The first row, or column,
+          !> of A is set to be the first row, or column, of an identity matrix,
+          !> which is used to factorize the first panel.
+          !> The resulting J-th row of U, or J-th column of L, is stored in the
+          !> (J-1)-th row, or column, of A (without the unit diagonals), while
+          !> the diagonal and subdiagonal of A are overwritten by those of T.
+          interface lasyf_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),h(ldh,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine clasyf_aa
+#else 
+               module procedure stdlib_clasyf_aa
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*),h(ldh,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dlasyf_aa
+#else 
+               module procedure stdlib_dlasyf_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasyf_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*),h(ldh,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine slasyf_aa
+#else 
+               module procedure stdlib_slasyf_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlasyf_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: m,nb,j1,lda,ldh
+                    integer(ilp), intent(out) :: ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),h(ldh,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zlasyf_aa
+#else 
+               module procedure stdlib_zlasyf_aa
+#endif
+          end interface lasyf_aa
+
+
+
+          !> LASYF_RK: computes a partial factorization of a complex symmetric
+          !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+          !> pivoting method. The partial factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+          !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+          !> ( L21  I ) (  0  A22 ) (  0       I    )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> LASYF_RK is an auxiliary routine called by CSYTRF_RK. It uses
+          !> blocked code (calling Level 3 BLAS) to update the submatrix
+          !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+          interface lasyf_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*),w(ldw,*)
+               end subroutine clasyf_rk
+#else 
+               module procedure stdlib_clasyf_rk
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: e(*),w(ldw,*)
+               end subroutine dlasyf_rk
+#else 
+               module procedure stdlib_dlasyf_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasyf_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: e(*),w(ldw,*)
+               end subroutine slasyf_rk
+#else 
+               module procedure stdlib_slasyf_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlasyf_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*),w(ldw,*)
+               end subroutine zlasyf_rk
+#else 
+               module procedure stdlib_zlasyf_rk
+#endif
+          end interface lasyf_rk
+
+
+
+          !> LASYF_ROOK: computes a partial factorization of a complex symmetric
+          !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+          !> pivoting method. The partial factorization has the form:
+          !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+          !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+          !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+          !> ( L21  I ) (  0  A22 ) (  0       I    )
+          !> where the order of D is at most NB. The actual order is returned in
+          !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+          !> LASYF_ROOK is an auxiliary routine called by CSYTRF_ROOK. It uses
+          !> blocked code (calling Level 3 BLAS) to update the submatrix
+          !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+          interface lasyf_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: w(ldw,*)
+               end subroutine clasyf_rook
+#else 
+               module procedure stdlib_clasyf_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: w(ldw,*)
+               end subroutine dlasyf_rook
+#else 
+               module procedure stdlib_dlasyf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlasyf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: w(ldw,*)
+               end subroutine slasyf_rook
+#else 
+               module procedure stdlib_slasyf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlasyf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,kb,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: w(ldw,*)
+               end subroutine zlasyf_rook
+#else 
+               module procedure stdlib_zlasyf_rook
+#endif
+          end interface lasyf_rook
+
+
+
+          !> LATBS: solves one of the triangular systems
+          !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+          !> with scaling to prevent overflow, where A is an upper or lower
+          !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+          !> are n-element vectors, and s is a scaling factor, usually less than
+          !> or equal to 1, chosen so that the components of x will be less than
+          !> the overflow threshold.  If the unscaled problem will not cause
+          !> overflow, the Level 2 BLAS routine CTBSV is called.  If the matrix A
+          !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+          !> non-trivial solution to A*x = 0 is returned.
+          interface latbs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(inout) :: cnorm(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine clatbs
+#else 
+               module procedure stdlib_clatbs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(inout) :: cnorm(*),x(*)
+               end subroutine dlatbs
+#else 
+               module procedure stdlib_dlatbs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatbs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(inout) :: cnorm(*),x(*)
+               end subroutine slatbs
+#else 
+               module procedure stdlib_slatbs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatbs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(inout) :: cnorm(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine zlatbs
+#else 
+               module procedure stdlib_zlatbs
+#endif
+          end interface latbs
+
+
+
+          !> LATDF: computes the contribution to the reciprocal Dif-estimate
+          !> by solving for x in Z * x = b, where b is chosen such that the norm
+          !> of x is as large as possible. It is assumed that LU decomposition
+          !> of Z has been computed by CGETC2. On entry RHS = f holds the
+          !> contribution from earlier solved sub-systems, and on return RHS = x.
+          !> The factorization of Z returned by CGETC2 has the form
+          !> Z = P * L * U * Q, where P and Q are permutation matrices. L is lower
+          !> triangular with unit diagonal elements and U is upper triangular.
+          interface latdf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,ldz,n,ipiv(*),jpiv(*)
+                    real(sp), intent(inout) :: rdscal,rdsum
+                    complex(sp), intent(inout) :: rhs(*),z(ldz,*)
+               end subroutine clatdf
+#else 
+               module procedure stdlib_clatdf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,ldz,n,ipiv(*),jpiv(*)
+                    real(dp), intent(inout) :: rdscal,rdsum,rhs(*),z(ldz,*)
+               end subroutine dlatdf
+#else 
+               module procedure stdlib_dlatdf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatdf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,ldz,n,ipiv(*),jpiv(*)
+                    real(sp), intent(inout) :: rdscal,rdsum,rhs(*),z(ldz,*)
+               end subroutine slatdf
+#else 
+               module procedure stdlib_slatdf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatdf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ijob,ldz,n,ipiv(*),jpiv(*)
+                    real(dp), intent(inout) :: rdscal,rdsum
+                    complex(dp), intent(inout) :: rhs(*),z(ldz,*)
+               end subroutine zlatdf
+#else 
+               module procedure stdlib_zlatdf
+#endif
+          end interface latdf
+
+
+
+          !> LATPS: solves one of the triangular systems
+          !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+          !> with scaling to prevent overflow, where A is an upper or lower
+          !> triangular matrix stored in packed form.  Here A**T denotes the
+          !> transpose of A, A**H denotes the conjugate transpose of A, x and b
+          !> are n-element vectors, and s is a scaling factor, usually less than
+          !> or equal to 1, chosen so that the components of x will be less than
+          !> the overflow threshold.  If the unscaled problem will not cause
+          !> overflow, the Level 2 BLAS routine CTPSV is called. If the matrix A
+          !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+          !> non-trivial solution to A*x = 0 is returned.
+          interface latps
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(inout) :: cnorm(*)
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine clatps
+#else 
+               module procedure stdlib_clatps
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: cnorm(*),x(*)
+               end subroutine dlatps
+#else 
+               module procedure stdlib_dlatps
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatps
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: cnorm(*),x(*)
+               end subroutine slatps
+#else 
+               module procedure stdlib_slatps
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatps
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(inout) :: cnorm(*)
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine zlatps
+#else 
+               module procedure stdlib_zlatps
+#endif
+          end interface latps
+
+
+
+          !> LATRD: reduces NB rows and columns of a complex Hermitian matrix A to
+          !> Hermitian tridiagonal form by a unitary similarity
+          !> transformation Q**H * A * Q, and returns the matrices V and W which are
+          !> needed to apply the transformation to the unreduced part of A.
+          !> If UPLO = 'U', LATRD reduces the last NB rows and columns of a
+          !> matrix, of which the upper triangle is supplied;
+          !> if UPLO = 'L', LATRD reduces the first NB rows and columns of a
+          !> matrix, of which the lower triangle is supplied.
+          !> This is an auxiliary routine called by CHETRD.
+          interface latrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(sp), intent(out) :: e(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),w(ldw,*)
+               end subroutine clatrd
+#else 
+               module procedure stdlib_clatrd
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: e(*),tau(*),w(ldw,*)
+               end subroutine dlatrd
+#else 
+               module procedure stdlib_dlatrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: e(*),tau(*),w(ldw,*)
+               end subroutine slatrd
+#else 
+               module procedure stdlib_slatrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: lda,ldw,n,nb
+                    real(dp), intent(out) :: e(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),w(ldw,*)
+               end subroutine zlatrd
+#else 
+               module procedure stdlib_zlatrd
+#endif
+          end interface latrd
+
+
+
+          !> LATRS: solves one of the triangular systems
+          !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+          !> with scaling to prevent overflow.  Here A is an upper or lower
+          !> triangular matrix, A**T denotes the transpose of A, A**H denotes the
+          !> conjugate transpose of A, x and b are n-element vectors, and s is a
+          !> scaling factor, usually less than or equal to 1, chosen so that the
+          !> components of x will be less than the overflow threshold.  If the
+          !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+          !> CTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+          !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+          interface latrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(inout) :: cnorm(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: x(*)
+               end subroutine clatrs
+#else 
+               module procedure stdlib_clatrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: cnorm(*),x(*)
+               end subroutine dlatrs
+#else 
+               module procedure stdlib_dlatrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: cnorm(*),x(*)
+               end subroutine slatrs
+#else 
+               module procedure stdlib_slatrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,normin,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(inout) :: cnorm(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: x(*)
+               end subroutine zlatrs
+#else 
+               module procedure stdlib_zlatrs
+#endif
+          end interface latrs
+
+
+
+          !> LATRZ: factors the M-by-(M+L) complex upper trapezoidal matrix
+          !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z by means
+          !> of unitary transformations, where  Z is an (M+L)-by-(M+L) unitary
+          !> matrix and, R and A1 are M-by-M upper triangular matrices.
+          interface latrz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatrz( m, n, l, a, lda, tau, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: l,lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine clatrz
+#else 
+               module procedure stdlib_clatrz
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatrz( m, n, l, a, lda, tau, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: l,lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dlatrz
+#else 
+               module procedure stdlib_dlatrz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatrz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatrz( m, n, l, a, lda, tau, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: l,lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine slatrz
+#else 
+               module procedure stdlib_slatrz
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatrz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatrz( m, n, l, a, lda, tau, work )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: l,lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine zlatrz
+#else 
+               module procedure stdlib_zlatrz
+#endif
+          end interface latrz
+
+
+
+          !> LATSQR: computes a blocked Tall-Skinny QR factorization of
+          !> a complex M-by-N matrix A for M >= N:
+          !> A = Q * ( R ),
+          !> ( 0 )
+          !> where:
+          !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+          !> form in the elements below the diagonal of the array A and in
+          !> the elements of the array T;
+          !> R is an upper-triangular N-by-N matrix, stored on exit in
+          !> the elements on and above the diagonal of the array A.
+          !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+          interface latsqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,ldt,lwork
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*),t(ldt,*)
+               end subroutine clatsqr
+#else 
+               module procedure stdlib_clatsqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,ldt,lwork
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*),t(ldt,*)
+               end subroutine dlatsqr
+#else 
+               module procedure stdlib_dlatsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlatsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,ldt,lwork
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*),t(ldt,*)
+               end subroutine slatsqr
+#else 
+               module procedure stdlib_slatsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlatsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n,mb,nb,ldt,lwork
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*),t(ldt,*)
+               end subroutine zlatsqr
+#else 
+               module procedure stdlib_zlatsqr
+#endif
+          end interface latsqr
+
+
+
+          !> LAUNHR_COL_GETRFNP: computes the modified LU factorization without
+          !> pivoting of a complex general M-by-N matrix A. The factorization has
+          !> the form:
+          !> A - S = L * U,
+          !> where:
+          !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+          !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+          !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+          !> i-1 steps of Gaussian elimination. This means that the diagonal
+          !> element at each step of "modified" Gaussian elimination is
+          !> at least one in absolute value (so that division-by-zero not
+          !> not possible during the division by the diagonal element);
+          !> L is a M-by-N lower triangular matrix with unit diagonal elements
+          !> (lower trapezoidal if M > N);
+          !> and U is a M-by-N upper triangular matrix
+          !> (upper trapezoidal if M < N).
+          !> This routine is an auxiliary routine used in the Householder
+          !> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+          !> applied to an M-by-N matrix A with orthonormal columns, where each
+          !> element is bounded by one in absolute value. With the choice of
+          !> the matrix S above, one can show that the diagonal element at each
+          !> step of Gaussian elimination is the largest (in absolute value) in
+          !> the column on or below the diagonal, so that no pivoting is required
+          !> for numerical stability [1].
+          !> For more details on the Householder reconstruction algorithm,
+          !> including the modified LU factorization, see [1].
+          !> This is the blocked right-looking version of the algorithm,
+          !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+          !> this routine calls the recursive routine LAUNHR_COL_GETRFNP2.
+          !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+          !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+          !> E. Solomonik, J. Parallel Distrib. Comput.,
+          !> vol. 85, pp. 3-31, 2015.
+          interface launhr_col_getrfnp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine claunhr_col_getrfnp( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: d(*)
+               end subroutine claunhr_col_getrfnp
+#else 
+               module procedure stdlib_claunhr_col_getrfnp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaunhr_col_getrfnp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlaunhr_col_getrfnp( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: d(*)
+               end subroutine zlaunhr_col_getrfnp
+#else 
+               module procedure stdlib_zlaunhr_col_getrfnp
+#endif
+          end interface launhr_col_getrfnp
+
+
+
+          !> LAUNHR_COL_GETRFNP2: computes the modified LU factorization without
+          !> pivoting of a complex general M-by-N matrix A. The factorization has
+          !> the form:
+          !> A - S = L * U,
+          !> where:
+          !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+          !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+          !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+          !> i-1 steps of Gaussian elimination. This means that the diagonal
+          !> element at each step of "modified" Gaussian elimination is at
+          !> least one in absolute value (so that division-by-zero not
+          !> possible during the division by the diagonal element);
+          !> L is a M-by-N lower triangular matrix with unit diagonal elements
+          !> (lower trapezoidal if M > N);
+          !> and U is a M-by-N upper triangular matrix
+          !> (upper trapezoidal if M < N).
+          !> This routine is an auxiliary routine used in the Householder
+          !> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+          !> applied to an M-by-N matrix A with orthonormal columns, where each
+          !> element is bounded by one in absolute value. With the choice of
+          !> the matrix S above, one can show that the diagonal element at each
+          !> step of Gaussian elimination is the largest (in absolute value) in
+          !> the column on or below the diagonal, so that no pivoting is required
+          !> for numerical stability [1].
+          !> For more details on the Householder reconstruction algorithm,
+          !> including the modified LU factorization, see [1].
+          !> This is the recursive version of the LU factorization algorithm.
+          !> Denote A - S by B. The algorithm divides the matrix B into four
+          !> submatrices:
+          !> [  B11 | B12  ]  where B11 is n1 by n1,
+          !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+          !> [  B21 | B22  ]        B12 is n1 by n2,
+          !> B22 is (m-n1) by n2,
+          !> with n1 = min(m,n)/2, n2 = n-n1.
+          !> The subroutine calls itself to factor B11, solves for B21,
+          !> solves for B12, updates B22, then calls itself to factor B22.
+          !> For more details on the recursive LU algorithm, see [2].
+          !> LAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+          !> routine CLAUNHR_COL_GETRFNP, which uses blocked code calling
+          !> Level 3 BLAS to update the submatrix. However, LAUNHR_COL_GETRFNP2
+          !> is self-sufficient and can be used without CLAUNHR_COL_GETRFNP.
+          !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+          !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+          !> E. Solomonik, J. Parallel Distrib. Comput.,
+          !> vol. 85, pp. 3-31, 2015.
+          !> [2] "Recursion leads to automatic variable blocking for dense linear
+          !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+          !> vol. 41, no. 6, pp. 737-755, 1997.
+          interface launhr_col_getrfnp2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine claunhr_col_getrfnp2( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: d(*)
+               end subroutine claunhr_col_getrfnp2
+#else 
+               module procedure stdlib_claunhr_col_getrfnp2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlaunhr_col_getrfnp2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine zlaunhr_col_getrfnp2( m, n, a, lda, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: d(*)
+               end subroutine zlaunhr_col_getrfnp2
+#else 
+               module procedure stdlib_zlaunhr_col_getrfnp2
+#endif
+          end interface launhr_col_getrfnp2
+
+
+
+          !> LAUUM: computes the product U * U**H or L**H * L, where the triangular
+          !> factor U or L is stored in the upper or lower triangular part of
+          !> the array A.
+          !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+          !> overwriting the factor U in A.
+          !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+          !> overwriting the factor L in A.
+          !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+          interface lauum
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine clauum( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine clauum
+#else 
+               module procedure stdlib_clauum
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dlauum( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dlauum
+#else 
+               module procedure stdlib_dlauum
+#endif
+#:if WITH_QP
+               module procedure stdlib_qlauum
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine slauum( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine slauum
+#else 
+               module procedure stdlib_slauum
+#endif
+#:if WITH_QP
+               module procedure stdlib_wlauum
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zlauum( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zlauum
+#else 
+               module procedure stdlib_zlauum
+#endif
+          end interface lauum
+
+
+
+          !> OPGTR: generates a real orthogonal matrix Q which is defined as the
+          !> product of n-1 elementary reflectors H(i) of order n, as returned by
+          !> DSPTRD using packed storage:
+          !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+          interface opgtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dopgtr( uplo, n, ap, tau, q, ldq, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,n
+                    real(dp), intent(in) :: ap(*),tau(*)
+                    real(dp), intent(out) :: q(ldq,*),work(*)
+               end subroutine dopgtr
+#else 
+               module procedure stdlib_dopgtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qopgtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sopgtr( uplo, n, ap, tau, q, ldq, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,n
+                    real(sp), intent(in) :: ap(*),tau(*)
+                    real(sp), intent(out) :: q(ldq,*),work(*)
+               end subroutine sopgtr
+#else 
+               module procedure stdlib_sopgtr
+#endif
+          end interface opgtr
+
+
+
+          !> OPMTR: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> nq-1 elementary reflectors, as returned by DSPTRD using packed
+          !> storage:
+          !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+          interface opmtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dopmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,m,n
+                    real(dp), intent(inout) :: ap(*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dopmtr
+#else 
+               module procedure stdlib_dopmtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qopmtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sopmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,m,n
+                    real(sp), intent(inout) :: ap(*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sopmtr
+#else 
+               module procedure stdlib_sopmtr
+#endif
+          end interface opmtr
+
+
+
+          !> ORBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+          !> partitioned orthogonal matrix X:
+          !> [ B11 | B12 0  0 ]
+          !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**T
+          !> X = [-----------] = [---------] [----------------] [---------]   .
+          !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+          !> [  0  |  0  0  I ]
+          !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+          !> not the case, then X must be transposed and/or permuted. This can be
+          !> done in constant time using the TRANS and SIGNS options. See DORCSD
+          !> for details.)
+          !> The orthogonal matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+          !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+          !> represented implicitly by Householder vectors.
+          !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface orbdb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                         ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: signs,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldx11,ldx12,ldx21,ldx22,lwork,m,p,q
+                    real(dp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),tauq2(*),&
+                              work(*)
+                    real(dp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(ldx22,*)
+                              
+               end subroutine dorbdb
+#else 
+               module procedure stdlib_dorbdb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                         ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: signs,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldx11,ldx12,ldx21,ldx22,lwork,m,p,q
+                    real(sp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),tauq2(*),&
+                              work(*)
+                    real(sp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(ldx22,*)
+                              
+               end subroutine sorbdb
+#else 
+               module procedure stdlib_sorbdb
+#endif
+          end interface orbdb
+
+
+
+          !> ORBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+          !> M-P, or M-Q. Routines DORBDB2, DORBDB3, and DORBDB4 handle cases in
+          !> which Q is not the minimum dimension.
+          !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+          !> angles THETA, PHI.
+          interface orbdb1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine dorbdb1
+#else 
+               module procedure stdlib_dorbdb1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine sorbdb1
+#else 
+               module procedure stdlib_sorbdb1
+#endif
+          end interface orbdb1
+
+
+
+          !> ORBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+          !> Q, or M-Q. Routines DORBDB1, DORBDB3, and DORBDB4 handle cases in
+          !> which P is not the minimum dimension.
+          !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+          !> angles THETA, PHI.
+          interface orbdb2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine dorbdb2
+#else 
+               module procedure stdlib_dorbdb2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine sorbdb2
+#else 
+               module procedure stdlib_sorbdb2
+#endif
+          end interface orbdb2
+
+
+
+          !> ORBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+          !> Q, or M-Q. Routines DORBDB1, DORBDB2, and DORBDB4 handle cases in
+          !> which M-P is not the minimum dimension.
+          !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface orbdb3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine dorbdb3
+#else 
+               module procedure stdlib_dorbdb3
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    real(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine sorbdb3
+#else 
+               module procedure stdlib_sorbdb3
+#endif
+          end interface orbdb3
+
+
+
+          !> ORBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+          !> M-P, or Q. Routines DORBDB1, DORBDB2, and DORBDB3 handle cases in
+          !> which M-Q is not the minimum dimension.
+          !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface orbdb4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, phantom, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*),phantom(*),taup1(*),taup2(*),tauq1(*)&
+                              ,work(*)
+                    real(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine dorbdb4
+#else 
+               module procedure stdlib_dorbdb4
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, phantom, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*),phantom(*),taup1(*),taup2(*),tauq1(*)&
+                              ,work(*)
+                    real(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine sorbdb4
+#else 
+               module procedure stdlib_sorbdb4
+#endif
+          end interface orbdb4
+
+
+
+          !> ORBDB5: orthogonalizes the column vector
+          !> X = [ X1 ]
+          !> [ X2 ]
+          !> with respect to the columns of
+          !> Q = [ Q1 ] .
+          !> [ Q2 ]
+          !> The columns of Q must be orthonormal.
+          !> If the projection is zero according to Kahan's "twice is enough"
+          !> criterion, then some other vector from the orthogonal complement
+          !> is returned. This vector is chosen in an arbitrary but deterministic
+          !> way.
+          interface orbdb5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    real(dp), intent(out) :: work(*)
+                    real(dp), intent(inout) :: x1(*),x2(*)
+               end subroutine dorbdb5
+#else 
+               module procedure stdlib_dorbdb5
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    real(sp), intent(out) :: work(*)
+                    real(sp), intent(inout) :: x1(*),x2(*)
+               end subroutine sorbdb5
+#else 
+               module procedure stdlib_sorbdb5
+#endif
+          end interface orbdb5
+
+
+
+          !> ORBDB6: orthogonalizes the column vector
+          !> X = [ X1 ]
+          !> [ X2 ]
+          !> with respect to the columns of
+          !> Q = [ Q1 ] .
+          !> [ Q2 ]
+          !> The columns of Q must be orthonormal.
+          !> If the projection is zero according to Kahan's "twice is enough"
+          !> criterion, then the zero vector is returned.
+          interface orbdb6
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    real(dp), intent(out) :: work(*)
+                    real(dp), intent(inout) :: x1(*),x2(*)
+               end subroutine dorbdb6
+#else 
+               module procedure stdlib_dorbdb6
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorbdb6
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    real(sp), intent(out) :: work(*)
+                    real(sp), intent(inout) :: x1(*),x2(*)
+               end subroutine sorbdb6
+#else 
+               module procedure stdlib_sorbdb6
+#endif
+          end interface orbdb6
+
+
+
+          !> ORCSD: computes the CS decomposition of an M-by-M partitioned
+          !> orthogonal matrix X:
+          !> [  I  0  0 |  0  0  0 ]
+          !> [  0  C  0 |  0 -S  0 ]
+          !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
+          !> X = [-----------] = [---------] [---------------------] [---------]   .
+          !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+          !> [  0  S  0 |  0  C  0 ]
+          !> [  0  0  I |  0  0  0 ]
+          !> X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
+          !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+          !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+          !> which R = MIN(P,M-P,Q,M-Q).
+          interface orcsd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine dorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, &
+               x11, ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, &
+                         ldv1t, v2t,ldv2t, work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,signs,trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,ldx11,ldx12,ldx21,ldx22,&
+                              lwork,m,p,q
+                    real(dp), intent(out) :: theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(&
+                              ldv2t,*),work(*)
+                    real(dp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(ldx22,*)
+                              
+               end subroutine dorcsd
+#else 
+               module procedure stdlib_dorcsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorcsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine sorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, &
+               x11, ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, &
+                         ldv1t, v2t,ldv2t, work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,signs,trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,ldx11,ldx12,ldx21,ldx22,&
+                              lwork,m,p,q
+                    real(sp), intent(out) :: theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(&
+                              ldv2t,*),work(*)
+                    real(sp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(ldx22,*)
+                              
+               end subroutine sorcsd
+#else 
+               module procedure stdlib_sorcsd
+#endif
+          end interface orcsd
+
+
+
+          !> ORCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+          !> orthonormal columns that has been partitioned into a 2-by-1 block
+          !> structure:
+          !> [  I1 0  0 ]
+          !> [  0  C  0 ]
+          !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+          !> X = [-----] = [---------] [----------] V1**T .
+          !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+          !> [  0  S  0 ]
+          !> [  0  0  I2]
+          !> X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
+          !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+          !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+          !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+          !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+          interface orcsd2by1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta,&
+                          u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,lwork,ldx11,ldx21,m,p,q
+                    real(dp), intent(out) :: theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),work(*)
+                              
+                    real(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine dorcsd2by1
+#else 
+               module procedure stdlib_dorcsd2by1
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorcsd2by1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta,&
+                          u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,lwork,ldx11,ldx21,m,p,q
+                    real(sp), intent(out) :: theta(*),u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),work(*)
+                              
+                    real(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine sorcsd2by1
+#else 
+               module procedure stdlib_sorcsd2by1
+#endif
+          end interface orcsd2by1
+
+
+
+          !> ORG2L: generates an m by n real matrix Q with orthonormal columns,
+          !> which is defined as the last n columns of a product of k elementary
+          !> reflectors of order m
+          !> Q  =  H(k) . . . H(2) H(1)
+          !> as returned by DGEQLF.
+          interface org2l
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorg2l( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorg2l
+#else 
+               module procedure stdlib_dorg2l
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorg2l
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorg2l( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorg2l
+#else 
+               module procedure stdlib_sorg2l
+#endif
+          end interface org2l
+
+
+
+          !> ORG2R: generates an m by n real matrix Q with orthonormal columns,
+          !> which is defined as the first n columns of a product of k elementary
+          !> reflectors of order m
+          !> Q  =  H(1) H(2) . . . H(k)
+          !> as returned by DGEQRF.
+          interface org2r
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorg2r( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorg2r
+#else 
+               module procedure stdlib_dorg2r
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorg2r
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorg2r( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorg2r
+#else 
+               module procedure stdlib_sorg2r
+#endif
+          end interface org2r
+
+
+
+          !> ORGBR: generates one of the real orthogonal matrices Q or P**T
+          !> determined by DGEBRD when reducing a real matrix A to bidiagonal
+          !> form: A = Q * B * P**T.  Q and P**T are defined as products of
+          !> elementary reflectors H(i) or G(i) respectively.
+          !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+          !> is of order M:
+          !> if m >= k, Q = H(1) H(2) . . . H(k) and ORGBR returns the first n
+          !> columns of Q, where m >= n >= k;
+          !> if m < k, Q = H(1) H(2) . . . H(m-1) and ORGBR returns Q as an
+          !> M-by-M matrix.
+          !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+          !> is of order N:
+          !> if k < n, P**T = G(k) . . . G(2) G(1) and ORGBR returns the first m
+          !> rows of P**T, where n >= m >= k;
+          !> if k >= n, P**T = G(n-1) . . . G(2) G(1) and ORGBR returns P**T as
+          !> an N-by-N matrix.
+          interface orgbr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgbr
+#else 
+               module procedure stdlib_dorgbr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgbr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgbr
+#else 
+               module procedure stdlib_sorgbr
+#endif
+          end interface orgbr
+
+
+
+          !> ORGHR: generates a real orthogonal matrix Q which is defined as the
+          !> product of IHI-ILO elementary reflectors of order N, as returned by
+          !> DGEHRD:
+          !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+          interface orghr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorghr
+#else 
+               module procedure stdlib_dorghr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorghr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorghr
+#else 
+               module procedure stdlib_sorghr
+#endif
+          end interface orghr
+
+
+
+          !> ORGLQ: generates an M-by-N real matrix Q with orthonormal rows,
+          !> which is defined as the first M rows of a product of K elementary
+          !> reflectors of order N
+          !> Q  =  H(k) . . . H(2) H(1)
+          !> as returned by DGELQF.
+          interface orglq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorglq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorglq
+#else 
+               module procedure stdlib_dorglq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorglq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorglq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorglq
+#else 
+               module procedure stdlib_sorglq
+#endif
+          end interface orglq
+
+
+
+          !> ORGQL: generates an M-by-N real matrix Q with orthonormal columns,
+          !> which is defined as the last N columns of a product of K elementary
+          !> reflectors of order M
+          !> Q  =  H(k) . . . H(2) H(1)
+          !> as returned by DGEQLF.
+          interface orgql
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgql( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgql
+#else 
+               module procedure stdlib_dorgql
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgql
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgql( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgql
+#else 
+               module procedure stdlib_sorgql
+#endif
+          end interface orgql
+
+
+
+          !> ORGQR: generates an M-by-N real matrix Q with orthonormal columns,
+          !> which is defined as the first N columns of a product of K elementary
+          !> reflectors of order M
+          !> Q  =  H(1) H(2) . . . H(k)
+          !> as returned by DGEQRF.
+          interface orgqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgqr( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgqr
+#else 
+               module procedure stdlib_dorgqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgqr( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgqr
+#else 
+               module procedure stdlib_sorgqr
+#endif
+          end interface orgqr
+
+
+
+          !> ORGRQ: generates an M-by-N real matrix Q with orthonormal rows,
+          !> which is defined as the last M rows of a product of K elementary
+          !> reflectors of order N
+          !> Q  =  H(1) H(2) . . . H(k)
+          !> as returned by DGERQF.
+          interface orgrq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgrq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgrq
+#else 
+               module procedure stdlib_dorgrq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgrq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgrq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgrq
+#else 
+               module procedure stdlib_sorgrq
+#endif
+          end interface orgrq
+
+
+
+          !> ORGTR: generates a real orthogonal matrix Q which is defined as the
+          !> product of n-1 elementary reflectors of order N, as returned by
+          !> DSYTRD:
+          !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+          interface orgtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgtr( uplo, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgtr
+#else 
+               module procedure stdlib_dorgtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgtr( uplo, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgtr
+#else 
+               module procedure stdlib_sorgtr
+#endif
+          end interface orgtr
+
+
+
+          !> ORGTSQR: generates an M-by-N real matrix Q_out with orthonormal columns,
+          !> which are the first N columns of a product of real orthogonal
+          !> matrices of order M which are returned by DLATSQR
+          !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+          !> See the documentation for DLATSQR.
+          interface orgtsqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgtsqr
+#else 
+               module procedure stdlib_dorgtsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgtsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgtsqr
+#else 
+               module procedure stdlib_sorgtsqr
+#endif
+          end interface orgtsqr
+
+
+
+          !> ORGTSQR_ROW: generates an M-by-N real matrix Q_out with
+          !> orthonormal columns from the output of DLATSQR. These N orthonormal
+          !> columns are the first N columns of a product of complex unitary
+          !> matrices Q(k)_in of order M, which are returned by DLATSQR in
+          !> a special format.
+          !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+          !> The input matrices Q(k)_in are stored in row and column blocks in A.
+          !> See the documentation of DLATSQR for more details on the format of
+          !> Q(k)_in, where each Q(k)_in is represented by block Householder
+          !> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+          !> where the computation is performed on each individual block. The
+          !> algorithm first sweeps NB-sized column blocks from the right to left
+          !> starting in the bottom row block and continues to the top row block
+          !> (hence _ROW in the routine name). This sweep is in reverse order of
+          !> the order in which DLATSQR generates the output blocks.
+          interface orgtsqr_row
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorgtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorgtsqr_row
+#else 
+               module procedure stdlib_dorgtsqr_row
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorgtsqr_row
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorgtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorgtsqr_row
+#else 
+               module procedure stdlib_sorgtsqr_row
+#endif
+          end interface orgtsqr_row
+
+
+
+          !> ORHR_COL: takes an M-by-N real matrix Q_in with orthonormal columns
+          !> as input, stored in A, and performs Householder Reconstruction (HR),
+          !> i.e. reconstructs Householder vectors V(i) implicitly representing
+          !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+          !> where S is an N-by-N diagonal matrix with diagonal entries
+          !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+          !> stored in A on output, and the diagonal entries of S are stored in D.
+          !> Block reflectors are also returned in T
+          !> (same output format as DGEQRT).
+          interface orhr_col
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorhr_col( m, n, nb, a, lda, t, ldt, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*),t(ldt,*)
+               end subroutine dorhr_col
+#else 
+               module procedure stdlib_dorhr_col
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorhr_col
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorhr_col( m, n, nb, a, lda, t, ldt, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*),t(ldt,*)
+               end subroutine sorhr_col
+#else 
+               module procedure stdlib_sorhr_col
+#endif
+          end interface orhr_col
+
+
+
+          !> ORM2L: overwrites the general real m by n matrix C with
+          !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+          !> Q**T * C  if SIDE = 'L' and TRANS = 'T', or
+          !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+          !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k) . . . H(2) H(1)
+          !> as returned by DGEQLF. Q is of order m if SIDE = 'L' and of order n
+          !> if SIDE = 'R'.
+          interface orm2l
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorm2l
+#else 
+               module procedure stdlib_dorm2l
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorm2l
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorm2l
+#else 
+               module procedure stdlib_sorm2l
+#endif
+          end interface orm2l
+
+
+
+          !> ORM2R: overwrites the general real m by n matrix C with
+          !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+          !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+          !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+          !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by DGEQRF. Q is of order m if SIDE = 'L' and of order n
+          !> if SIDE = 'R'.
+          interface orm2r
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dorm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dorm2r
+#else 
+               module procedure stdlib_dorm2r
+#endif
+#:if WITH_QP
+               module procedure stdlib_qorm2r
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sorm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sorm2r
+#else 
+               module procedure stdlib_sorm2r
+#endif
+          end interface orm2r
+
+
+
+          !> If VECT = 'Q', ORMBR: overwrites the general real M-by-N matrix C
+          !> with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> If VECT = 'P', ORMBR overwrites the general real M-by-N matrix C
+          !> with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      P * C          C * P
+          !> TRANS = 'T':      P**T * C       C * P**T
+          !> Here Q and P**T are the orthogonal matrices determined by DGEBRD when
+          !> reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+          !> P**T are defined as products of elementary reflectors H(i) and G(i)
+          !> respectively.
+          !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+          !> order of the orthogonal matrix Q or P**T that is applied.
+          !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+          !> if nq >= k, Q = H(1) H(2) . . . H(k);
+          !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+          !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+          !> if k < nq, P = G(1) G(2) . . . G(k);
+          !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+          interface ormbr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormbr
+#else 
+               module procedure stdlib_dormbr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormbr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormbr
+#else 
+               module procedure stdlib_sormbr
+#endif
+          end interface ormbr
+
+
+
+          !> ORMHR: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> IHI-ILO elementary reflectors, as returned by DGEHRD:
+          !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+          interface ormhr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldc,lwork,m,n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormhr
+#else 
+               module procedure stdlib_dormhr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormhr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldc,lwork,m,n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormhr
+#else 
+               module procedure stdlib_sormhr
+#endif
+          end interface ormhr
+
+
+
+          !> ORMLQ: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k) . . . H(2) H(1)
+          !> as returned by DGELQF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface ormlq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormlq
+#else 
+               module procedure stdlib_dormlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormlq
+#else 
+               module procedure stdlib_sormlq
+#endif
+          end interface ormlq
+
+
+
+          !> ORMQL: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k) . . . H(2) H(1)
+          !> as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface ormql
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormql
+#else 
+               module procedure stdlib_dormql
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormql
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormql
+#else 
+               module procedure stdlib_sormql
+#endif
+          end interface ormql
+
+
+
+          !> ORMQR: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface ormqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormqr
+#else 
+               module procedure stdlib_dormqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormqr
+#else 
+               module procedure stdlib_sormqr
+#endif
+          end interface ormqr
+
+
+
+          !> ORMRQ: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by DGERQF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface ormrq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormrq
+#else 
+               module procedure stdlib_dormrq
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormrq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormrq
+#else 
+               module procedure stdlib_sormrq
+#endif
+          end interface ormrq
+
+
+
+          !> ORMRZ: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by DTZRZF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface ormrz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,l,lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormrz
+#else 
+               module procedure stdlib_dormrz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormrz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,l,lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormrz
+#else 
+               module procedure stdlib_sormrz
+#endif
+          end interface ormrz
+
+
+
+          !> ORMTR: overwrites the general real M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'T':      Q**T * C       C * Q**T
+          !> where Q is a real orthogonal matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> nq-1 elementary reflectors, as returned by DSYTRD:
+          !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+          interface ormtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dormtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldc,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(dp), intent(in) :: tau(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dormtr
+#else 
+               module procedure stdlib_dormtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qormtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sormtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldc,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    real(sp), intent(in) :: tau(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sormtr
+#else 
+               module procedure stdlib_sormtr
+#endif
+          end interface ormtr
+
+
+
+          !> PBCON: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex Hermitian positive definite band matrix using
+          !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+          !> CPBTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface pbcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cpbcon
+#else 
+               module procedure stdlib_cpbcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(in) :: anorm,ab(ldab,*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dpbcon
+#else 
+               module procedure stdlib_dpbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(in) :: anorm,ab(ldab,*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine spbcon
+#else 
+               module procedure stdlib_spbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zpbcon
+#else 
+               module procedure stdlib_zpbcon
+#endif
+          end interface pbcon
+
+
+
+          !> PBEQU: computes row and column scalings intended to equilibrate a
+          !> Hermitian positive definite band matrix A and reduce its condition
+          !> number (with respect to the two-norm).  S contains the scale factors,
+          !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+          !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+          !> choice of S puts the condition number of B within a factor N of the
+          !> smallest possible condition number over all possible diagonal
+          !> scalings.
+          interface pbequ
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+               end subroutine cpbequ
+#else 
+               module procedure stdlib_cpbequ
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    real(dp), intent(in) :: ab(ldab,*)
+               end subroutine dpbequ
+#else 
+               module procedure stdlib_dpbequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    real(sp), intent(in) :: ab(ldab,*)
+               end subroutine spbequ
+#else 
+               module procedure stdlib_spbequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+               end subroutine zpbequ
+#else 
+               module procedure stdlib_zpbequ
+#endif
+          end interface pbequ
+
+
+
+          !> PBRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian positive definite
+          !> and banded, and provides error bounds and backward error estimates
+          !> for the solution.
+          interface pbrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldafb,ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cpbrfs
+#else 
+               module procedure stdlib_cpbrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldafb,ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dpbrfs
+#else 
+               module procedure stdlib_dpbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldafb,ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine spbrfs
+#else 
+               module procedure stdlib_spbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldafb,ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*),afb(ldafb,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zpbrfs
+#else 
+               module procedure stdlib_zpbrfs
+#endif
+          end interface pbrfs
+
+
+
+          !> PBSTF: computes a split Cholesky factorization of a complex
+          !> Hermitian positive definite band matrix A.
+          !> This routine is designed to be used in conjunction with CHBGST.
+          !> The factorization has the form  A = S**H*S  where S is a band matrix
+          !> of the same bandwidth as A and the following structure:
+          !> S = ( U    )
+          !> ( M  L )
+          !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+          !> triangular of order n-m.
+          interface pbstf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbstf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine cpbstf
+#else 
+               module procedure stdlib_cpbstf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbstf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dpbstf
+#else 
+               module procedure stdlib_dpbstf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbstf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbstf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine spbstf
+#else 
+               module procedure stdlib_spbstf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbstf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbstf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zpbstf
+#else 
+               module procedure stdlib_zpbstf
+#endif
+          end interface pbstf
+
+
+
+          !> PBSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian positive definite band matrix and X
+          !> and B are N-by-NRHS matrices.
+          !> The Cholesky decomposition is used to factor A as
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular band matrix, and L is a lower
+          !> triangular band matrix, with the same number of superdiagonals or
+          !> subdiagonals as A.  The factored form of A is then used to solve the
+          !> system of equations A * X = B.
+          interface pbsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(sp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine cpbsv
+#else 
+               module procedure stdlib_cpbsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(dp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine dpbsv
+#else 
+               module procedure stdlib_dpbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(sp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine spbsv
+#else 
+               module procedure stdlib_spbsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(dp), intent(inout) :: ab(ldab,*),b(ldb,*)
+               end subroutine zpbsv
+#else 
+               module procedure stdlib_zpbsv
+#endif
+          end interface pbsv
+
+
+
+          !> PBTRF: computes the Cholesky factorization of a complex Hermitian
+          !> positive definite band matrix A.
+          !> The factorization has the form
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular.
+          interface pbtrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbtrf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    complex(sp), intent(inout) :: ab(ldab,*)
+               end subroutine cpbtrf
+#else 
+               module procedure stdlib_cpbtrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbtrf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dpbtrf
+#else 
+               module procedure stdlib_dpbtrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbtrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbtrf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine spbtrf
+#else 
+               module procedure stdlib_spbtrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbtrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbtrf( uplo, n, kd, ab, ldab, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    complex(dp), intent(inout) :: ab(ldab,*)
+               end subroutine zpbtrf
+#else 
+               module procedure stdlib_zpbtrf
+#endif
+          end interface pbtrf
+
+
+
+          !> PBTRS: solves a system of linear equations A*X = B with a Hermitian
+          !> positive definite band matrix A using the Cholesky factorization
+          !> A = U**H*U or A = L*L**H computed by CPBTRF.
+          interface pbtrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cpbtrs
+#else 
+               module procedure stdlib_cpbtrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dpbtrs
+#else 
+               module procedure stdlib_dpbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine spbtrs
+#else 
+               module procedure stdlib_spbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zpbtrs
+#else 
+               module procedure stdlib_zpbtrs
+#endif
+          end interface pbtrs
+
+
+
+          !> PFTRF: computes the Cholesky factorization of a complex Hermitian
+          !> positive definite matrix A.
+          !> The factorization has the form
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular.
+          !> This is the block version of the algorithm, calling Level 3 BLAS.
+          interface pftrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpftrf( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(0:*)
+               end subroutine cpftrf
+#else 
+               module procedure stdlib_cpftrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpftrf( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(inout) :: a(0:*)
+               end subroutine dpftrf
+#else 
+               module procedure stdlib_dpftrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpftrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spftrf( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(inout) :: a(0:*)
+               end subroutine spftrf
+#else 
+               module procedure stdlib_spftrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpftrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpftrf( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(in) :: n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(0:*)
+               end subroutine zpftrf
+#else 
+               module procedure stdlib_zpftrf
+#endif
+          end interface pftrf
+
+
+
+          !> PFTRI: computes the inverse of a complex Hermitian positive definite
+          !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+          !> computed by CPFTRF.
+          interface pftri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpftri( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: a(0:*)
+               end subroutine cpftri
+#else 
+               module procedure stdlib_cpftri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpftri( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: a(0:*)
+               end subroutine dpftri
+#else 
+               module procedure stdlib_dpftri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpftri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spftri( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: a(0:*)
+               end subroutine spftri
+#else 
+               module procedure stdlib_spftri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpftri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpftri( transr, uplo, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: a(0:*)
+               end subroutine zpftri
+#else 
+               module procedure stdlib_zpftri
+#endif
+          end interface pftri
+
+
+
+          !> PFTRS: solves a system of linear equations A*X = B with a Hermitian
+          !> positive definite matrix A using the Cholesky factorization
+          !> A = U**H*U or A = L*L**H computed by CPFTRF.
+          interface pftrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(in) :: a(0:*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cpftrs
+#else 
+               module procedure stdlib_cpftrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(in) :: a(0:*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dpftrs
+#else 
+               module procedure stdlib_dpftrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpftrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(in) :: a(0:*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine spftrs
+#else 
+               module procedure stdlib_spftrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpftrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(in) :: a(0:*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zpftrs
+#else 
+               module procedure stdlib_zpftrs
+#endif
+          end interface pftrs
+
+
+
+          !> POCON: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex Hermitian positive definite matrix using the
+          !> Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface pocon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpocon( uplo, n, a, lda, anorm, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cpocon
+#else 
+               module procedure stdlib_cpocon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpocon( uplo, n, a, lda, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,work(*)
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dpocon
+#else 
+               module procedure stdlib_dpocon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpocon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spocon( uplo, n, a, lda, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,work(*)
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine spocon
+#else 
+               module procedure stdlib_spocon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpocon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpocon( uplo, n, a, lda, anorm, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zpocon
+#else 
+               module procedure stdlib_zpocon
+#endif
+          end interface pocon
+
+
+
+          !> POEQU: computes row and column scalings intended to equilibrate a
+          !> Hermitian positive definite matrix A and reduce its condition number
+          !> (with respect to the two-norm).  S contains the scale factors,
+          !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+          !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+          !> choice of S puts the condition number of B within a factor N of the
+          !> smallest possible condition number over all possible diagonal
+          !> scalings.
+          interface poequ
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpoequ( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end subroutine cpoequ
+#else 
+               module procedure stdlib_cpoequ
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpoequ( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dpoequ
+#else 
+               module procedure stdlib_dpoequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpoequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spoequ( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine spoequ
+#else 
+               module procedure stdlib_spoequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpoequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpoequ( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end subroutine zpoequ
+#else 
+               module procedure stdlib_zpoequ
+#endif
+          end interface poequ
+
+
+
+          !> POEQUB: computes row and column scalings intended to equilibrate a
+          !> Hermitian positive definite matrix A and reduce its condition number
+          !> (with respect to the two-norm).  S contains the scale factors,
+          !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+          !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+          !> choice of S puts the condition number of B within a factor N of the
+          !> smallest possible condition number over all possible diagonal
+          !> scalings.
+          !> This routine differs from CPOEQU by restricting the scaling factors
+          !> to a power of the radix.  Barring over- and underflow, scaling by
+          !> these factors introduces no additional rounding errors.  However, the
+          !> scaled diagonal entries are no longer approximately 1 but lie
+          !> between sqrt(radix) and 1/sqrt(radix).
+          interface poequb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpoequb( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    complex(sp), intent(in) :: a(lda,*)
+               end subroutine cpoequb
+#else 
+               module procedure stdlib_cpoequb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpoequb( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dpoequb
+#else 
+               module procedure stdlib_dpoequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpoequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spoequb( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine spoequb
+#else 
+               module procedure stdlib_spoequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpoequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpoequb( n, a, lda, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    complex(dp), intent(in) :: a(lda,*)
+               end subroutine zpoequb
+#else 
+               module procedure stdlib_zpoequb
+#endif
+          end interface poequb
+
+
+
+          !> PORFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian positive definite,
+          !> and provides error bounds and backward error estimates for the
+          !> solution.
+          interface porfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr,&
+                          work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cporfs
+#else 
+               module procedure stdlib_cporfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr,&
+                          work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dporfs
+#else 
+               module procedure stdlib_dporfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qporfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr,&
+                          work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine sporfs
+#else 
+               module procedure stdlib_sporfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wporfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr,&
+                          work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zporfs
+#else 
+               module procedure stdlib_zporfs
+#endif
+          end interface porfs
+
+
+
+          !> POSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian positive definite matrix and X and B
+          !> are N-by-NRHS matrices.
+          !> The Cholesky decomposition is used to factor A as
+          !> A = U**H* U,  if UPLO = 'U', or
+          !> A = L * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and  L is a lower triangular
+          !> matrix.  The factored form of A is then used to solve the system of
+          !> equations A * X = B.
+          interface posv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cposv( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine cposv
+#else 
+               module procedure stdlib_cposv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dposv( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine dposv
+#else 
+               module procedure stdlib_dposv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qposv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sposv( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine sposv
+#else 
+               module procedure stdlib_sposv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wposv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zposv( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+               end subroutine zposv
+#else 
+               module procedure stdlib_zposv
+#endif
+          end interface posv
+
+
+
+          !> POTRF: computes the Cholesky factorization of a complex Hermitian
+          !> positive definite matrix A.
+          !> The factorization has the form
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular.
+          !> This is the block version of the algorithm, calling Level 3 BLAS.
+          interface potrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpotrf( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cpotrf
+#else 
+               module procedure stdlib_cpotrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpotrf( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dpotrf
+#else 
+               module procedure stdlib_dpotrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpotrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spotrf( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine spotrf
+#else 
+               module procedure stdlib_spotrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpotrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpotrf( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zpotrf
+#else 
+               module procedure stdlib_zpotrf
+#endif
+          end interface potrf
+
+
+
+          !> POTRF2: computes the Cholesky factorization of a Hermitian
+          !> positive definite matrix A using the recursive algorithm.
+          !> The factorization has the form
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular.
+          !> This is the recursive version of the algorithm. It divides
+          !> the matrix into four submatrices:
+          !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+          !> A = [ -----|----- ]  with n1 = n/2
+          !> [  A21 | A22  ]       n2 = n-n1
+          !> The subroutine calls itself to factor A11. Update and scale A21
+          !> or A12, update A22 then calls itself to factor A22.
+          interface potrf2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine cpotrf2( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cpotrf2
+#else 
+               module procedure stdlib_cpotrf2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine dpotrf2( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dpotrf2
+#else 
+               module procedure stdlib_dpotrf2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpotrf2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine spotrf2( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine spotrf2
+#else 
+               module procedure stdlib_spotrf2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpotrf2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure recursive subroutine zpotrf2( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zpotrf2
+#else 
+               module procedure stdlib_zpotrf2
+#endif
+          end interface potrf2
+
+
+
+          !> POTRI: computes the inverse of a complex Hermitian positive definite
+          !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+          !> computed by CPOTRF.
+          interface potri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpotri( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine cpotri
+#else 
+               module procedure stdlib_cpotri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpotri( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dpotri
+#else 
+               module procedure stdlib_dpotri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpotri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spotri( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine spotri
+#else 
+               module procedure stdlib_spotri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpotri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpotri( uplo, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zpotri
+#else 
+               module procedure stdlib_zpotri
+#endif
+          end interface potri
+
+
+
+          !> POTRS: solves a system of linear equations A*X = B with a Hermitian
+          !> positive definite matrix A using the Cholesky factorization
+          !> A = U**H*U or A = L*L**H computed by CPOTRF.
+          interface potrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cpotrs
+#else 
+               module procedure stdlib_cpotrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dpotrs
+#else 
+               module procedure stdlib_dpotrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpotrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine spotrs
+#else 
+               module procedure stdlib_spotrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpotrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zpotrs
+#else 
+               module procedure stdlib_zpotrs
+#endif
+          end interface potrs
+
+
+
+          !> PPCON: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex Hermitian positive definite packed matrix using
+          !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+          !> CPPTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface ppcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cppcon( uplo, n, ap, anorm, rcond, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cppcon
+#else 
+               module procedure stdlib_cppcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dppcon( uplo, n, ap, anorm, rcond, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: anorm,ap(*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dppcon
+#else 
+               module procedure stdlib_dppcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qppcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sppcon( uplo, n, ap, anorm, rcond, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: anorm,ap(*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine sppcon
+#else 
+               module procedure stdlib_sppcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wppcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zppcon( uplo, n, ap, anorm, rcond, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zppcon
+#else 
+               module procedure stdlib_zppcon
+#endif
+          end interface ppcon
+
+
+
+          !> PPEQU: computes row and column scalings intended to equilibrate a
+          !> Hermitian positive definite matrix A in packed storage and reduce
+          !> its condition number (with respect to the two-norm).  S contains the
+          !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+          !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+          !> This choice of S puts the condition number of B within a factor N of
+          !> the smallest possible condition number over all possible diagonal
+          !> scalings.
+          interface ppequ
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cppequ( uplo, n, ap, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    complex(sp), intent(in) :: ap(*)
+               end subroutine cppequ
+#else 
+               module procedure stdlib_cppequ
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dppequ( uplo, n, ap, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    real(dp), intent(in) :: ap(*)
+               end subroutine dppequ
+#else 
+               module procedure stdlib_dppequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_qppequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sppequ( uplo, n, ap, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    real(sp), intent(in) :: ap(*)
+               end subroutine sppequ
+#else 
+               module procedure stdlib_sppequ
+#endif
+#:if WITH_QP
+               module procedure stdlib_wppequ
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zppequ( uplo, n, ap, s, scond, amax, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    complex(dp), intent(in) :: ap(*)
+               end subroutine zppequ
+#else 
+               module procedure stdlib_zppequ
+#endif
+          end interface ppequ
+
+
+
+          !> PPRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian positive definite
+          !> and packed, and provides error bounds and backward error estimates
+          !> for the solution.
+          interface pprfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cpprfs
+#else 
+               module procedure stdlib_cpprfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dpprfs
+#else 
+               module procedure stdlib_dpprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+                         iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine spprfs
+#else 
+               module procedure stdlib_spprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+                         rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zpprfs
+#else 
+               module procedure stdlib_zpprfs
+#endif
+          end interface pprfs
+
+
+
+          !> PPSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N Hermitian positive definite matrix stored in
+          !> packed format and X and B are N-by-NRHS matrices.
+          !> The Cholesky decomposition is used to factor A as
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is a lower triangular
+          !> matrix.  The factored form of A is then used to solve the system of
+          !> equations A * X = B.
+          interface ppsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cppsv( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine cppsv
+#else 
+               module procedure stdlib_cppsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dppsv( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine dppsv
+#else 
+               module procedure stdlib_dppsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qppsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sppsv( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine sppsv
+#else 
+               module procedure stdlib_sppsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wppsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zppsv( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine zppsv
+#else 
+               module procedure stdlib_zppsv
+#endif
+          end interface ppsv
+
+
+
+          !> PPTRF: computes the Cholesky factorization of a complex Hermitian
+          !> positive definite matrix A stored in packed format.
+          !> The factorization has the form
+          !> A = U**H * U,  if UPLO = 'U', or
+          !> A = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular.
+          interface pptrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpptrf( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine cpptrf
+#else 
+               module procedure stdlib_cpptrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpptrf( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dpptrf
+#else 
+               module procedure stdlib_dpptrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpptrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spptrf( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine spptrf
+#else 
+               module procedure stdlib_spptrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpptrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpptrf( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zpptrf
+#else 
+               module procedure stdlib_zpptrf
+#endif
+          end interface pptrf
+
+
+
+          !> PPTRI: computes the inverse of a complex Hermitian positive definite
+          !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+          !> computed by CPPTRF.
+          interface pptri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpptri( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine cpptri
+#else 
+               module procedure stdlib_cpptri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpptri( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dpptri
+#else 
+               module procedure stdlib_dpptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spptri( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine spptri
+#else 
+               module procedure stdlib_spptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpptri( uplo, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zpptri
+#else 
+               module procedure stdlib_zpptri
+#endif
+          end interface pptri
+
+
+
+          !> PPTRS: solves a system of linear equations A*X = B with a Hermitian
+          !> positive definite matrix A in packed storage using the Cholesky
+          !> factorization A = U**H*U or A = L*L**H computed by CPPTRF.
+          interface pptrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpptrs( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine cpptrs
+#else 
+               module procedure stdlib_cpptrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpptrs( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dpptrs
+#else 
+               module procedure stdlib_dpptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spptrs( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine spptrs
+#else 
+               module procedure stdlib_spptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpptrs( uplo, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zpptrs
+#else 
+               module procedure stdlib_zpptrs
+#endif
+          end interface pptrs
+
+
+
+          !> PSTRF: computes the Cholesky factorization with complete
+          !> pivoting of a complex Hermitian positive semidefinite matrix A.
+          !> The factorization has the form
+          !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+          !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+          !> where U is an upper triangular matrix and L is lower triangular, and
+          !> P is stored as vector PIV.
+          !> This algorithm does not attempt to check that A is positive
+          !> semidefinite. This version of the algorithm calls level 3 BLAS.
+          interface pstrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: tol
+                    integer(ilp), intent(out) :: info,rank,piv(n)
+                    integer(ilp), intent(in) :: lda,n
+                    character, intent(in) :: uplo
+                    complex(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(2*n)
+               end subroutine cpstrf
+#else 
+               module procedure stdlib_cpstrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: tol
+                    integer(ilp), intent(out) :: info,rank,piv(n)
+                    integer(ilp), intent(in) :: lda,n
+                    character, intent(in) :: uplo
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(2*n)
+               end subroutine dpstrf
+#else 
+               module procedure stdlib_dpstrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpstrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: tol
+                    integer(ilp), intent(out) :: info,rank,piv(n)
+                    integer(ilp), intent(in) :: lda,n
+                    character, intent(in) :: uplo
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(2*n)
+               end subroutine spstrf
+#else 
+               module procedure stdlib_spstrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpstrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: tol
+                    integer(ilp), intent(out) :: info,rank,piv(n)
+                    integer(ilp), intent(in) :: lda,n
+                    character, intent(in) :: uplo
+                    complex(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(2*n)
+               end subroutine zpstrf
+#else 
+               module procedure stdlib_zpstrf
+#endif
+          end interface pstrf
+
+
+
+          !> PTCON: computes the reciprocal of the condition number (in the
+          !> 1-norm) of a complex Hermitian positive definite tridiagonal matrix
+          !> using the factorization A = L*D*L**H or A = U**H*D*U computed by
+          !> CPTTRF.
+          !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+          !> the condition number is computed as
+          !> RCOND = 1 / (ANORM * norm(inv(A))).
+          interface ptcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cptcon( n, d, e, anorm, rcond, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: anorm,d(*)
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: e(*)
+               end subroutine cptcon
+#else 
+               module procedure stdlib_cptcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dptcon( n, d, e, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: anorm,d(*),e(*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dptcon
+#else 
+               module procedure stdlib_dptcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qptcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sptcon( n, d, e, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: anorm,d(*),e(*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine sptcon
+#else 
+               module procedure stdlib_sptcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wptcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zptcon( n, d, e, anorm, rcond, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: anorm,d(*)
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: e(*)
+               end subroutine zptcon
+#else 
+               module procedure stdlib_zptcon
+#endif
+          end interface ptcon
+
+
+
+          !> PTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+          !> symmetric positive definite tridiagonal matrix by first factoring the
+          !> matrix using SPTTRF and then calling CBDSQR to compute the singular
+          !> values of the bidiagonal factor.
+          !> This routine computes the eigenvalues of the positive definite
+          !> tridiagonal matrix to high relative accuracy.  This means that if the
+          !> eigenvalues range over many orders of magnitude in size, then the
+          !> small eigenvalues and corresponding eigenvectors will be computed
+          !> more accurately than, for example, with the standard QR method.
+          !> The eigenvectors of a full or band positive definite Hermitian matrix
+          !> can also be found if CHETRD, CHPTRD, or CHBTRD has been used to
+          !> reduce this matrix to tridiagonal form.  (The reduction to
+          !> tridiagonal form, however, may preclude the possibility of obtaining
+          !> high relative accuracy in the small eigenvalues of the original
+          !> matrix, if these eigenvalues range over many orders of magnitude.)
+          interface pteqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: z(ldz,*)
+               end subroutine cpteqr
+#else 
+               module procedure stdlib_cpteqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dpteqr
+#else 
+               module procedure stdlib_dpteqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpteqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine spteqr
+#else 
+               module procedure stdlib_spteqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpteqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: z(ldz,*)
+               end subroutine zpteqr
+#else 
+               module procedure stdlib_zpteqr
+#endif
+          end interface pteqr
+
+
+
+          !> PTRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is Hermitian positive definite
+          !> and tridiagonal, and provides error bounds and backward error
+          !> estimates for the solution.
+          interface ptrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cptrfs( uplo, n, nrhs, d, e, df, ef, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    real(sp), intent(in) :: d(*),df(*)
+                    complex(sp), intent(in) :: b(ldb,*),e(*),ef(*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine cptrfs
+#else 
+               module procedure stdlib_cptrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: b(ldb,*),d(*),df(*),e(*),ef(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dptrfs
+#else 
+               module procedure stdlib_dptrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qptrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: b(ldb,*),d(*),df(*),e(*),ef(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine sptrfs
+#else 
+               module procedure stdlib_sptrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wptrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zptrfs( uplo, n, nrhs, d, e, df, ef, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    real(dp), intent(in) :: d(*),df(*)
+                    complex(dp), intent(in) :: b(ldb,*),e(*),ef(*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zptrfs
+#else 
+               module procedure stdlib_zptrfs
+#endif
+          end interface ptrfs
+
+
+
+          !> PTSV: computes the solution to a complex system of linear equations
+          !> A*X = B, where A is an N-by-N Hermitian positive definite tridiagonal
+          !> matrix, and X and B are N-by-NRHS matrices.
+          !> A is factored as A = L*D*L**H, and the factored form of A is then
+          !> used to solve the system of equations.
+          interface ptsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cptsv( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: d(*)
+                    complex(sp), intent(inout) :: b(ldb,*),e(*)
+               end subroutine cptsv
+#else 
+               module procedure stdlib_cptsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dptsv( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: b(ldb,*),d(*),e(*)
+               end subroutine dptsv
+#else 
+               module procedure stdlib_dptsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qptsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sptsv( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: b(ldb,*),d(*),e(*)
+               end subroutine sptsv
+#else 
+               module procedure stdlib_sptsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wptsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zptsv( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: d(*)
+                    complex(dp), intent(inout) :: b(ldb,*),e(*)
+               end subroutine zptsv
+#else 
+               module procedure stdlib_zptsv
+#endif
+          end interface ptsv
+
+
+
+          !> PTTRF: computes the L*D*L**H factorization of a complex Hermitian
+          !> positive definite tridiagonal matrix A.  The factorization may also
+          !> be regarded as having the form A = U**H *D*U.
+          interface pttrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpttrf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*)
+                    complex(sp), intent(inout) :: e(*)
+               end subroutine cpttrf
+#else 
+               module procedure stdlib_cpttrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpttrf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*),e(*)
+               end subroutine dpttrf
+#else 
+               module procedure stdlib_dpttrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpttrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spttrf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*),e(*)
+               end subroutine spttrf
+#else 
+               module procedure stdlib_spttrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpttrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpttrf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*)
+                    complex(dp), intent(inout) :: e(*)
+               end subroutine zpttrf
+#else 
+               module procedure stdlib_zpttrf
+#endif
+          end interface pttrf
+
+
+
+          !> PTTRS: solves a tridiagonal system of the form
+          !> A * X = B
+          !> using the factorization A = U**H*D*U or A = L*D*L**H computed by CPTTRF.
+          !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+          !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+          !> the vector E, and X and B are N by NRHS matrices.
+          interface pttrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cpttrs( uplo, n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(in) :: d(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(in) :: e(*)
+               end subroutine cpttrs
+#else 
+               module procedure stdlib_cpttrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dpttrs( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: b(ldb,*)
+                    real(dp), intent(in) :: d(*),e(*)
+               end subroutine dpttrs
+#else 
+               module procedure stdlib_dpttrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qpttrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine spttrs( n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: b(ldb,*)
+                    real(sp), intent(in) :: d(*),e(*)
+               end subroutine spttrs
+#else 
+               module procedure stdlib_spttrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wpttrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zpttrs( uplo, n, nrhs, d, e, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(in) :: d(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(in) :: e(*)
+               end subroutine zpttrs
+#else 
+               module procedure stdlib_zpttrs
+#endif
+          end interface pttrs
+
+
+
+          !> ROT:   applies a plane rotation, where the cos (C) is real and the
+          !> sin (S) is complex, and the vectors CX and CY are complex.
+          interface rot
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine crot( n, cx, incx, cy, incy, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(sp), intent(in) :: c
+                    complex(sp), intent(in) :: s
+                    complex(sp), intent(inout) :: cx(*),cy(*)
+               end subroutine crot
+#else 
+               module procedure stdlib_crot
+#endif
+#:if WITH_QP
+               module procedure stdlib_wrot
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zrot( n, cx, incx, cy, incy, c, s )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,incy,n
+                    real(dp), intent(in) :: c
+                    complex(dp), intent(in) :: s
+                    complex(dp), intent(inout) :: cx(*),cy(*)
+               end subroutine zrot
+#else 
+               module procedure stdlib_zrot
+#endif
+          end interface rot
+
+
+
+          !> RSCL: multiplies an n-element real vector x by the real scalar 1/a.
+          !> This is done without overflow or underflow as long as
+          !> the final result x/a does not overflow or underflow.
+          interface rscl
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine drscl( n, sa, sx, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(dp), intent(in) :: sa
+                    real(dp), intent(inout) :: sx(*)
+               end subroutine drscl
+#else 
+               module procedure stdlib_drscl
+#endif
+#:if WITH_QP
+               module procedure stdlib_qrscl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine srscl( n, sa, sx, incx )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx,n
+                    real(sp), intent(in) :: sa
+                    real(sp), intent(inout) :: sx(*)
+               end subroutine srscl
+#else 
+               module procedure stdlib_srscl
+#endif
+          end interface rscl
+
+
+
+          !> SB2ST_KERNELS: is an internal routine used by the DSYTRD_SB2ST
+          !> subroutine.
+          interface sb2st_kernels
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine  dsb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, &
+                         lda, v, tau, ldvt, work)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: wantz
+                    integer(ilp), intent(in) :: ttype,st,ed,sweep,n,nb,ib,lda,ldvt
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: v(*),tau(*),work(*)
+               end subroutine dsb2st_kernels
+#else 
+               module procedure stdlib_dsb2st_kernels
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsb2st_kernels
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine  ssb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, &
+                         lda, v, tau, ldvt, work)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    logical(lk), intent(in) :: wantz
+                    integer(ilp), intent(in) :: ttype,st,ed,sweep,n,nb,ib,lda,ldvt
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: v(*),tau(*),work(*)
+               end subroutine ssb2st_kernels
+#else 
+               module procedure stdlib_ssb2st_kernels
+#endif
+          end interface sb2st_kernels
+
+
+
+          !> SBEV: computes all the eigenvalues and, optionally, eigenvectors of
+          !> a real symmetric band matrix A.
+          interface sbev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldz,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dsbev
+#else 
+               module procedure stdlib_dsbev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldz,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine ssbev
+#else 
+               module procedure stdlib_ssbev
+#endif
+          end interface sbev
+
+
+
+          !> SBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+          !> a real symmetric band matrix A. If eigenvectors are desired, it uses
+          !> a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface sbevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dsbevd
+#else 
+               module procedure stdlib_dsbevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine ssbevd
+#else 
+               module procedure stdlib_ssbevd
+#endif
+          end interface sbevd
+
+
+
+          !> SBGST: reduces a real symmetric-definite banded generalized
+          !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+          !> such that C has the same bandwidth as A.
+          !> B must have been previously factorized as S**T*S by DPBSTF, using a
+          !> split Cholesky factorization. A is overwritten by C = X**T*A*X, where
+          !> X = S**(-1)*Q and Q is an orthogonal matrix chosen to preserve the
+          !> bandwidth of A.
+          interface sbgst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldx,n
+                    real(dp), intent(inout) :: ab(ldab,*)
+                    real(dp), intent(in) :: bb(ldbb,*)
+                    real(dp), intent(out) :: work(*),x(ldx,*)
+               end subroutine dsbgst
+#else 
+               module procedure stdlib_dsbgst
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbgst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldx,n
+                    real(sp), intent(inout) :: ab(ldab,*)
+                    real(sp), intent(in) :: bb(ldbb,*)
+                    real(sp), intent(out) :: work(*),x(ldx,*)
+               end subroutine ssbgst
+#else 
+               module procedure stdlib_ssbgst
+#endif
+          end interface sbgst
+
+
+
+          !> SBGV: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a real generalized symmetric-definite banded eigenproblem, of
+          !> the form A*x=(lambda)*B*x. Here A and B are assumed to be symmetric
+          !> and banded, and B is also positive definite.
+          interface sbgv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,n
+                    real(dp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dsbgv
+#else 
+               module procedure stdlib_dsbgv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbgv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,n
+                    real(sp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine ssbgv
+#else 
+               module procedure stdlib_ssbgv
+#endif
+          end interface sbgv
+
+
+
+          !> SBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a real generalized symmetric-definite banded eigenproblem, of the
+          !> form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric and
+          !> banded, and B is also positive definite.  If eigenvectors are
+          !> desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface sbgvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+                         lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dsbgvd
+#else 
+               module procedure stdlib_dsbgvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbgvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+                         lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ka,kb,ldab,ldbb,ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: ab(ldab,*),bb(ldbb,*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine ssbgvd
+#else 
+               module procedure stdlib_ssbgvd
+#endif
+          end interface sbgvd
+
+
+
+          !> SBTRD: reduces a real symmetric band matrix A to symmetric
+          !> tridiagonal form T by an orthogonal similarity transformation:
+          !> Q**T * A * Q = T.
+          interface sbtrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldq,n
+                    real(dp), intent(inout) :: ab(ldab,*),q(ldq,*)
+                    real(dp), intent(out) :: d(*),e(*),work(*)
+               end subroutine dsbtrd
+#else 
+               module procedure stdlib_dsbtrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsbtrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldq,n
+                    real(sp), intent(inout) :: ab(ldab,*),q(ldq,*)
+                    real(sp), intent(out) :: d(*),e(*),work(*)
+               end subroutine ssbtrd
+#else 
+               module procedure stdlib_ssbtrd
+#endif
+          end interface sbtrd
+
+
+
+          !> Level 3 BLAS like routine for C in RFP Format.
+          !> SFRK: performs one of the symmetric rank--k operations
+          !> C := alpha*A*A**T + beta*C,
+          !> or
+          !> C := alpha*A**T*A + beta*C,
+          !> where alpha and beta are real scalars, C is an n--by--n symmetric
+          !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+          !> matrix in the second case.
+          interface sfrk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(dp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,n
+                    character, intent(in) :: trans,transr,uplo
+                    real(dp), intent(inout) :: c(*)
+               end subroutine dsfrk
+#else 
+               module procedure stdlib_dsfrk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsfrk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    real(sp), intent(in) :: alpha,beta,a(lda,*)
+                    integer(ilp), intent(in) :: k,lda,n
+                    character, intent(in) :: trans,transr,uplo
+                    real(sp), intent(inout) :: c(*)
+               end subroutine ssfrk
+#else 
+               module procedure stdlib_ssfrk
+#endif
+          end interface sfrk
+
+
+
+          !> SPCON: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex symmetric packed matrix A using the
+          !> factorization A = U*D*U**T or A = L*D*L**T computed by CSPTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface spcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cspcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cspcon
+#else 
+               module procedure stdlib_cspcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dspcon( uplo, n, ap, ipiv, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(in) :: anorm,ap(*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dspcon
+#else 
+               module procedure stdlib_dspcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sspcon( uplo, n, ap, ipiv, anorm, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(in) :: anorm,ap(*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine sspcon
+#else 
+               module procedure stdlib_sspcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wspcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zspcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zspcon
+#else 
+               module procedure stdlib_zspcon
+#endif
+          end interface spcon
+
+
+
+          !> SPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+          !> real symmetric matrix A in packed storage.
+          interface spev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: ap(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dspev
+#else 
+               module procedure stdlib_dspev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: ap(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sspev
+#else 
+               module procedure stdlib_sspev
+#endif
+          end interface spev
+
+
+
+          !> SPEVD: computes all the eigenvalues and, optionally, eigenvectors
+          !> of a real symmetric matrix A in packed storage. If eigenvectors are
+          !> desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface spevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,iwork, liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: ap(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dspevd
+#else 
+               module procedure stdlib_dspevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,iwork, liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: ap(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sspevd
+#else 
+               module procedure stdlib_sspevd
+#endif
+          end interface spevd
+
+
+
+          !> SPGST: reduces a real symmetric-definite generalized eigenproblem
+          !> to standard form, using packed storage.
+          !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+          !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+          !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+          !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+          !> B must have been previously factorized as U**T*U or L*L**T by DPPTRF.
+          interface spgst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dspgst( itype, uplo, n, ap, bp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,n
+                    real(dp), intent(inout) :: ap(*)
+                    real(dp), intent(in) :: bp(*)
+               end subroutine dspgst
+#else 
+               module procedure stdlib_dspgst
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspgst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sspgst( itype, uplo, n, ap, bp, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,n
+                    real(sp), intent(inout) :: ap(*)
+                    real(sp), intent(in) :: bp(*)
+               end subroutine sspgst
+#else 
+               module procedure stdlib_sspgst
+#endif
+          end interface spgst
+
+
+
+          !> SPGV: computes all the eigenvalues and, optionally, the eigenvectors
+          !> of a real generalized symmetric-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+          !> Here A and B are assumed to be symmetric, stored in packed format,
+          !> and B is also positive definite.
+          interface spgv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dspgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,ldz,n
+                    real(dp), intent(inout) :: ap(*),bp(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dspgv
+#else 
+               module procedure stdlib_dspgv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspgv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sspgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,ldz,n
+                    real(sp), intent(inout) :: ap(*),bp(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sspgv
+#else 
+               module procedure stdlib_sspgv
+#endif
+          end interface spgv
+
+
+
+          !> SPGVD: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a real generalized symmetric-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+          !> B are assumed to be symmetric, stored in packed format, and B is also
+          !> positive definite.
+          !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface spgvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dspgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: ap(*),bp(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dspgvd
+#else 
+               module procedure stdlib_dspgvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspgvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine sspgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: ap(*),bp(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sspgvd
+#else 
+               module procedure stdlib_sspgvd
+#endif
+          end interface spgvd
+
+
+
+          !> SPMV:  performs the matrix-vector operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n symmetric matrix, supplied in packed form.
+          interface spmv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cspmv( uplo, n, alpha, ap, x, incx, beta, y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(sp), intent(in) :: alpha,beta,ap(*),x(*)
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine cspmv
+#else 
+               module procedure stdlib_cspmv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wspmv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zspmv( uplo, n, alpha, ap, x, incx, beta, y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,incy,n
+                    complex(dp), intent(in) :: alpha,beta,ap(*),x(*)
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zspmv
+#else 
+               module procedure stdlib_zspmv
+#endif
+          end interface spmv
+
+
+
+          !> SPR:    performs the symmetric rank 1 operation
+          !> A := alpha*x*x**H + A,
+          !> where alpha is a complex scalar, x is an n element vector and A is an
+          !> n by n symmetric matrix, supplied in packed form.
+          interface spr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cspr( uplo, n, alpha, x, incx, ap )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,n
+                    complex(sp), intent(in) :: alpha,x(*)
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine cspr
+#else 
+               module procedure stdlib_cspr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wspr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zspr( uplo, n, alpha, x, incx, ap )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,n
+                    complex(dp), intent(in) :: alpha,x(*)
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zspr
+#else 
+               module procedure stdlib_zspr
+#endif
+          end interface spr
+
+
+
+          !> SPRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is symmetric indefinite
+          !> and packed, and provides error bounds and backward error estimates
+          !> for the solution.
+          interface sprfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine csprfs
+#else 
+               module procedure stdlib_csprfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dsprfs
+#else 
+               module procedure stdlib_dsprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine ssprfs
+#else 
+               module procedure stdlib_ssprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: afp(*),ap(*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zsprfs
+#else 
+               module procedure stdlib_zsprfs
+#endif
+          end interface sprfs
+
+
+
+          !> SPSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N symmetric matrix stored in packed format and X
+          !> and B are N-by-NRHS matrices.
+          !> The diagonal pivoting method is used to factor A as
+          !> A = U * D * U**T,  if UPLO = 'U', or
+          !> A = L * D * L**T,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+          !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+          !> solve the system of equations A * X = B.
+          interface spsv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine cspsv
+#else 
+               module procedure stdlib_cspsv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine dspsv
+#else 
+               module procedure stdlib_dspsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qspsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine sspsv
+#else 
+               module procedure stdlib_sspsv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wspsv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(inout) :: ap(*),b(ldb,*)
+               end subroutine zspsv
+#else 
+               module procedure stdlib_zspsv
+#endif
+          end interface spsv
+
+
+
+          !> SPTRD: reduces a real symmetric matrix A stored in packed form to
+          !> symmetric tridiagonal form T by an orthogonal similarity
+          !> transformation: Q**T * A * Q = T.
+          interface sptrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsptrd( uplo, n, ap, d, e, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: ap(*)
+                    real(dp), intent(out) :: d(*),e(*),tau(*)
+               end subroutine dsptrd
+#else 
+               module procedure stdlib_dsptrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsptrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssptrd( uplo, n, ap, d, e, tau, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: ap(*)
+                    real(sp), intent(out) :: d(*),e(*),tau(*)
+               end subroutine ssptrd
+#else 
+               module procedure stdlib_ssptrd
+#endif
+          end interface sptrd
+
+
+
+          !> SPTRF: computes the factorization of a complex symmetric matrix A
+          !> stored in packed format using the Bunch-Kaufman diagonal pivoting
+          !> method:
+          !> A = U*D*U**T  or  A = L*D*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is symmetric and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          interface sptrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine csptrf
+#else 
+               module procedure stdlib_csptrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dsptrf
+#else 
+               module procedure stdlib_dsptrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsptrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine ssptrf
+#else 
+               module procedure stdlib_ssptrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsptrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsptrf( uplo, n, ap, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine zsptrf
+#else 
+               module procedure stdlib_zsptrf
+#endif
+          end interface sptrf
+
+
+
+          !> SPTRI: computes the inverse of a complex symmetric indefinite matrix
+          !> A in packed storage using the factorization A = U*D*U**T or
+          !> A = L*D*L**T computed by CSPTRF.
+          interface sptri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    complex(sp), intent(inout) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csptri
+#else 
+               module procedure stdlib_csptri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(dp), intent(inout) :: ap(*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsptri
+#else 
+               module procedure stdlib_dsptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    real(sp), intent(inout) :: ap(*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssptri
+#else 
+               module procedure stdlib_ssptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsptri( uplo, n, ap, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,ipiv(*)
+                    complex(dp), intent(inout) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsptri
+#else 
+               module procedure stdlib_zsptri
+#endif
+          end interface sptri
+
+
+
+          !> SPTRS: solves a system of linear equations A*X = B with a complex
+          !> symmetric matrix A stored in packed format using the factorization
+          !> A = U*D*U**T or A = L*D*L**T computed by CSPTRF.
+          interface sptrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine csptrs
+#else 
+               module procedure stdlib_csptrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dsptrs
+#else 
+               module procedure stdlib_dsptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine ssptrs
+#else 
+               module procedure stdlib_ssptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zsptrs
+#else 
+               module procedure stdlib_zsptrs
+#endif
+          end interface sptrs
+
+
+
+          !> STEBZ: computes the eigenvalues of a symmetric tridiagonal
+          !> matrix T.  The user may ask for all eigenvalues, all eigenvalues
+          !> in the half-open interval (VL, VU], or the IL-th through IU-th
+          !> eigenvalues.
+          !> To avoid overflow, the matrix must be scaled so that its
+          !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+          !> accuracy, it should not be much smaller than that.
+          !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+          !> Matrix", Report CS41, Computer Science Dept., Stanford
+          !> University, July 21, 1966.
+          interface stebz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstebz( range, order, n, vl, vu, il, iu, abstol, d, e,m, nsplit, w,&
+                          iblock, isplit, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: order,range
+                    integer(ilp), intent(in) :: il,iu,n
+                    integer(ilp), intent(out) :: info,m,nsplit,iblock(*),isplit(*),iwork(*)
+                              
+                    real(dp), intent(in) :: abstol,vl,vu,d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*)
+               end subroutine dstebz
+#else 
+               module procedure stdlib_dstebz
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstebz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstebz( range, order, n, vl, vu, il, iu, abstol, d, e,m, nsplit, w,&
+                          iblock, isplit, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: order,range
+                    integer(ilp), intent(in) :: il,iu,n
+                    integer(ilp), intent(out) :: info,m,nsplit,iblock(*),isplit(*),iwork(*)
+                              
+                    real(sp), intent(in) :: abstol,vl,vu,d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*)
+               end subroutine sstebz
+#else 
+               module procedure stdlib_sstebz
+#endif
+          end interface stebz
+
+
+
+          !> STEDC: computes all eigenvalues and, optionally, eigenvectors of a
+          !> symmetric tridiagonal matrix using the divide and conquer method.
+          !> The eigenvectors of a full or band complex Hermitian matrix can also
+          !> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+          !> matrix to tridiagonal form.
+          !> This code makes very mild assumptions about floating point
+          !> arithmetic. It will work on machines with a guard digit in
+          !> add/subtract, or on those binary machines without guard digits
+          !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+          !> It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.  See SLAED3 for details.
+          interface stedc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cstedc( compz, n, d, e, z, ldz, work, lwork, rwork,lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lrwork,lwork,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: z(ldz,*)
+               end subroutine cstedc
+#else 
+               module procedure stdlib_cstedc
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstedc( compz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dstedc
+#else 
+               module procedure stdlib_dstedc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstedc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstedc( compz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine sstedc
+#else 
+               module procedure stdlib_sstedc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wstedc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zstedc( compz, n, d, e, z, ldz, work, lwork, rwork,lrwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lrwork,lwork,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: z(ldz,*)
+               end subroutine zstedc
+#else 
+               module procedure stdlib_zstedc
+#endif
+          end interface stedc
+
+
+
+          !> STEGR: computes selected eigenvalues and, optionally, eigenvectors
+          !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+          !> a well defined set of pairwise different real eigenvalues, the corresponding
+          !> real eigenvectors are pairwise orthogonal.
+          !> The spectrum may be computed either completely or partially by specifying
+          !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+          !> eigenvalues.
+          !> STEGR is a compatibility wrapper around the improved CSTEMR routine.
+          !> See SSTEMR for further details.
+          !> One important change is that the ABSTOL parameter no longer provides any
+          !> benefit and hence is no longer used.
+          !> Note : STEGR and CSTEMR work only on machines which follow
+          !> IEEE-754 floating-point standard in their handling of infinities and
+          !> NaNs.  Normal execution may create these exceptiona values and hence
+          !> may abort due to a floating point exception in environments which
+          !> do not conform to the IEEE-754 standard.
+          interface stegr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: abstol,vl,vu
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*)
+                    complex(sp), intent(out) :: z(ldz,*)
+               end subroutine cstegr
+#else 
+               module procedure stdlib_cstegr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: abstol,vl,vu
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dstegr
+#else 
+               module procedure stdlib_dstegr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstegr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: abstol,vl,vu
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sstegr
+#else 
+               module procedure stdlib_sstegr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wstegr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: abstol,vl,vu
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*)
+                    complex(dp), intent(out) :: z(ldz,*)
+               end subroutine zstegr
+#else 
+               module procedure stdlib_zstegr
+#endif
+          end interface stegr
+
+
+
+          !> STEIN: computes the eigenvectors of a real symmetric tridiagonal
+          !> matrix T corresponding to specified eigenvalues, using inverse
+          !> iteration.
+          !> The maximum number of iterations allowed for each eigenvector is
+          !> specified by an internal parameter MAXITS (currently set to 5).
+          !> Although the eigenvectors are real, they are stored in a complex
+          !> array, which may be passed to CUNMTR or CUPMTR for back
+          !> transformation to the eigenvectors of a complex Hermitian matrix
+          !> which was reduced to tridiagonal form.
+          interface stein
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ifail(*),iwork(*)
+                    integer(ilp), intent(in) :: ldz,m,n,iblock(*),isplit(*)
+                    real(sp), intent(in) :: d(*),e(*),w(*)
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(out) :: z(ldz,*)
+               end subroutine cstein
+#else 
+               module procedure stdlib_cstein
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ifail(*),iwork(*)
+                    integer(ilp), intent(in) :: ldz,m,n,iblock(*),isplit(*)
+                    real(dp), intent(in) :: d(*),e(*),w(*)
+                    real(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine dstein
+#else 
+               module procedure stdlib_dstein
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ifail(*),iwork(*)
+                    integer(ilp), intent(in) :: ldz,m,n,iblock(*),isplit(*)
+                    real(sp), intent(in) :: d(*),e(*),w(*)
+                    real(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine sstein
+#else 
+               module procedure stdlib_sstein
+#endif
+#:if WITH_QP
+               module procedure stdlib_wstein
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info,ifail(*),iwork(*)
+                    integer(ilp), intent(in) :: ldz,m,n,iblock(*),isplit(*)
+                    real(dp), intent(in) :: d(*),e(*),w(*)
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(out) :: z(ldz,*)
+               end subroutine zstein
+#else 
+               module procedure stdlib_zstein
+#endif
+          end interface stein
+
+
+
+          !> STEMR: computes selected eigenvalues and, optionally, eigenvectors
+          !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+          !> a well defined set of pairwise different real eigenvalues, the corresponding
+          !> real eigenvectors are pairwise orthogonal.
+          !> The spectrum may be computed either completely or partially by specifying
+          !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+          !> eigenvalues.
+          !> Depending on the number of desired eigenvalues, these are computed either
+          !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+          !> computed by the use of various suitable L D L^T factorizations near clusters
+          !> of close eigenvalues (referred to as RRRs, Relatively Robust
+          !> Representations). An informal sketch of the algorithm follows.
+          !> For each unreduced block (submatrix) of T,
+          !> (a) Compute T - sigma I  = L D L^T, so that L and D
+          !> define all the wanted eigenvalues to high relative accuracy.
+          !> This means that small relative changes in the entries of D and L
+          !> cause only small relative changes in the eigenvalues and
+          !> eigenvectors. The standard (unfactored) representation of the
+          !> tridiagonal matrix T does not have this property in general.
+          !> (b) Compute the eigenvalues to suitable accuracy.
+          !> If the eigenvectors are desired, the algorithm attains full
+          !> accuracy of the computed eigenvalues only right before
+          !> the corresponding vectors have to be computed, see steps c) and d).
+          !> (c) For each cluster of close eigenvalues, select a new
+          !> shift close to the cluster, find a new factorization, and refine
+          !> the shifted eigenvalues to suitable accuracy.
+          !> (d) For each eigenvalue with a large enough relative separation compute
+          !> the corresponding eigenvector by forming a rank revealing twisted
+          !> factorization. Go back to (c) for any clusters that remain.
+          !> For more details, see:
+          !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+          !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+          !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+          !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+          !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+          !> 2004.  Also LAPACK Working Note 154.
+          !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+          !> tridiagonal eigenvalue/eigenvector problem",
+          !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+          !> UC Berkeley, May 1997.
+          !> Further Details
+          !> 1.STEMR works only on machines which follow IEEE-754
+          !> floating-point standard in their handling of infinities and NaNs.
+          !> This permits the use of efficient inner loops avoiding a check for
+          !> zero divisors.
+          !> 2. LAPACK routines can be used to reduce a complex Hermitean matrix to
+          !> real symmetric tridiagonal form.
+          !> (Any complex Hermitean tridiagonal matrix has real values on its diagonal
+          !> and potentially complex numbers on its off-diagonals. By applying a
+          !> similarity transform with an appropriate diagonal matrix
+          !> diag(1,e^{i \phy_1}, ... , e^{i \phy_{n-1}}), the complex Hermitean
+          !> matrix can be transformed into a real symmetric matrix and complex
+          !> arithmetic can be entirely avoided.)
+          !> While the eigenvectors of the real symmetric tridiagonal matrix are real,
+          !> the eigenvectors of original complex Hermitean matrix have complex entries
+          !> in general.
+          !> Since LAPACK drivers overwrite the matrix data with the eigenvectors,
+          !> STEMR accepts complex workspace to facilitate interoperability
+          !> with CUNMTR or CUPMTR.
+          interface stemr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+                         isuppz, tryrac, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    logical(lk), intent(inout) :: tryrac
+                    integer(ilp), intent(in) :: il,iu,ldz,nzc,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: vl,vu
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*)
+                    complex(sp), intent(out) :: z(ldz,*)
+               end subroutine cstemr
+#else 
+               module procedure stdlib_cstemr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+                         isuppz, tryrac, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    logical(lk), intent(inout) :: tryrac
+                    integer(ilp), intent(in) :: il,iu,ldz,nzc,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: vl,vu
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dstemr
+#else 
+               module procedure stdlib_dstemr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstemr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+                         isuppz, tryrac, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    logical(lk), intent(inout) :: tryrac
+                    integer(ilp), intent(in) :: il,iu,ldz,nzc,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: vl,vu
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sstemr
+#else 
+               module procedure stdlib_sstemr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wstemr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+                         isuppz, tryrac, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    logical(lk), intent(inout) :: tryrac
+                    integer(ilp), intent(in) :: il,iu,ldz,nzc,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: vl,vu
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*)
+                    complex(dp), intent(out) :: z(ldz,*)
+               end subroutine zstemr
+#else 
+               module procedure stdlib_zstemr
+#endif
+          end interface stemr
+
+
+
+          !> STEQR: computes all eigenvalues and, optionally, eigenvectors of a
+          !> symmetric tridiagonal matrix using the implicit QL or QR method.
+          !> The eigenvectors of a full or band complex Hermitian matrix can also
+          !> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+          !> matrix to tridiagonal form.
+          interface steqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: z(ldz,*)
+               end subroutine csteqr
+#else 
+               module procedure stdlib_csteqr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsteqr
+#else 
+               module procedure stdlib_dsteqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsteqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: d(*),e(*),z(ldz,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssteqr
+#else 
+               module procedure stdlib_ssteqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsteqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsteqr( compz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: z(ldz,*)
+               end subroutine zsteqr
+#else 
+               module procedure stdlib_zsteqr
+#endif
+          end interface steqr
+
+
+
+          !> STERF: computes all eigenvalues of a symmetric tridiagonal matrix
+          !> using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+          interface sterf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsterf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: d(*),e(*)
+               end subroutine dsterf
+#else 
+               module procedure stdlib_dsterf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsterf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssterf( n, d, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: d(*),e(*)
+               end subroutine ssterf
+#else 
+               module procedure stdlib_ssterf
+#endif
+          end interface sterf
+
+
+
+          !> STEV: computes all eigenvalues and, optionally, eigenvectors of a
+          !> real symmetric tridiagonal matrix A.
+          interface stev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstev( jobz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine dstev
+#else 
+               module procedure stdlib_dstev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstev( jobz, n, d, e, z, ldz, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldz,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine sstev
+#else 
+               module procedure stdlib_sstev
+#endif
+          end interface stev
+
+
+
+          !> STEVD: computes all eigenvalues and, optionally, eigenvectors of a
+          !> real symmetric tridiagonal matrix. If eigenvectors are desired, it
+          !> uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface stevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstevd( jobz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: work(*),z(ldz,*)
+               end subroutine dstevd
+#else 
+               module procedure stdlib_dstevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstevd( jobz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldz,liwork,lwork,n
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: work(*),z(ldz,*)
+               end subroutine sstevd
+#else 
+               module procedure stdlib_sstevd
+#endif
+          end interface stevd
+
+
+
+          !> STEVR: computes selected eigenvalues and, optionally, eigenvectors
+          !> of a real symmetric tridiagonal matrix T.  Eigenvalues and
+          !> eigenvectors can be selected by specifying either a range of values
+          !> or a range of indices for the desired eigenvalues.
+          !> Whenever possible, STEVR calls DSTEMR to compute the
+          !> eigenspectrum using Relatively Robust Representations.  DSTEMR
+          !> computes eigenvalues by the dqds algorithm, while orthogonal
+          !> eigenvectors are computed from various "good" L D L^T representations
+          !> (also known as Relatively Robust Representations). Gram-Schmidt
+          !> orthogonalization is avoided as far as possible. More specifically,
+          !> the various steps of the algorithm are as follows. For the i-th
+          !> unreduced block of T,
+          !> (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T
+          !> is a relatively robust representation,
+          !> (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high
+          !> relative accuracy by the dqds algorithm,
+          !> (c) If there is a cluster of close eigenvalues, "choose" sigma_i
+          !> close to the cluster, and go to step (a),
+          !> (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T,
+          !> compute the corresponding eigenvector by forming a
+          !> rank-revealing twisted factorization.
+          !> The desired accuracy of the output can be specified by the input
+          !> parameter ABSTOL.
+          !> For more details, see "A new O(n^2) algorithm for the symmetric
+          !> tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon,
+          !> Computer Science Division Technical Report No. UCB//CSD-97-971,
+          !> UC Berkeley, May 1997.
+          !> Note 1 : STEVR calls DSTEMR when the full spectrum is requested
+          !> on machines which conform to the ieee-754 floating point standard.
+          !> STEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+          !> when partial spectrum requests are made.
+          !> Normal execution of DSTEMR may create NaNs and infinities and
+          !> hence may abort due to a floating point exception in environments
+          !> which do not handle NaNs and infinities in the ieee standard default
+          !> manner.
+          interface stevr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dstevr( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: abstol,vl,vu
+                    real(dp), intent(inout) :: d(*),e(*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dstevr
+#else 
+               module procedure stdlib_dstevr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qstevr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine sstevr( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                         isuppz, work, lwork, iwork,liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range
+                    integer(ilp), intent(in) :: il,iu,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: abstol,vl,vu
+                    real(sp), intent(inout) :: d(*),e(*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine sstevr
+#else 
+               module procedure stdlib_sstevr
+#endif
+          end interface stevr
+
+
+
+          !> SYCON: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex symmetric matrix A using the factorization
+          !> A = U*D*U**T or A = L*D*L**T computed by CSYTRF.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface sycon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csycon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csycon
+#else 
+               module procedure stdlib_csycon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm,a(lda,*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dsycon
+#else 
+               module procedure stdlib_dsycon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsycon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssycon( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm,a(lda,*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine ssycon
+#else 
+               module procedure stdlib_ssycon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsycon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsycon
+#else 
+               module procedure stdlib_zsycon
+#endif
+          end interface sycon
+
+
+
+          !> SYCON_ROOK: estimates the reciprocal of the condition number (in the
+          !> 1-norm) of a complex symmetric matrix A using the factorization
+          !> A = U*D*U**T or A = L*D*L**T computed by CSYTRF_ROOK.
+          !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+          !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+          interface sycon_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm
+                    real(sp), intent(out) :: rcond
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csycon_rook
+#else 
+               module procedure stdlib_csycon_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm,a(lda,*)
+                    real(dp), intent(out) :: rcond,work(*)
+               end subroutine dsycon_rook
+#else 
+               module procedure stdlib_dsycon_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsycon_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(in) :: anorm,a(lda,*)
+                    real(sp), intent(out) :: rcond,work(*)
+               end subroutine ssycon_rook
+#else 
+               module procedure stdlib_ssycon_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsycon_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(in) :: anorm
+                    real(dp), intent(out) :: rcond
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsycon_rook
+#else 
+               module procedure stdlib_zsycon_rook
+#endif
+          end interface sycon_rook
+
+
+
+          !> SYCONV: convert A given by TRF into L and D and vice-versa.
+          !> Get Non-diag elements of D (returned in workspace) and
+          !> apply or reverse permutation done in TRF.
+          interface syconv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyconv( uplo, way, n, a, lda, ipiv, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*)
+               end subroutine csyconv
+#else 
+               module procedure stdlib_csyconv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyconv( uplo, way, n, a, lda, ipiv, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: e(*)
+               end subroutine dsyconv
+#else 
+               module procedure stdlib_dsyconv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyconv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyconv( uplo, way, n, a, lda, ipiv, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: e(*)
+               end subroutine ssyconv
+#else 
+               module procedure stdlib_ssyconv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyconv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyconv( uplo, way, n, a, lda, ipiv, e, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*)
+               end subroutine zsyconv
+#else 
+               module procedure stdlib_zsyconv
+#endif
+          end interface syconv
+
+
+
+          !> If parameter WAY = 'C':
+          !> SYCONVF: converts the factorization output format used in
+          !> CSYTRF provided on entry in parameter A into the factorization
+          !> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
+          !> on exit in parameters A and E. It also converts in place details of
+          !> the intechanges stored in IPIV from the format used in CSYTRF into
+          !> the format used in CSYTRF_RK (or CSYTRF_BK).
+          !> If parameter WAY = 'R':
+          !> SYCONVF performs the conversion in reverse direction, i.e.
+          !> converts the factorization output format used in CSYTRF_RK
+          !> (or CSYTRF_BK) provided on entry in parameters A and E into
+          !> the factorization output format used in CSYTRF that is stored
+          !> on exit in parameter A. It also converts in place details of
+          !> the intechanges stored in IPIV from the format used in CSYTRF_RK
+          !> (or CSYTRF_BK) into the format used in CSYTRF.
+          !> SYCONVF can also convert in Hermitian matrix case, i.e. between
+          !> formats used in CHETRF and CHETRF_RK (or CHETRF_BK).
+          interface syconvf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyconvf( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    integer(ilp), intent(inout) :: ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),e(*)
+               end subroutine csyconvf
+#else 
+               module procedure stdlib_csyconvf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    integer(ilp), intent(inout) :: ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*),e(*)
+               end subroutine dsyconvf
+#else 
+               module procedure stdlib_dsyconvf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyconvf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyconvf( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    integer(ilp), intent(inout) :: ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*),e(*)
+               end subroutine ssyconvf
+#else 
+               module procedure stdlib_ssyconvf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyconvf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    integer(ilp), intent(inout) :: ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),e(*)
+               end subroutine zsyconvf
+#else 
+               module procedure stdlib_zsyconvf
+#endif
+          end interface syconvf
+
+
+
+          !> If parameter WAY = 'C':
+          !> SYCONVF_ROOK: converts the factorization output format used in
+          !> CSYTRF_ROOK provided on entry in parameter A into the factorization
+          !> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
+          !> on exit in parameters A and E. IPIV format for CSYTRF_ROOK and
+          !> CSYTRF_RK (or CSYTRF_BK) is the same and is not converted.
+          !> If parameter WAY = 'R':
+          !> SYCONVF_ROOK performs the conversion in reverse direction, i.e.
+          !> converts the factorization output format used in CSYTRF_RK
+          !> (or CSYTRF_BK) provided on entry in parameters A and E into
+          !> the factorization output format used in CSYTRF_ROOK that is stored
+          !> on exit in parameter A. IPIV format for CSYTRF_ROOK and
+          !> CSYTRF_RK (or CSYTRF_BK) is the same and is not converted.
+          !> SYCONVF_ROOK can also convert in Hermitian matrix case, i.e. between
+          !> formats used in CHETRF_ROOK and CHETRF_RK (or CHETRF_BK).
+          interface syconvf_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),e(*)
+               end subroutine csyconvf_rook
+#else 
+               module procedure stdlib_csyconvf_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*),e(*)
+               end subroutine dsyconvf_rook
+#else 
+               module procedure stdlib_dsyconvf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyconvf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*),e(*)
+               end subroutine ssyconvf_rook
+#else 
+               module procedure stdlib_ssyconvf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyconvf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo,way
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),e(*)
+               end subroutine zsyconvf_rook
+#else 
+               module procedure stdlib_zsyconvf_rook
+#endif
+          end interface syconvf_rook
+
+
+
+          !> SYEQUB: computes row and column scalings intended to equilibrate a
+          !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+          !> its condition number. The scale factors S are computed by the BIN
+          !> algorithm (see references) so that the scaled matrix B with elements
+          !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+          !> the smallest possible condition number over all possible diagonal
+          !> scalings.
+          interface syequb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*)
+                    character, intent(in) :: uplo
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csyequb
+#else 
+               module procedure stdlib_csyequb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*),work(*)
+                    character, intent(in) :: uplo
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dsyequb
+#else 
+               module procedure stdlib_dsyequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: amax,scond,s(*),work(*)
+                    character, intent(in) :: uplo
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine ssyequb
+#else 
+               module procedure stdlib_ssyequb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyequb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: amax,scond,s(*)
+                    character, intent(in) :: uplo
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsyequb
+#else 
+               module procedure stdlib_zsyequb
+#endif
+          end interface syequb
+
+
+
+          !> SYEV: computes all eigenvalues and, optionally, eigenvectors of a
+          !> real symmetric matrix A.
+          interface syev
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: w(*),work(*)
+               end subroutine dsyev
+#else 
+               module procedure stdlib_dsyev
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyev
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: w(*),work(*)
+               end subroutine ssyev
+#else 
+               module procedure stdlib_ssyev
+#endif
+          end interface syev
+
+
+
+          !> SYEVD: computes all eigenvalues and, optionally, eigenvectors of a
+          !> real symmetric matrix A. If eigenvectors are desired, it uses a
+          !> divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          !> Because of large use of BLAS of level 3, SYEVD needs N**2 more
+          !> workspace than DSYEVX.
+          interface syevd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,liwork,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: w(*),work(*)
+               end subroutine dsyevd
+#else 
+               module procedure stdlib_dsyevd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyevd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork,liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,liwork,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: w(*),work(*)
+               end subroutine ssyevd
+#else 
+               module procedure stdlib_ssyevd
+#endif
+          end interface syevd
+
+
+
+          !> SYEVR: computes selected eigenvalues and, optionally, eigenvectors
+          !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+          !> selected by specifying either a range of values or a range of
+          !> indices for the desired eigenvalues.
+          !> SYEVR first reduces the matrix A to tridiagonal form T with a call
+          !> to DSYTRD.  Then, whenever possible, SYEVR calls DSTEMR to compute
+          !> the eigenspectrum using Relatively Robust Representations.  DSTEMR
+          !> computes eigenvalues by the dqds algorithm, while orthogonal
+          !> eigenvectors are computed from various "good" L D L^T representations
+          !> (also known as Relatively Robust Representations). Gram-Schmidt
+          !> orthogonalization is avoided as far as possible. More specifically,
+          !> the various steps of the algorithm are as follows.
+          !> For each unreduced block (submatrix) of T,
+          !> (a) Compute T - sigma I  = L D L^T, so that L and D
+          !> define all the wanted eigenvalues to high relative accuracy.
+          !> This means that small relative changes in the entries of D and L
+          !> cause only small relative changes in the eigenvalues and
+          !> eigenvectors. The standard (unfactored) representation of the
+          !> tridiagonal matrix T does not have this property in general.
+          !> (b) Compute the eigenvalues to suitable accuracy.
+          !> If the eigenvectors are desired, the algorithm attains full
+          !> accuracy of the computed eigenvalues only right before
+          !> the corresponding vectors have to be computed, see steps c) and d).
+          !> (c) For each cluster of close eigenvalues, select a new
+          !> shift close to the cluster, find a new factorization, and refine
+          !> the shifted eigenvalues to suitable accuracy.
+          !> (d) For each eigenvalue with a large enough relative separation compute
+          !> the corresponding eigenvector by forming a rank revealing twisted
+          !> factorization. Go back to (c) for any clusters that remain.
+          !> The desired accuracy of the output can be specified by the input
+          !> parameter ABSTOL.
+          !> For more details, see DSTEMR's documentation and:
+          !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+          !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+          !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+          !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+          !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+          !> 2004.  Also LAPACK Working Note 154.
+          !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+          !> tridiagonal eigenvalue/eigenvector problem",
+          !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+          !> UC Berkeley, May 1997.
+          !> Note 1 : SYEVR calls DSTEMR when the full spectrum is requested
+          !> on machines which conform to the ieee-754 floating point standard.
+          !> SYEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+          !> when partial spectrum requests are made.
+          !> Normal execution of DSTEMR may create NaNs and infinities and
+          !> hence may abort due to a floating point exception in environments
+          !> which do not handle NaNs and infinities in the ieee standard default
+          !> manner.
+          interface syevr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsyevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, &
+                         ldz, isuppz, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range,uplo
+                    integer(ilp), intent(in) :: il,iu,lda,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(dp), intent(in) :: abstol,vl,vu
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine dsyevr
+#else 
+               module procedure stdlib_dsyevr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyevr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssyevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, &
+                         ldz, isuppz, work, lwork,iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,range,uplo
+                    integer(ilp), intent(in) :: il,iu,lda,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,isuppz(*),iwork(*)
+                    real(sp), intent(in) :: abstol,vl,vu
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: w(*),work(*),z(ldz,*)
+               end subroutine ssyevr
+#else 
+               module procedure stdlib_ssyevr
+#endif
+          end interface syevr
+
+
+
+          !> SYGST: reduces a real symmetric-definite generalized eigenproblem
+          !> to standard form.
+          !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+          !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+          !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+          !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+          !> B must have been previously factorized as U**T*U or L*L**T by DPOTRF.
+          interface sygst
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsygst( itype, uplo, n, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(in) :: b(ldb,*)
+               end subroutine dsygst
+#else 
+               module procedure stdlib_dsygst
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsygst
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssygst( itype, uplo, n, a, lda, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(in) :: b(ldb,*)
+               end subroutine ssygst
+#else 
+               module procedure stdlib_ssygst
+#endif
+          end interface sygst
+
+
+
+          !> SYGV: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a real generalized symmetric-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+          !> Here A and B are assumed to be symmetric and B is also
+          !> positive definite.
+          interface sygv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsygv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,lwork,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: w(*),work(*)
+               end subroutine dsygv
+#else 
+               module procedure stdlib_dsygv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsygv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssygv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: itype,lda,ldb,lwork,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: w(*),work(*)
+               end subroutine ssygv
+#else 
+               module procedure stdlib_ssygv
+#endif
+          end interface sygv
+
+
+
+          !> SYGVD: computes all the eigenvalues, and optionally, the eigenvectors
+          !> of a real generalized symmetric-definite eigenproblem, of the form
+          !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+          !> B are assumed to be symmetric and B is also positive definite.
+          !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+          !> The divide and conquer algorithm makes very mild assumptions about
+          !> floating point arithmetic. It will work on machines with a guard
+          !> digit in add/subtract, or on those binary machines without guard
+          !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+          !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+          !> without guard digits, but we know of none.
+          interface sygvd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dsygvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,lda,ldb,liwork,lwork,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: w(*),work(*)
+               end subroutine dsygvd
+#else 
+               module procedure stdlib_dsygvd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsygvd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ssygvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, iwork, &
+                         liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobz,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: itype,lda,ldb,liwork,lwork,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: w(*),work(*)
+               end subroutine ssygvd
+#else 
+               module procedure stdlib_ssygvd
+#endif
+          end interface sygvd
+
+
+
+          !> SYMV:  performs the matrix-vector  operation
+          !> y := alpha*A*x + beta*y,
+          !> where alpha and beta are scalars, x and y are n element vectors and
+          !> A is an n by n symmetric matrix.
+          interface symv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csymv( uplo, n, alpha, a, lda, x, incx, beta, y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    complex(sp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    complex(sp), intent(inout) :: y(*)
+               end subroutine csymv
+#else 
+               module procedure stdlib_csymv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsymv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsymv( uplo, n, alpha, a, lda, x, incx, beta, y, incy )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,incy,lda,n
+                    complex(dp), intent(in) :: alpha,beta,a(lda,*),x(*)
+                    complex(dp), intent(inout) :: y(*)
+               end subroutine zsymv
+#else 
+               module procedure stdlib_zsymv
+#endif
+          end interface symv
+
+
+
+          !> SYR:   performs the symmetric rank 1 operation
+          !> A := alpha*x*x**H + A,
+          !> where alpha is a complex scalar, x is an n element vector and A is an
+          !> n by n symmetric matrix.
+          interface syr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyr( uplo, n, alpha, x, incx, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,lda,n
+                    complex(sp), intent(in) :: alpha,x(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine csyr
+#else 
+               module procedure stdlib_csyr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyr( uplo, n, alpha, x, incx, a, lda )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: incx,lda,n
+                    complex(dp), intent(in) :: alpha,x(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zsyr
+#else 
+               module procedure stdlib_zsyr
+#endif
+          end interface syr
+
+
+
+          !> SYRFS: improves the computed solution to a system of linear
+          !> equations when the coefficient matrix is symmetric indefinite, and
+          !> provides error bounds and backward error estimates for the solution.
+          interface syrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x(ldx,*)
+               end subroutine csyrfs
+#else 
+               module procedure stdlib_csyrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(dp), intent(inout) :: x(ldx,*)
+               end subroutine dsyrfs
+#else 
+               module procedure stdlib_dsyrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+                    real(sp), intent(inout) :: x(ldx,*)
+               end subroutine ssyrfs
+#else 
+               module procedure stdlib_ssyrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr,&
+                          berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldaf,ldb,ldx,n,nrhs,ipiv(*)
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: a(lda,*),af(ldaf,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x(ldx,*)
+               end subroutine zsyrfs
+#else 
+               module procedure stdlib_zsyrfs
+#endif
+          end interface syrfs
+
+
+
+          !> SYSV: computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> The diagonal pivoting method is used to factor A as
+          !> A = U * D * U**T,  if UPLO = 'U', or
+          !> A = L * D * L**T,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is symmetric and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+          !> used to solve the system of equations A * X = B.
+          interface sysv
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csysv
+#else 
+               module procedure stdlib_csysv
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsysv
+#else 
+               module procedure stdlib_dsysv
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsysv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssysv
+#else 
+               module procedure stdlib_ssysv
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsysv
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsysv
+#else 
+               module procedure stdlib_zsysv
+#endif
+          end interface sysv
+
+
+
+          !> CSYSV computes the solution to a complex system of linear equations
+          !> A * X = B,
+          !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> Aasen's algorithm is used to factor A as
+          !> A = U**T * T * U,  if UPLO = 'U', or
+          !> A = L * T * L**T,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and T is symmetric tridiagonal. The factored
+          !> form of A is then used to solve the system of equations A * X = B.
+          interface sysv_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csysv_aa
+#else 
+               module procedure stdlib_csysv_aa
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsysv_aa
+#else 
+               module procedure stdlib_dsysv_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsysv_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssysv_aa
+#else 
+               module procedure stdlib_ssysv_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsysv_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsysv_aa
+#else 
+               module procedure stdlib_zsysv_aa
+#endif
+          end interface sysv_aa
+
+
+
+          !> SYSV_RK: computes the solution to a complex system of linear
+          !> equations A * X = B, where A is an N-by-N symmetric matrix
+          !> and X and B are N-by-NRHS matrices.
+          !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+          !> to factor A as
+          !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+          !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is symmetric and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> CSYTRF_RK is called to compute the factorization of a complex
+          !> symmetric matrix.  The factored form of A is then used to solve
+          !> the system of equations A * X = B by calling BLAS3 routine CSYTRS_3.
+          interface sysv_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: e(*),work(*)
+               end subroutine csysv_rk
+#else 
+               module procedure stdlib_csysv_rk
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: e(*),work(*)
+               end subroutine dsysv_rk
+#else 
+               module procedure stdlib_dsysv_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsysv_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: e(*),work(*)
+               end subroutine ssysv_rk
+#else 
+               module procedure stdlib_ssysv_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsysv_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: e(*),work(*)
+               end subroutine zsysv_rk
+#else 
+               module procedure stdlib_zsysv_rk
+#endif
+          end interface sysv_rk
+
+
+
+          !> SYSV_ROOK: computes the solution to a complex system of linear
+          !> equations
+          !> A * X = B,
+          !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+          !> matrices.
+          !> The diagonal pivoting method is used to factor A as
+          !> A = U * D * U**T,  if UPLO = 'U', or
+          !> A = L * D * L**T,  if UPLO = 'L',
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is symmetric and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> CSYTRF_ROOK is called to compute the factorization of a complex
+          !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+          !> pivoting method.
+          !> The factored form of A is then used to solve the system
+          !> of equations A * X = B by calling CSYTRS_ROOK.
+          interface sysv_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csysv_rook
+#else 
+               module procedure stdlib_csysv_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsysv_rook
+#else 
+               module procedure stdlib_dsysv_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsysv_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssysv_rook
+#else 
+               module procedure stdlib_ssysv_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsysv_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,ldb,lwork,n,nrhs
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsysv_rook
+#else 
+               module procedure stdlib_zsysv_rook
+#endif
+          end interface sysv_rook
+
+
+
+          !> SYSWAPR: applies an elementary permutation on the rows and the columns of
+          !> a symmetric matrix.
+          interface syswapr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csyswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    complex(sp), intent(inout) :: a(lda,n)
+               end subroutine csyswapr
+#else 
+               module procedure stdlib_csyswapr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsyswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    real(dp), intent(inout) :: a(lda,n)
+               end subroutine dsyswapr
+#else 
+               module procedure stdlib_dsyswapr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsyswapr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssyswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    real(sp), intent(inout) :: a(lda,n)
+               end subroutine ssyswapr
+#else 
+               module procedure stdlib_ssyswapr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsyswapr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsyswapr( uplo, n, a, lda, i1, i2)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: i1,i2,lda,n
+                    complex(dp), intent(inout) :: a(lda,n)
+               end subroutine zsyswapr
+#else 
+               module procedure stdlib_zsyswapr
+#endif
+          end interface syswapr
+
+
+
+          !> SYTF2_RK: computes the factorization of a complex symmetric matrix A
+          !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+          !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is symmetric and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+          !> For more information see Further Details section.
+          interface sytf2_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*)
+               end subroutine csytf2_rk
+#else 
+               module procedure stdlib_csytf2_rk
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: e(*)
+               end subroutine dsytf2_rk
+#else 
+               module procedure stdlib_dsytf2_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytf2_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: e(*)
+               end subroutine ssytf2_rk
+#else 
+               module procedure stdlib_ssytf2_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytf2_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*)
+               end subroutine zsytf2_rk
+#else 
+               module procedure stdlib_zsytf2_rk
+#endif
+          end interface sytf2_rk
+
+
+
+          !> SYTF2_ROOK: computes the factorization of a complex symmetric matrix A
+          !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+          !> A = U*D*U**T  or  A = L*D*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+          !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+          interface sytf2_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine csytf2_rook
+#else 
+               module procedure stdlib_csytf2_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dsytf2_rook
+#else 
+               module procedure stdlib_dsytf2_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytf2_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine ssytf2_rook
+#else 
+               module procedure stdlib_ssytf2_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytf2_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytf2_rook( uplo, n, a, lda, ipiv, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine zsytf2_rook
+#else 
+               module procedure stdlib_zsytf2_rook
+#endif
+          end interface sytf2_rook
+
+
+
+          !> SYTRD: reduces a real symmetric matrix A to real symmetric
+          !> tridiagonal form T by an orthogonal similarity transformation:
+          !> Q**T * A * Q = T.
+          interface sytrd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: d(*),e(*),tau(*),work(*)
+               end subroutine dsytrd
+#else 
+               module procedure stdlib_dsytrd
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: d(*),e(*),tau(*),work(*)
+               end subroutine ssytrd
+#else 
+               module procedure stdlib_ssytrd
+#endif
+          end interface sytrd
+
+
+
+          !> SYTRD_SB2ST: reduces a real symmetric band matrix A to real symmetric
+          !> tridiagonal form T by a orthogonal similarity transformation:
+          !> Q**T * A * Q = T.
+          interface sytrd_sb2st
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrd_sb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, &
+                         lhous, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: stage1,uplo,vect
+                    integer(ilp), intent(in) :: n,kd,ldab,lhous,lwork
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(out) :: d(*),e(*),hous(*),work(*)
+                    real(dp), intent(inout) :: ab(ldab,*)
+               end subroutine dsytrd_sb2st
+#else 
+               module procedure stdlib_dsytrd_sb2st
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrd_sb2st
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrd_sb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, &
+                         lhous, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: stage1,uplo,vect
+                    integer(ilp), intent(in) :: n,kd,ldab,lhous,lwork
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(out) :: d(*),e(*),hous(*),work(*)
+                    real(sp), intent(inout) :: ab(ldab,*)
+               end subroutine ssytrd_sb2st
+#else 
+               module procedure stdlib_ssytrd_sb2st
+#endif
+          end interface sytrd_sb2st
+
+
+
+          !> SYTRD_SY2SB: reduces a real symmetric matrix A to real symmetric
+          !> band-diagonal form AB by a orthogonal similarity transformation:
+          !> Q**T * A * Q = AB.
+          interface sytrd_sy2sb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrd_sy2sb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldab,lwork,n,kd
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: ab(ldab,*),tau(*),work(*)
+               end subroutine dsytrd_sy2sb
+#else 
+               module procedure stdlib_dsytrd_sy2sb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrd_sy2sb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrd_sy2sb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info &
+                         )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldab,lwork,n,kd
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: ab(ldab,*),tau(*),work(*)
+               end subroutine ssytrd_sy2sb
+#else 
+               module procedure stdlib_ssytrd_sy2sb
+#endif
+          end interface sytrd_sy2sb
+
+
+
+          !> SYTRF: computes the factorization of a complex symmetric matrix A
+          !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+          !> factorization is
+          !> A = U*D*U**T  or  A = L*D*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is symmetric and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface sytrf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytrf
+#else 
+               module procedure stdlib_csytrf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytrf
+#else 
+               module procedure stdlib_dsytrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytrf
+#else 
+               module procedure stdlib_ssytrf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytrf
+#else 
+               module procedure stdlib_zsytrf
+#endif
+          end interface sytrf
+
+
+
+          !> SYTRF_AA: computes the factorization of a complex symmetric matrix A
+          !> using the Aasen's algorithm.  The form of the factorization is
+          !> A = U**T*T*U  or  A = L*T*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and T is a complex symmetric tridiagonal matrix.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface sytrf_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytrf_aa
+#else 
+               module procedure stdlib_csytrf_aa
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytrf_aa
+#else 
+               module procedure stdlib_dsytrf_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrf_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytrf_aa
+#else 
+               module procedure stdlib_ssytrf_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrf_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,lda,lwork
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytrf_aa
+#else 
+               module procedure stdlib_zsytrf_aa
+#endif
+          end interface sytrf_aa
+
+
+
+          !> SYTRF_RK: computes the factorization of a complex symmetric matrix A
+          !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+          !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is symmetric and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          !> For more information see Further Details section.
+          interface sytrf_rk
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: e(*),work(*)
+               end subroutine csytrf_rk
+#else 
+               module procedure stdlib_csytrf_rk
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: e(*),work(*)
+               end subroutine dsytrf_rk
+#else 
+               module procedure stdlib_dsytrf_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrf_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: e(*),work(*)
+               end subroutine ssytrf_rk
+#else 
+               module procedure stdlib_ssytrf_rk
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrf_rk
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: e(*),work(*)
+               end subroutine zsytrf_rk
+#else 
+               module procedure stdlib_zsytrf_rk
+#endif
+          end interface sytrf_rk
+
+
+
+          !> SYTRF_ROOK: computes the factorization of a complex symmetric matrix A
+          !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+          !> The form of the factorization is
+          !> A = U*D*U**T  or  A = L*D*L**T
+          !> where U (or L) is a product of permutation and unit upper (lower)
+          !> triangular matrices, and D is symmetric and block diagonal with
+          !> 1-by-1 and 2-by-2 diagonal blocks.
+          !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+          interface sytrf_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytrf_rook
+#else 
+               module procedure stdlib_csytrf_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytrf_rook
+#else 
+               module procedure stdlib_dsytrf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytrf_rook
+#else 
+               module procedure stdlib_ssytrf_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrf_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info,ipiv(*)
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytrf_rook
+#else 
+               module procedure stdlib_zsytrf_rook
+#endif
+          end interface sytrf_rook
+
+
+
+          !> SYTRI: computes the inverse of a complex symmetric indefinite matrix
+          !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+          !> CSYTRF.
+          interface sytri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytri
+#else 
+               module procedure stdlib_csytri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytri
+#else 
+               module procedure stdlib_dsytri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytri
+#else 
+               module procedure stdlib_ssytri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytri( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytri
+#else 
+               module procedure stdlib_zsytri
+#endif
+          end interface sytri
+
+
+
+          !> SYTRI_ROOK: computes the inverse of a complex symmetric
+          !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+          !> computed by CSYTRF_ROOK.
+          interface sytri_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytri_rook
+#else 
+               module procedure stdlib_csytri_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytri_rook
+#else 
+               module procedure stdlib_dsytri_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytri_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytri_rook
+#else 
+               module procedure stdlib_ssytri_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytri_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytri_rook( uplo, n, a, lda, ipiv, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytri_rook
+#else 
+               module procedure stdlib_zsytri_rook
+#endif
+          end interface sytri_rook
+
+
+
+          !> SYTRS: solves a system of linear equations A*X = B with a complex
+          !> symmetric matrix A using the factorization A = U*D*U**T or
+          !> A = L*D*L**T computed by CSYTRF.
+          interface sytrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine csytrs
+#else 
+               module procedure stdlib_csytrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dsytrs
+#else 
+               module procedure stdlib_dsytrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine ssytrs
+#else 
+               module procedure stdlib_ssytrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zsytrs
+#else 
+               module procedure stdlib_zsytrs
+#endif
+          end interface sytrs
+
+
+
+          !> SYTRS2: solves a system of linear equations A*X = B with a complex
+          !> symmetric matrix A using the factorization A = U*D*U**T or
+          !> A = L*D*L**T computed by CSYTRF and converted by CSYCONV.
+          interface sytrs2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytrs2
+#else 
+               module procedure stdlib_csytrs2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytrs2
+#else 
+               module procedure stdlib_dsytrs2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrs2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytrs2
+#else 
+               module procedure stdlib_ssytrs2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrs2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytrs2
+#else 
+               module procedure stdlib_zsytrs2
+#endif
+          end interface sytrs2
+
+
+
+          !> SYTRS_3: solves a system of linear equations A * X = B with a complex
+          !> symmetric matrix A using the factorization computed
+          !> by CSYTRF_RK or CSYTRF_BK:
+          !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+          !> where U (or L) is unit upper (or lower) triangular matrix,
+          !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+          !> matrix, P**T is the transpose of P, and D is symmetric and block
+          !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+          !> This algorithm is using Level 3 BLAS.
+          interface sytrs_3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*),e(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine csytrs_3
+#else 
+               module procedure stdlib_csytrs_3
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*),e(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dsytrs_3
+#else 
+               module procedure stdlib_dsytrs_3
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrs_3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*),e(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine ssytrs_3
+#else 
+               module procedure stdlib_ssytrs_3
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrs_3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*),e(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zsytrs_3
+#else 
+               module procedure stdlib_zsytrs_3
+#endif
+          end interface sytrs_3
+
+
+
+          !> SYTRS_AA: solves a system of linear equations A*X = B with a complex
+          !> symmetric matrix A using the factorization A = U**T*T*U or
+          !> A = L*T*L**T computed by CSYTRF_AA.
+          interface sytrs_aa
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine csytrs_aa
+#else 
+               module procedure stdlib_csytrs_aa
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dsytrs_aa
+#else 
+               module procedure stdlib_dsytrs_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrs_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine ssytrs_aa
+#else 
+               module procedure stdlib_ssytrs_aa
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrs_aa
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(in) :: n,nrhs,lda,ldb,lwork,ipiv(*)
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zsytrs_aa
+#else 
+               module procedure stdlib_zsytrs_aa
+#endif
+          end interface sytrs_aa
+
+
+
+          !> SYTRS_ROOK: solves a system of linear equations A*X = B with
+          !> a complex symmetric matrix A using the factorization A = U*D*U**T or
+          !> A = L*D*L**T computed by CSYTRF_ROOK.
+          interface sytrs_rook
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine csytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine csytrs_rook
+#else 
+               module procedure stdlib_csytrs_rook
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dsytrs_rook
+#else 
+               module procedure stdlib_dsytrs_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_qsytrs_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ssytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine ssytrs_rook
+#else 
+               module procedure stdlib_ssytrs_rook
+#endif
+#:if WITH_QP
+               module procedure stdlib_wsytrs_rook
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs,ipiv(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine zsytrs_rook
+#else 
+               module procedure stdlib_zsytrs_rook
+#endif
+          end interface sytrs_rook
+
+
+
+          !> TBCON: estimates the reciprocal of the condition number of a
+          !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+          !> The norm of A is computed and an estimate is obtained for
+          !> norm(inv(A)), then the reciprocal of the condition number is
+          !> computed as
+          !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+          interface tbcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ctbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctbcon
+#else 
+               module procedure stdlib_ctbcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: rcond,work(*)
+                    real(dp), intent(in) :: ab(ldab,*)
+               end subroutine dtbcon
+#else 
+               module procedure stdlib_dtbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine stbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,iwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(sp), intent(out) :: rcond,work(*)
+                    real(sp), intent(in) :: ab(ldab,*)
+               end subroutine stbcon
+#else 
+               module procedure stdlib_stbcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtbcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ztbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,rwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,n
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztbcon
+#else 
+               module procedure stdlib_ztbcon
+#endif
+          end interface tbcon
+
+
+
+          !> TBRFS: provides error bounds and backward error estimates for the
+          !> solution to a system of linear equations with a triangular band
+          !> coefficient matrix.
+          !> The solution matrix X must be computed by CTBTRS or some other
+          !> means before entering this routine.  TBRFS does not do iterative
+          !> refinement because doing so cannot improve the backward error.
+          interface tbrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: ab(ldab,*),b(ldb,*),x(ldx,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctbrfs
+#else 
+               module procedure stdlib_ctbrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: ab(ldab,*),b(ldb,*),x(ldx,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine dtbrfs
+#else 
+               module procedure stdlib_dtbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, &
+                         ferr, berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: kd,ldab,ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: ab(ldab,*),b(ldb,*),x(ldx,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine stbrfs
+#else 
+               module procedure stdlib_stbrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtbrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, &
+                         ferr, berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: ab(ldab,*),b(ldb,*),x(ldx,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztbrfs
+#else 
+               module procedure stdlib_ztbrfs
+#endif
+          end interface tbrfs
+
+
+
+          !> TBTRS: solves a triangular system of the form
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+          !> where A is a triangular band matrix of order N, and B is an
+          !> N-by-NRHS matrix.  A check is made to verify that A is nonsingular.
+          interface tbtrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(sp), intent(in) :: ab(ldab,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine ctbtrs
+#else 
+               module procedure stdlib_ctbtrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(dp), intent(in) :: ab(ldab,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dtbtrs
+#else 
+               module procedure stdlib_dtbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    real(sp), intent(in) :: ab(ldab,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine stbtrs
+#else 
+               module procedure stdlib_stbtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtbtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: kd,ldab,ldb,n,nrhs
+                    complex(dp), intent(in) :: ab(ldab,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine ztbtrs
+#else 
+               module procedure stdlib_ztbtrs
+#endif
+          end interface tbtrs
+
+
+
+          !> Level 3 BLAS like routine for A in RFP Format.
+          !> TFSM: solves the matrix equation
+          !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+          !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+          !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+          !> op( A ) = A   or   op( A ) = A**H.
+          !> A is in Rectangular Full Packed (RFP) Format.
+          !> The matrix X is overwritten on B.
+          interface tfsm
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,diag,side,trans,uplo
+                    integer(ilp), intent(in) :: ldb,m,n
+                    complex(sp), intent(in) :: alpha,a(0:*)
+                    complex(sp), intent(inout) :: b(0:ldb-1,0:*)
+               end subroutine ctfsm
+#else 
+               module procedure stdlib_ctfsm
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,diag,side,trans,uplo
+                    integer(ilp), intent(in) :: ldb,m,n
+                    real(dp), intent(in) :: alpha,a(0:*)
+                    real(dp), intent(inout) :: b(0:ldb-1,0:*)
+               end subroutine dtfsm
+#else 
+               module procedure stdlib_dtfsm
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtfsm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,diag,side,trans,uplo
+                    integer(ilp), intent(in) :: ldb,m,n
+                    real(sp), intent(in) :: alpha,a(0:*)
+                    real(sp), intent(inout) :: b(0:ldb-1,0:*)
+               end subroutine stfsm
+#else 
+               module procedure stdlib_stfsm
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtfsm
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,diag,side,trans,uplo
+                    integer(ilp), intent(in) :: ldb,m,n
+                    complex(dp), intent(in) :: alpha,a(0:*)
+                    complex(dp), intent(inout) :: b(0:ldb-1,0:*)
+               end subroutine ztfsm
+#else 
+               module procedure stdlib_ztfsm
+#endif
+          end interface tfsm
+
+
+
+          !> TFTRI: computes the inverse of a triangular matrix A stored in RFP
+          !> format.
+          !> This is a Level 3 BLAS version of the algorithm.
+          interface tftri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctftri( transr, uplo, diag, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo,diag
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: a(0:*)
+               end subroutine ctftri
+#else 
+               module procedure stdlib_ctftri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtftri( transr, uplo, diag, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo,diag
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: a(0:*)
+               end subroutine dtftri
+#else 
+               module procedure stdlib_dtftri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtftri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stftri( transr, uplo, diag, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo,diag
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: a(0:*)
+               end subroutine stftri
+#else 
+               module procedure stdlib_stftri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtftri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztftri( transr, uplo, diag, n, a, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo,diag
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: a(0:*)
+               end subroutine ztftri
+#else 
+               module procedure stdlib_ztftri
+#endif
+          end interface tftri
+
+
+
+          !> TFTTP: copies a triangular matrix A from rectangular full packed
+          !> format (TF) to standard packed format (TP).
+          interface tfttp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctfttp( transr, uplo, n, arf, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(out) :: ap(0:*)
+                    complex(sp), intent(in) :: arf(0:*)
+               end subroutine ctfttp
+#else 
+               module procedure stdlib_ctfttp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtfttp( transr, uplo, n, arf, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: ap(0:*)
+                    real(dp), intent(in) :: arf(0:*)
+               end subroutine dtfttp
+#else 
+               module procedure stdlib_dtfttp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtfttp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stfttp( transr, uplo, n, arf, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: ap(0:*)
+                    real(sp), intent(in) :: arf(0:*)
+               end subroutine stfttp
+#else 
+               module procedure stdlib_stfttp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtfttp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztfttp( transr, uplo, n, arf, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(out) :: ap(0:*)
+                    complex(dp), intent(in) :: arf(0:*)
+               end subroutine ztfttp
+#else 
+               module procedure stdlib_ztfttp
+#endif
+          end interface tfttp
+
+
+
+          !> TFTTR: copies a triangular matrix A from rectangular full packed
+          !> format (TF) to standard full format (TR).
+          interface tfttr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctfttr( transr, uplo, n, arf, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(sp), intent(out) :: a(0:lda-1,0:*)
+                    complex(sp), intent(in) :: arf(0:*)
+               end subroutine ctfttr
+#else 
+               module procedure stdlib_ctfttr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtfttr( transr, uplo, n, arf, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(dp), intent(out) :: a(0:lda-1,0:*)
+                    real(dp), intent(in) :: arf(0:*)
+               end subroutine dtfttr
+#else 
+               module procedure stdlib_dtfttr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtfttr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stfttr( transr, uplo, n, arf, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(sp), intent(out) :: a(0:lda-1,0:*)
+                    real(sp), intent(in) :: arf(0:*)
+               end subroutine stfttr
+#else 
+               module procedure stdlib_stfttr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtfttr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztfttr( transr, uplo, n, arf, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(dp), intent(out) :: a(0:lda-1,0:*)
+                    complex(dp), intent(in) :: arf(0:*)
+               end subroutine ztfttr
+#else 
+               module procedure stdlib_ztfttr
+#endif
+          end interface tfttr
+
+
+
+          !> TGEVC: computes some or all of the right and/or left eigenvectors of
+          !> a pair of complex matrices (S,P), where S and P are upper triangular.
+          !> Matrix pairs of this type are produced by the generalized Schur
+          !> factorization of a complex matrix pair (A,B):
+          !> A = Q*S*Z**H,  B = Q*P*Z**H
+          !> as computed by CGGHRD + CHGEQZ.
+          !> The right eigenvector x and the left eigenvector y of (S,P)
+          !> corresponding to an eigenvalue w are defined by:
+          !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+          !> where y**H denotes the conjugate tranpose of y.
+          !> The eigenvalues are not input to this routine, but are computed
+          !> directly from the diagonal elements of S and P.
+          !> This routine returns the matrices X and/or Y of right and left
+          !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+          !> where Z and Q are input matrices.
+          !> If Q and Z are the unitary factors from the generalized Schur
+          !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+          !> are the matrices of right and left eigenvectors of (A,B).
+          interface tgevc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr,&
+                          mm, m, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldp,lds,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(in) :: p(ldp,*),s(lds,*)
+                    complex(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctgevc
+#else 
+               module procedure stdlib_ctgevc
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr,&
+                          mm, m, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldp,lds,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(in) :: p(ldp,*),s(lds,*)
+                    real(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtgevc
+#else 
+               module procedure stdlib_dtgevc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgevc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr,&
+                          mm, m, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldp,lds,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(in) :: p(ldp,*),s(lds,*)
+                    real(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine stgevc
+#else 
+               module procedure stdlib_stgevc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgevc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr,&
+                          mm, m, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldp,lds,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(in) :: p(ldp,*),s(lds,*)
+                    complex(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztgevc
+#else 
+               module procedure stdlib_ztgevc
+#endif
+          end interface tgevc
+
+
+
+          !> TGEXC: reorders the generalized Schur decomposition of a complex
+          !> matrix pair (A,B), using an unitary equivalence transformation
+          !> (A, B) := Q * (A, B) * Z**H, so that the diagonal block of (A, B) with
+          !> row index IFST is moved to row ILST.
+          !> (A, B) must be in generalized Schur canonical form, that is, A and
+          !> B are both upper triangular.
+          !> Optionally, the matrices Q and Z of generalized Schur vectors are
+          !> updated.
+          !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+          !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+          interface tgexc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz
+                    integer(ilp), intent(in) :: ifst,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(inout) :: ilst
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine ctgexc
+#else 
+               module procedure stdlib_ctgexc
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst,&
+                          work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz
+                    integer(ilp), intent(inout) :: ifst,ilst
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldq,ldz,lwork,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtgexc
+#else 
+               module procedure stdlib_dtgexc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgexc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst,&
+                          work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz
+                    integer(ilp), intent(inout) :: ifst,ilst
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldq,ldz,lwork,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine stgexc
+#else 
+               module procedure stdlib_stgexc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgexc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz
+                    integer(ilp), intent(in) :: ifst,lda,ldb,ldq,ldz,n
+                    integer(ilp), intent(inout) :: ilst
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine ztgexc
+#else 
+               module procedure stdlib_ztgexc
+#endif
+          end interface tgexc
+
+
+
+          !> TGSEN: reorders the generalized Schur decomposition of a complex
+          !> matrix pair (A, B) (in terms of an unitary equivalence trans-
+          !> formation Q**H * (A, B) * Z), so that a selected cluster of eigenvalues
+          !> appears in the leading diagonal blocks of the pair (A,B). The leading
+          !> columns of Q and Z form unitary bases of the corresponding left and
+          !> right eigenspaces (deflating subspaces). (A, B) must be in
+          !> generalized Schur canonical form, that is, A and B are both upper
+          !> triangular.
+          !> TGSEN also computes the generalized eigenvalues
+          !> w(j)= ALPHA(j) / BETA(j)
+          !> of the reordered matrix pair (A, B).
+          !> Optionally, the routine computes estimates of reciprocal condition
+          !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+          !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+          !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+          !> the selected cluster and the eigenvalues outside the cluster, resp.,
+          !> and norms of "projections" onto left and right eigenspaces w.r.t.
+          !> the selected cluster in the (1,1)-block.
+          interface tgsen
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alpha, beta, &
+                         q, ldq, z, ldz, m, pl, pr, dif,work, lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz,select(*)
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldq,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    real(sp), intent(out) :: pl,pr,dif(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    complex(sp), intent(out) :: alpha(*),beta(*),work(*)
+               end subroutine ctgsen
+#else 
+               module procedure stdlib_ctgsen
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alphar, &
+               alphai, beta, q, ldq, z, ldz, m, pl,pr, dif, work, lwork, iwork, liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz,select(*)
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldq,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    real(dp), intent(out) :: pl,pr,alphai(*),alphar(*),beta(*),dif(*),work(*)
+                              
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine dtgsen
+#else 
+               module procedure stdlib_dtgsen
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgsen
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alphar, &
+               alphai, beta, q, ldq, z, ldz, m, pl,pr, dif, work, lwork, iwork, liwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz,select(*)
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldq,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    real(sp), intent(out) :: pl,pr,alphai(*),alphar(*),beta(*),dif(*),work(*)
+                              
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+               end subroutine stgsen
+#else 
+               module procedure stdlib_stgsen
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgsen
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alpha, beta, &
+                         q, ldq, z, ldz, m, pl, pr, dif,work, lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    logical(lk), intent(in) :: wantq,wantz,select(*)
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldq,ldz,liwork,lwork,n
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    real(dp), intent(out) :: pl,pr,dif(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),z(ldz,*)
+                    complex(dp), intent(out) :: alpha(*),beta(*),work(*)
+               end subroutine ztgsen
+#else 
+               module procedure stdlib_ztgsen
+#endif
+          end interface tgsen
+
+
+
+          !> TGSJA: computes the generalized singular value decomposition (GSVD)
+          !> of two complex upper triangular (or trapezoidal) matrices A and B.
+          !> On entry, it is assumed that matrices A and B have the following
+          !> forms, which may be obtained by the preprocessing subroutine CGGSVP
+          !> from a general M-by-N matrix A and P-by-N matrix B:
+          !> N-K-L  K    L
+          !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+          !> L ( 0     0   A23 )
+          !> M-K-L ( 0     0    0  )
+          !> N-K-L  K    L
+          !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+          !> M-K ( 0     0   A23 )
+          !> N-K-L  K    L
+          !> B =  L ( 0     0   B13 )
+          !> P-L ( 0     0    0  )
+          !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+          !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+          !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+          !> On exit,
+          !> U**H *A*Q = D1*( 0 R ),    V**H *B*Q = D2*( 0 R ),
+          !> where U, V and Q are unitary matrices.
+          !> R is a nonsingular upper triangular matrix, and D1
+          !> and D2 are ``diagonal'' matrices, which are of the following
+          !> structures:
+          !> If M-K-L >= 0,
+          !> K  L
+          !> D1 =     K ( I  0 )
+          !> L ( 0  C )
+          !> M-K-L ( 0  0 )
+          !> K  L
+          !> D2 = L   ( 0  S )
+          !> P-L ( 0  0 )
+          !> N-K-L  K    L
+          !> ( 0 R ) = K (  0   R11  R12 ) K
+          !> L (  0    0   R22 ) L
+          !> where
+          !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+          !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+          !> C**2 + S**2 = I.
+          !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+          !> If M-K-L < 0,
+          !> K M-K K+L-M
+          !> D1 =   K ( I  0    0   )
+          !> M-K ( 0  C    0   )
+          !> K M-K K+L-M
+          !> D2 =   M-K ( 0  S    0   )
+          !> K+L-M ( 0  0    I   )
+          !> P-L ( 0  0    0   )
+          !> N-K-L  K   M-K  K+L-M
+          !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+          !> M-K ( 0     0   R22  R23  )
+          !> K+L-M ( 0     0    0   R33  )
+          !> where
+          !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+          !> S = diag( BETA(K+1),  ... , BETA(M) ),
+          !> C**2 + S**2 = I.
+          !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+          !> (  0  R22 R23 )
+          !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+          !> The computation of the unitary transformation matrices U, V or Q
+          !> is optional.  These matrices may either be formed explicitly, or they
+          !> may be postmultiplied into input matrices U1, V1, or Q1.
+          interface tgsja
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb,&
+                          alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobq,jobu,jobv
+                    integer(ilp), intent(out) :: info,ncycle
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldq,ldu,ldv,m,n,p
+                    real(sp), intent(in) :: tola,tolb
+                    real(sp), intent(out) :: alpha(*),beta(*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),u(ldu,*),v(ldv,*)
+                              
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctgsja
+#else 
+               module procedure stdlib_ctgsja
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb,&
+                          alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobq,jobu,jobv
+                    integer(ilp), intent(out) :: info,ncycle
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldq,ldu,ldv,m,n,p
+                    real(dp), intent(in) :: tola,tolb
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),u(ldu,*),v(ldv,*)
+                              
+                    real(dp), intent(out) :: alpha(*),beta(*),work(*)
+               end subroutine dtgsja
+#else 
+               module procedure stdlib_dtgsja
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgsja
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb,&
+                          alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobq,jobu,jobv
+                    integer(ilp), intent(out) :: info,ncycle
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldq,ldu,ldv,m,n,p
+                    real(sp), intent(in) :: tola,tolb
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),u(ldu,*),v(ldv,*)
+                              
+                    real(sp), intent(out) :: alpha(*),beta(*),work(*)
+               end subroutine stgsja
+#else 
+               module procedure stdlib_stgsja
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgsja
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb,&
+                          alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobq,jobu,jobv
+                    integer(ilp), intent(out) :: info,ncycle
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldq,ldu,ldv,m,n,p
+                    real(dp), intent(in) :: tola,tolb
+                    real(dp), intent(out) :: alpha(*),beta(*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*),q(ldq,*),u(ldu,*),v(ldv,*)
+                              
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztgsja
+#else 
+               module procedure stdlib_ztgsja
+#endif
+          end interface tgsja
+
+
+
+          !> TGSNA: estimates reciprocal condition numbers for specified
+          !> eigenvalues and/or eigenvectors of a matrix pair (A, B).
+          !> (A, B) must be in generalized Schur canonical form, that is, A and
+          !> B are both upper triangular.
+          interface tgsna
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                         s, dif, mm, m, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: dif(*),s(*)
+                    complex(sp), intent(in) :: a(lda,*),b(ldb,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctgsna
+#else 
+               module procedure stdlib_ctgsna
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                         s, dif, mm, m, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(in) :: a(lda,*),b(ldb,*),vl(ldvl,*),vr(ldvr,*)
+                    real(dp), intent(out) :: dif(*),s(*),work(*)
+               end subroutine dtgsna
+#else 
+               module procedure stdlib_dtgsna
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgsna
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                         s, dif, mm, m, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(in) :: a(lda,*),b(ldb,*),vl(ldvl,*),vr(ldvr,*)
+                    real(sp), intent(out) :: dif(*),s(*),work(*)
+               end subroutine stgsna
+#else 
+               module procedure stdlib_stgsna
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgsna
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                         s, dif, mm, m, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: dif(*),s(*)
+                    complex(dp), intent(in) :: a(lda,*),b(ldb,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztgsna
+#else 
+               module procedure stdlib_ztgsna
+#endif
+          end interface tgsna
+
+
+
+          !> TGSYL: solves the generalized Sylvester equation:
+          !> A * R - L * B = scale * C            (1)
+          !> D * R - L * E = scale * F
+          !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+          !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+          !> respectively, with complex entries. A, B, D and E are upper
+          !> triangular (i.e., (A,D) and (B,E) in generalized Schur form).
+          !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1
+          !> is an output scaling factor chosen to avoid overflow.
+          !> In matrix notation (1) is equivalent to solve Zx = scale*b, where Z
+          !> is defined as
+          !> Z = [ kron(In, A)  -kron(B**H, Im) ]        (2)
+          !> [ kron(In, D)  -kron(E**H, Im) ],
+          !> Here Ix is the identity matrix of size x and X**H is the conjugate
+          !> transpose of X. Kron(X, Y) is the Kronecker product between the
+          !> matrices X and Y.
+          !> If TRANS = 'C', y in the conjugate transposed system Z**H *y = scale*b
+          !> is solved for, which is equivalent to solve for R and L in
+          !> A**H * R + D**H * L = scale * C           (3)
+          !> R * B**H + L * E**H = scale * -F
+          !> This case (TRANS = 'C') is used to compute an one-norm-based estimate
+          !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+          !> and (B,E), using CLACON.
+          !> If IJOB >= 1, TGSYL computes a Frobenius norm-based estimate of
+          !> Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+          !> reciprocal of the smallest singular value of Z.
+          !> This is a level-3 BLAS algorithm.
+          interface tgsyl
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, &
+                         f, ldf, scale, dif, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldc,ldd,lde,ldf,lwork,m,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(out) :: dif,scale
+                    complex(sp), intent(in) :: a(lda,*),b(ldb,*),d(ldd,*),e(lde,*)
+                    complex(sp), intent(inout) :: c(ldc,*),f(ldf,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctgsyl
+#else 
+               module procedure stdlib_ctgsyl
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, &
+                         f, ldf, scale, dif, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldc,ldd,lde,ldf,lwork,m,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(out) :: dif,scale,work(*)
+                    real(dp), intent(in) :: a(lda,*),b(ldb,*),d(ldd,*),e(lde,*)
+                    real(dp), intent(inout) :: c(ldc,*),f(ldf,*)
+               end subroutine dtgsyl
+#else 
+               module procedure stdlib_dtgsyl
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtgsyl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, &
+                         f, ldf, scale, dif, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldc,ldd,lde,ldf,lwork,m,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(sp), intent(out) :: dif,scale,work(*)
+                    real(sp), intent(in) :: a(lda,*),b(ldb,*),d(ldd,*),e(lde,*)
+                    real(sp), intent(inout) :: c(ldc,*),f(ldf,*)
+               end subroutine stgsyl
+#else 
+               module procedure stdlib_stgsyl
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtgsyl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, &
+                         f, ldf, scale, dif, work, lwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trans
+                    integer(ilp), intent(in) :: ijob,lda,ldb,ldc,ldd,lde,ldf,lwork,m,n
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    real(dp), intent(out) :: dif,scale
+                    complex(dp), intent(in) :: a(lda,*),b(ldb,*),d(ldd,*),e(lde,*)
+                    complex(dp), intent(inout) :: c(ldc,*),f(ldf,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztgsyl
+#else 
+               module procedure stdlib_ztgsyl
+#endif
+          end interface tgsyl
+
+
+
+          !> TPCON: estimates the reciprocal of the condition number of a packed
+          !> triangular matrix A, in either the 1-norm or the infinity-norm.
+          !> The norm of A is computed and an estimate is obtained for
+          !> norm(inv(A)), then the reciprocal of the condition number is
+          !> computed as
+          !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+          interface tpcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ctpcon( norm, uplo, diag, n, ap, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctpcon
+#else 
+               module procedure stdlib_ctpcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtpcon( norm, uplo, diag, n, ap, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: rcond,work(*)
+                    real(dp), intent(in) :: ap(*)
+               end subroutine dtpcon
+#else 
+               module procedure stdlib_dtpcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine stpcon( norm, uplo, diag, n, ap, rcond, work, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(out) :: rcond,work(*)
+                    real(sp), intent(in) :: ap(*)
+               end subroutine stpcon
+#else 
+               module procedure stdlib_stpcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ztpcon( norm, uplo, diag, n, ap, rcond, work, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztpcon
+#else 
+               module procedure stdlib_ztpcon
+#endif
+          end interface tpcon
+
+
+
+          !> TPLQT: computes a blocked LQ factorization of a complex
+          !> "triangular-pentagonal" matrix C, which is composed of a
+          !> triangular block A and pentagonal block B, using the compact
+          !> WY representation for Q.
+          interface tplqt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,mb
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine ctplqt
+#else 
+               module procedure stdlib_ctplqt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,mb
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine dtplqt
+#else 
+               module procedure stdlib_dtplqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtplqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,mb
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine stplqt
+#else 
+               module procedure stdlib_stplqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtplqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,mb
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine ztplqt
+#else 
+               module procedure stdlib_ztplqt
+#endif
+          end interface tplqt
+
+
+
+          !> TPLQT2: computes a LQ a factorization of a complex "triangular-pentagonal"
+          !> matrix C, which is composed of a triangular block A and pentagonal block B,
+          !> using the compact WY representation for Q.
+          interface tplqt2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: t(ldt,*)
+               end subroutine ctplqt2
+#else 
+               module procedure stdlib_ctplqt2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: t(ldt,*)
+               end subroutine dtplqt2
+#else 
+               module procedure stdlib_dtplqt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtplqt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: t(ldt,*)
+               end subroutine stplqt2
+#else 
+               module procedure stdlib_stplqt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtplqt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: t(ldt,*)
+               end subroutine ztplqt2
+#else 
+               module procedure stdlib_ztplqt2
+#endif
+          end interface tplqt2
+
+
+
+          !> TPMLQT: applies a complex unitary matrix Q obtained from a
+          !> "triangular-pentagonal" complex block reflector H to a general
+          !> complex matrix C, which consists of two blocks A and B.
+          interface tpmlqt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,mb,ldt
+                    complex(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctpmlqt
+#else 
+               module procedure stdlib_ctpmlqt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,mb,ldt
+                    real(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtpmlqt
+#else 
+               module procedure stdlib_dtpmlqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpmlqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,mb,ldt
+                    real(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine stpmlqt
+#else 
+               module procedure stdlib_stpmlqt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpmlqt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,mb,ldt
+                    complex(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztpmlqt
+#else 
+               module procedure stdlib_ztpmlqt
+#endif
+          end interface tpmlqt
+
+
+
+          !> TPMQRT: applies a complex orthogonal matrix Q obtained from a
+          !> "triangular-pentagonal" complex block reflector H to a general
+          !> complex matrix C, which consists of two blocks A and B.
+          interface tpmqrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,nb,ldt
+                    complex(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctpmqrt
+#else 
+               module procedure stdlib_ctpmqrt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,nb,ldt
+                    real(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtpmqrt
+#else 
+               module procedure stdlib_dtpmqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpmqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,nb,ldt
+                    real(sp), intent(in) :: v(ldv,*),t(ldt,*)
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine stpmqrt
+#else 
+               module procedure stdlib_stpmqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpmqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, &
+                         ldb, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,ldv,lda,ldb,m,n,l,nb,ldt
+                    complex(dp), intent(in) :: v(ldv,*),t(ldt,*)
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztpmqrt
+#else 
+               module procedure stdlib_ztpmqrt
+#endif
+          end interface tpmqrt
+
+
+
+          !> TPQRT: computes a blocked QR factorization of a complex
+          !> "triangular-pentagonal" matrix C, which is composed of a
+          !> triangular block A and pentagonal block B, using the compact
+          !> WY representation for Q.
+          interface tpqrt
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,nb
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine ctpqrt
+#else 
+               module procedure stdlib_ctpqrt
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,nb
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine dtpqrt
+#else 
+               module procedure stdlib_dtpqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,nb
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: t(ldt,*),work(*)
+               end subroutine stpqrt
+#else 
+               module procedure stdlib_stpqrt
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpqrt
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l,nb
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: t(ldt,*),work(*)
+               end subroutine ztpqrt
+#else 
+               module procedure stdlib_ztpqrt
+#endif
+          end interface tpqrt
+
+
+
+          !> TPQRT2: computes a QR factorization of a complex "triangular-pentagonal"
+          !> matrix C, which is composed of a triangular block A and pentagonal block B,
+          !> using the compact WY representation for Q.
+          interface tpqrt2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(out) :: t(ldt,*)
+               end subroutine ctpqrt2
+#else 
+               module procedure stdlib_ctpqrt2
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(out) :: t(ldt,*)
+               end subroutine dtpqrt2
+#else 
+               module procedure stdlib_dtpqrt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpqrt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(out) :: t(ldt,*)
+               end subroutine stpqrt2
+#else 
+               module procedure stdlib_stpqrt2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpqrt2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldt,n,m,l
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(out) :: t(ldt,*)
+               end subroutine ztpqrt2
+#else 
+               module procedure stdlib_ztpqrt2
+#endif
+          end interface tpqrt2
+
+
+
+          !> TPRFB: applies a complex "triangular-pentagonal" block reflector H or its
+          !> conjugate transpose H**H to a complex matrix C, which is composed of two
+          !> blocks A and B, either from the left or right.
+          interface tprfb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+                         lda, b, ldb, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldt,ldv,ldwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(in) :: t(ldt,*),v(ldv,*)
+                    complex(sp), intent(out) :: work(ldwork,*)
+               end subroutine ctprfb
+#else 
+               module procedure stdlib_ctprfb
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+                         lda, b, ldb, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldt,ldv,ldwork,m,n
+                    real(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(in) :: t(ldt,*),v(ldv,*)
+                    real(dp), intent(out) :: work(ldwork,*)
+               end subroutine dtprfb
+#else 
+               module procedure stdlib_dtprfb
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtprfb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+                         lda, b, ldb, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldt,ldv,ldwork,m,n
+                    real(sp), intent(inout) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(in) :: t(ldt,*),v(ldv,*)
+                    real(sp), intent(out) :: work(ldwork,*)
+               end subroutine stprfb
+#else 
+               module procedure stdlib_stprfb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtprfb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+                         lda, b, ldb, work, ldwork )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: direct,side,storev,trans
+                    integer(ilp), intent(in) :: k,l,lda,ldb,ldt,ldv,ldwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(in) :: t(ldt,*),v(ldv,*)
+                    complex(dp), intent(out) :: work(ldwork,*)
+               end subroutine ztprfb
+#else 
+               module procedure stdlib_ztprfb
+#endif
+          end interface tprfb
+
+
+
+          !> TPRFS: provides error bounds and backward error estimates for the
+          !> solution to a system of linear equations with a triangular packed
+          !> coefficient matrix.
+          !> The solution matrix X must be computed by CTPTRS or some other
+          !> means before entering this routine.  TPRFS does not do iterative
+          !> refinement because doing so cannot improve the backward error.
+          interface tprfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: ap(*),b(ldb,*),x(ldx,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctprfs
+#else 
+               module procedure stdlib_ctprfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+                         work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: ap(*),b(ldb,*),x(ldx,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine dtprfs
+#else 
+               module procedure stdlib_dtprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+                         work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: ap(*),b(ldb,*),x(ldx,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine stprfs
+#else 
+               module procedure stdlib_stprfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtprfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: ap(*),b(ldb,*),x(ldx,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztprfs
+#else 
+               module procedure stdlib_ztprfs
+#endif
+          end interface tprfs
+
+
+
+          !> TPTRI: computes the inverse of a complex upper or lower triangular
+          !> matrix A stored in packed format.
+          interface tptri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctptri( uplo, diag, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(inout) :: ap(*)
+               end subroutine ctptri
+#else 
+               module procedure stdlib_ctptri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtptri( uplo, diag, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(inout) :: ap(*)
+               end subroutine dtptri
+#else 
+               module procedure stdlib_dtptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stptri( uplo, diag, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(inout) :: ap(*)
+               end subroutine stptri
+#else 
+               module procedure stdlib_stptri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtptri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztptri( uplo, diag, n, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(inout) :: ap(*)
+               end subroutine ztptri
+#else 
+               module procedure stdlib_ztptri
+#endif
+          end interface tptri
+
+
+
+          !> TPTRS: solves a triangular system of the form
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+          !> where A is a triangular matrix of order N stored in packed format,
+          !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+          !> nonsingular.
+          interface tptrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(sp), intent(in) :: ap(*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine ctptrs
+#else 
+               module procedure stdlib_ctptrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(dp), intent(in) :: ap(*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dtptrs
+#else 
+               module procedure stdlib_dtptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    real(sp), intent(in) :: ap(*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine stptrs
+#else 
+               module procedure stdlib_stptrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtptrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldb,n,nrhs
+                    complex(dp), intent(in) :: ap(*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine ztptrs
+#else 
+               module procedure stdlib_ztptrs
+#endif
+          end interface tptrs
+
+
+
+          !> TPTTF: copies a triangular matrix A from standard packed format (TP)
+          !> to rectangular full packed format (TF).
+          interface tpttf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpttf( transr, uplo, n, ap, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(sp), intent(in) :: ap(0:*)
+                    complex(sp), intent(out) :: arf(0:*)
+               end subroutine ctpttf
+#else 
+               module procedure stdlib_ctpttf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpttf( transr, uplo, n, ap, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(dp), intent(in) :: ap(0:*)
+                    real(dp), intent(out) :: arf(0:*)
+               end subroutine dtpttf
+#else 
+               module procedure stdlib_dtpttf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpttf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpttf( transr, uplo, n, ap, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    real(sp), intent(in) :: ap(0:*)
+                    real(sp), intent(out) :: arf(0:*)
+               end subroutine stpttf
+#else 
+               module procedure stdlib_stpttf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpttf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpttf( transr, uplo, n, ap, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n
+                    complex(dp), intent(in) :: ap(0:*)
+                    complex(dp), intent(out) :: arf(0:*)
+               end subroutine ztpttf
+#else 
+               module procedure stdlib_ztpttf
+#endif
+          end interface tpttf
+
+
+
+          !> TPTTR: copies a triangular matrix A from standard packed format (TP)
+          !> to standard full format (TR).
+          interface tpttr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctpttr( uplo, n, ap, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(sp), intent(out) :: a(lda,*)
+                    complex(sp), intent(in) :: ap(*)
+               end subroutine ctpttr
+#else 
+               module procedure stdlib_ctpttr
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtpttr( uplo, n, ap, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(dp), intent(out) :: a(lda,*)
+                    real(dp), intent(in) :: ap(*)
+               end subroutine dtpttr
+#else 
+               module procedure stdlib_dtpttr
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtpttr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stpttr( uplo, n, ap, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(sp), intent(out) :: a(lda,*)
+                    real(sp), intent(in) :: ap(*)
+               end subroutine stpttr
+#else 
+               module procedure stdlib_stpttr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtpttr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztpttr( uplo, n, ap, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(dp), intent(out) :: a(lda,*)
+                    complex(dp), intent(in) :: ap(*)
+               end subroutine ztpttr
+#else 
+               module procedure stdlib_ztpttr
+#endif
+          end interface tpttr
+
+
+
+          !> TRCON: estimates the reciprocal of the condition number of a
+          !> triangular matrix A, in either the 1-norm or the infinity-norm.
+          !> The norm of A is computed and an estimate is obtained for
+          !> norm(inv(A)), then the reciprocal of the condition number is
+          !> computed as
+          !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+          interface trcon
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ctrcon( norm, uplo, diag, n, a, lda, rcond, work,rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: rcond,rwork(*)
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctrcon
+#else 
+               module procedure stdlib_ctrcon
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtrcon( norm, uplo, diag, n, a, lda, rcond, work,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: rcond,work(*)
+                    real(dp), intent(in) :: a(lda,*)
+               end subroutine dtrcon
+#else 
+               module procedure stdlib_dtrcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine strcon( norm, uplo, diag, n, a, lda, rcond, work,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(out) :: rcond,work(*)
+                    real(sp), intent(in) :: a(lda,*)
+               end subroutine strcon
+#else 
+               module procedure stdlib_strcon
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrcon
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ztrcon( norm, uplo, diag, n, a, lda, rcond, work,rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,norm,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(out) :: rcond,rwork(*)
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztrcon
+#else 
+               module procedure stdlib_ztrcon
+#endif
+          end interface trcon
+
+
+
+          !> TREVC: computes some or all of the right and/or left eigenvectors of
+          !> a complex upper triangular matrix T.
+          !> Matrices of this type are produced by the Schur factorization of
+          !> a complex general matrix:  A = Q*T*Q**H, as computed by CHSEQR.
+          !> The right eigenvector x and the left eigenvector y of T corresponding
+          !> to an eigenvalue w are defined by:
+          !> T*x = w*x,     (y**H)*T = w*(y**H)
+          !> where y**H denotes the conjugate transpose of the vector y.
+          !> The eigenvalues are not input to this routine, but are read directly
+          !> from the diagonal of T.
+          !> This routine returns the matrices X and/or Y of right and left
+          !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+          !> input matrix.  If Q is the unitary factor that reduces a matrix A to
+          !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+          !> eigenvectors of A.
+          interface trevc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctrevc
+#else 
+               module procedure stdlib_ctrevc
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+                         work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(dp), intent(in) :: t(ldt,*)
+                    real(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtrevc
+#else 
+               module procedure stdlib_dtrevc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrevc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+                         work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(sp), intent(in) :: t(ldt,*)
+                    real(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine strevc
+#else 
+               module procedure stdlib_strevc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrevc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+                         work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztrevc
+#else 
+               module procedure stdlib_ztrevc
+#endif
+          end interface trevc
+
+
+
+          !> TREVC3: computes some or all of the right and/or left eigenvectors of
+          !> a complex upper triangular matrix T.
+          !> Matrices of this type are produced by the Schur factorization of
+          !> a complex general matrix:  A = Q*T*Q**H, as computed by CHSEQR.
+          !> The right eigenvector x and the left eigenvector y of T corresponding
+          !> to an eigenvalue w are defined by:
+          !> T*x = w*x,     (y**H)*T = w*(y**H)
+          !> where y**H denotes the conjugate transpose of the vector y.
+          !> The eigenvalues are not input to this routine, but are read directly
+          !> from the diagonal of T.
+          !> This routine returns the matrices X and/or Y of right and left
+          !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+          !> input matrix. If Q is the unitary factor that reduces a matrix A to
+          !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+          !> eigenvectors of A.
+          !> This uses a Level 3 BLAS version of the back transformation.
+          interface trevc3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrevc3( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m,&
+                          work, lwork, rwork, lrwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,lwork,lrwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: rwork(*)
+                    complex(sp), intent(inout) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctrevc3
+#else 
+               module procedure stdlib_ctrevc3
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrevc3( side, howmny, select, n, t, ldt, vl, ldvl,vr, ldvr, mm, m,&
+                          work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(dp), intent(in) :: t(ldt,*)
+                    real(dp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtrevc3
+#else 
+               module procedure stdlib_dtrevc3
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrevc3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strevc3( side, howmny, select, n, t, ldt, vl, ldvl,vr, ldvr, mm, m,&
+                          work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,lwork,mm,n
+                    logical(lk), intent(inout) :: select(*)
+                    real(sp), intent(in) :: t(ldt,*)
+                    real(sp), intent(inout) :: vl(ldvl,*),vr(ldvr,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine strevc3
+#else 
+               module procedure stdlib_strevc3
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrevc3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrevc3( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m,&
+                          work, lwork, rwork, lrwork, info)
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,side
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,lwork,lrwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: rwork(*)
+                    complex(dp), intent(inout) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztrevc3
+#else 
+               module procedure stdlib_ztrevc3
+#endif
+          end interface trevc3
+
+
+
+          !> TREXC: reorders the Schur factorization of a complex matrix
+          !> A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+          !> is moved to row ILST.
+          !> The Schur form T is reordered by a unitary similarity transformation
+          !> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+          !> postmultplying it with Z.
+          interface trexc
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrexc( compq, n, t, ldt, q, ldq, ifst, ilst, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq
+                    integer(ilp), intent(in) :: ifst,ilst,ldq,ldt,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: q(ldq,*),t(ldt,*)
+               end subroutine ctrexc
+#else 
+               module procedure stdlib_ctrexc
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtrexc( compq, n, t, ldt, q, ldq, ifst, ilst, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq
+                    integer(ilp), intent(inout) :: ifst,ilst
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,ldt,n
+                    real(dp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    real(dp), intent(out) :: work(*)
+               end subroutine dtrexc
+#else 
+               module procedure stdlib_dtrexc
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrexc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine strexc( compq, n, t, ldt, q, ldq, ifst, ilst, work,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq
+                    integer(ilp), intent(inout) :: ifst,ilst
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,ldt,n
+                    real(sp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    real(sp), intent(out) :: work(*)
+               end subroutine strexc
+#else 
+               module procedure stdlib_strexc
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrexc
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrexc( compq, n, t, ldt, q, ldq, ifst, ilst, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq
+                    integer(ilp), intent(in) :: ifst,ilst,ldq,ldt,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: q(ldq,*),t(ldt,*)
+               end subroutine ztrexc
+#else 
+               module procedure stdlib_ztrexc
+#endif
+          end interface trexc
+
+
+
+          !> TRRFS: provides error bounds and backward error estimates for the
+          !> solution to a system of linear equations with a triangular
+          !> coefficient matrix.
+          !> The solution matrix X must be computed by CTRTRS or some other
+          !> means before entering this routine.  TRRFS does not do iterative
+          !> refinement because doing so cannot improve the backward error.
+          interface trrfs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, &
+                         berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldx,n,nrhs
+                    real(sp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(sp), intent(in) :: a(lda,*),b(ldb,*),x(ldx,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine ctrrfs
+#else 
+               module procedure stdlib_ctrrfs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, &
+                         berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldx,n,nrhs
+                    real(dp), intent(in) :: a(lda,*),b(ldb,*),x(ldx,*)
+                    real(dp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine dtrrfs
+#else 
+               module procedure stdlib_dtrrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, &
+                         berr, work, iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: lda,ldb,ldx,n,nrhs
+                    real(sp), intent(in) :: a(lda,*),b(ldb,*),x(ldx,*)
+                    real(sp), intent(out) :: berr(*),ferr(*),work(*)
+               end subroutine strrfs
+#else 
+               module procedure stdlib_strrfs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrrfs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, &
+                         berr, work, rwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,ldx,n,nrhs
+                    real(dp), intent(out) :: berr(*),ferr(*),rwork(*)
+                    complex(dp), intent(in) :: a(lda,*),b(ldb,*),x(ldx,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine ztrrfs
+#else 
+               module procedure stdlib_ztrrfs
+#endif
+          end interface trrfs
+
+
+
+          !> TRSEN: reorders the Schur factorization of a complex matrix
+          !> A = Q*T*Q**H, so that a selected cluster of eigenvalues appears in
+          !> the leading positions on the diagonal of the upper triangular matrix
+          !> T, and the leading columns of Q form an orthonormal basis of the
+          !> corresponding right invariant subspace.
+          !> Optionally the routine computes the reciprocal condition numbers of
+          !> the cluster of eigenvalues and/or the invariant subspace.
+          interface trsen
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ctrsen( job, compq, select, n, t, ldt, q, ldq, w, m, s,sep, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,job
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldq,ldt,lwork,n
+                    real(sp), intent(out) :: s,sep
+                    logical(lk), intent(in) :: select(*)
+                    complex(sp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    complex(sp), intent(out) :: w(*),work(*)
+               end subroutine ctrsen
+#else 
+               module procedure stdlib_ctrsen
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtrsen( job, compq, select, n, t, ldt, q, ldq, wr, wi,m, s, sep, work, &
+                         lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: ldq,ldt,liwork,lwork,n
+                    real(dp), intent(out) :: s,sep,wi(*),work(*),wr(*)
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(inout) :: q(ldq,*),t(ldt,*)
+               end subroutine dtrsen
+#else 
+               module procedure stdlib_dtrsen
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrsen
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine strsen( job, compq, select, n, t, ldt, q, ldq, wr, wi,m, s, sep, work, &
+                         lwork, iwork, liwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: ldq,ldt,liwork,lwork,n
+                    real(sp), intent(out) :: s,sep,wi(*),work(*),wr(*)
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(inout) :: q(ldq,*),t(ldt,*)
+               end subroutine strsen
+#else 
+               module procedure stdlib_strsen
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrsen
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ztrsen( job, compq, select, n, t, ldt, q, ldq, w, m, s,sep, work, lwork,&
+                          info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: compq,job
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldq,ldt,lwork,n
+                    real(dp), intent(out) :: s,sep
+                    logical(lk), intent(in) :: select(*)
+                    complex(dp), intent(inout) :: q(ldq,*),t(ldt,*)
+                    complex(dp), intent(out) :: w(*),work(*)
+               end subroutine ztrsen
+#else 
+               module procedure stdlib_ztrsen
+#endif
+          end interface trsen
+
+
+
+          !> TRSNA: estimates reciprocal condition numbers for specified
+          !> eigenvalues and/or right eigenvectors of a complex upper triangular
+          !> matrix T (or of any matrix Q*T*Q**H with Q unitary).
+          interface trsna
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, &
+                         mm, m, work, ldwork, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,ldwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: rwork(*),s(*),sep(*)
+                    complex(sp), intent(in) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(sp), intent(out) :: work(ldwork,*)
+               end subroutine ctrsna
+#else 
+               module procedure stdlib_ctrsna
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm, &
+                         m, work, ldwork, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,ldwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: s(*),sep(*),work(ldwork,*)
+                    real(dp), intent(in) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+               end subroutine dtrsna
+#else 
+               module procedure stdlib_dtrsna
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrsna
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine strsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm, &
+                         m, work, ldwork, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m,iwork(*)
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,ldwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(sp), intent(out) :: s(*),sep(*),work(ldwork,*)
+                    real(sp), intent(in) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+               end subroutine strsna
+#else 
+               module procedure stdlib_strsna
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrsna
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, &
+                         mm, m, work, ldwork, rwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: howmny,job
+                    integer(ilp), intent(out) :: info,m
+                    integer(ilp), intent(in) :: ldt,ldvl,ldvr,ldwork,mm,n
+                    logical(lk), intent(in) :: select(*)
+                    real(dp), intent(out) :: rwork(*),s(*),sep(*)
+                    complex(dp), intent(in) :: t(ldt,*),vl(ldvl,*),vr(ldvr,*)
+                    complex(dp), intent(out) :: work(ldwork,*)
+               end subroutine ztrsna
+#else 
+               module procedure stdlib_ztrsna
+#endif
+          end interface trsna
+
+
+
+          !> TRSYL: solves the complex Sylvester matrix equation:
+          !> op(A)*X + X*op(B) = scale*C or
+          !> op(A)*X - X*op(B) = scale*C,
+          !> where op(A) = A or A**H, and A and B are both upper triangular. A is
+          !> M-by-M and B is N-by-N; the right hand side C and the solution X are
+          !> M-by-N; and scale is an output scale factor, set <= 1 to avoid
+          !> overflow in X.
+          interface trsyl
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ctrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trana,tranb
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: isgn,lda,ldb,ldc,m,n
+                    real(sp), intent(out) :: scale
+                    complex(sp), intent(in) :: a(lda,*),b(ldb,*)
+                    complex(sp), intent(inout) :: c(ldc,*)
+               end subroutine ctrsyl
+#else 
+               module procedure stdlib_ctrsyl
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine dtrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trana,tranb
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: isgn,lda,ldb,ldc,m,n
+                    real(dp), intent(out) :: scale
+                    real(dp), intent(in) :: a(lda,*),b(ldb,*)
+                    real(dp), intent(inout) :: c(ldc,*)
+               end subroutine dtrsyl
+#else 
+               module procedure stdlib_dtrsyl
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrsyl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine strsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trana,tranb
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: isgn,lda,ldb,ldc,m,n
+                    real(sp), intent(out) :: scale
+                    real(sp), intent(in) :: a(lda,*),b(ldb,*)
+                    real(sp), intent(inout) :: c(ldc,*)
+               end subroutine strsyl
+#else 
+               module procedure stdlib_strsyl
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrsyl
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine ztrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: trana,tranb
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: isgn,lda,ldb,ldc,m,n
+                    real(dp), intent(out) :: scale
+                    complex(dp), intent(in) :: a(lda,*),b(ldb,*)
+                    complex(dp), intent(inout) :: c(ldc,*)
+               end subroutine ztrsyl
+#else 
+               module procedure stdlib_ztrsyl
+#endif
+          end interface trsyl
+
+
+
+          !> TRTRI: computes the inverse of a complex upper or lower triangular
+          !> matrix A.
+          !> This is the Level 3 BLAS version of the algorithm.
+          interface trtri
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrtri( uplo, diag, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(sp), intent(inout) :: a(lda,*)
+               end subroutine ctrtri
+#else 
+               module procedure stdlib_ctrtri
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrtri( uplo, diag, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(dp), intent(inout) :: a(lda,*)
+               end subroutine dtrtri
+#else 
+               module procedure stdlib_dtrtri
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrtri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strtri( uplo, diag, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    real(sp), intent(inout) :: a(lda,*)
+               end subroutine strtri
+#else 
+               module procedure stdlib_strtri
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrtri
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrtri( uplo, diag, n, a, lda, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,n
+                    complex(dp), intent(inout) :: a(lda,*)
+               end subroutine ztrtri
+#else 
+               module procedure stdlib_ztrtri
+#endif
+          end interface trtri
+
+
+
+          !> TRTRS: solves a triangular system of the form
+          !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+          !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+          !> matrix.  A check is made to verify that A is nonsingular.
+          interface trtrs
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(inout) :: b(ldb,*)
+               end subroutine ctrtrs
+#else 
+               module procedure stdlib_ctrtrs
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(inout) :: b(ldb,*)
+               end subroutine dtrtrs
+#else 
+               module procedure stdlib_dtrtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(inout) :: b(ldb,*)
+               end subroutine strtrs
+#else 
+               module procedure stdlib_strtrs
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrtrs
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: diag,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldb,n,nrhs
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(inout) :: b(ldb,*)
+               end subroutine ztrtrs
+#else 
+               module procedure stdlib_ztrtrs
+#endif
+          end interface trtrs
+
+
+
+          !> TRTTF: copies a triangular matrix A from standard full format (TR)
+          !> to rectangular full packed format (TF) .
+          interface trttf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrttf( transr, uplo, n, a, lda, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(sp), intent(in) :: a(0:lda-1,0:*)
+                    complex(sp), intent(out) :: arf(0:*)
+               end subroutine ctrttf
+#else 
+               module procedure stdlib_ctrttf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrttf( transr, uplo, n, a, lda, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(dp), intent(in) :: a(0:lda-1,0:*)
+                    real(dp), intent(out) :: arf(0:*)
+               end subroutine dtrttf
+#else 
+               module procedure stdlib_dtrttf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrttf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strttf( transr, uplo, n, a, lda, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(sp), intent(in) :: a(0:lda-1,0:*)
+                    real(sp), intent(out) :: arf(0:*)
+               end subroutine strttf
+#else 
+               module procedure stdlib_strttf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrttf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrttf( transr, uplo, n, a, lda, arf, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: transr,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(dp), intent(in) :: a(0:lda-1,0:*)
+                    complex(dp), intent(out) :: arf(0:*)
+               end subroutine ztrttf
+#else 
+               module procedure stdlib_ztrttf
+#endif
+          end interface trttf
+
+
+
+          !> TRTTP: copies a triangular matrix A from full format (TR) to standard
+          !> packed format (TP).
+          interface trttp
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctrttp( uplo, n, a, lda, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(sp), intent(in) :: a(lda,*)
+                    complex(sp), intent(out) :: ap(*)
+               end subroutine ctrttp
+#else 
+               module procedure stdlib_ctrttp
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtrttp( uplo, n, a, lda, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(dp), intent(in) :: a(lda,*)
+                    real(dp), intent(out) :: ap(*)
+               end subroutine dtrttp
+#else 
+               module procedure stdlib_dtrttp
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtrttp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine strttp( uplo, n, a, lda, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    real(sp), intent(in) :: a(lda,*)
+                    real(sp), intent(out) :: ap(*)
+               end subroutine strttp
+#else 
+               module procedure stdlib_strttp
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtrttp
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztrttp( uplo, n, a, lda, ap, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: n,lda
+                    complex(dp), intent(in) :: a(lda,*)
+                    complex(dp), intent(out) :: ap(*)
+               end subroutine ztrttp
+#else 
+               module procedure stdlib_ztrttp
+#endif
+          end interface trttp
+
+
+
+          !> TZRZF: reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix A
+          !> to upper triangular form by means of unitary transformations.
+          !> The upper trapezoidal matrix A is factored as
+          !> A = ( R  0 ) * Z,
+          !> where Z is an N-by-N unitary matrix and R is an M-by-M upper
+          !> triangular matrix.
+          interface tzrzf
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ctzrzf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: tau(*),work(*)
+               end subroutine ctzrzf
+#else 
+               module procedure stdlib_ctzrzf
+#endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine dtzrzf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(dp), intent(inout) :: a(lda,*)
+                    real(dp), intent(out) :: tau(*),work(*)
+               end subroutine dtzrzf
+#else 
+               module procedure stdlib_dtzrzf
+#endif
+#:if WITH_QP
+               module procedure stdlib_qtzrzf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine stzrzf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    real(sp), intent(inout) :: a(lda,*)
+                    real(sp), intent(out) :: tau(*),work(*)
+               end subroutine stzrzf
+#else 
+               module procedure stdlib_stzrzf
+#endif
+#:if WITH_QP
+               module procedure stdlib_wtzrzf
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine ztzrzf( m, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: tau(*),work(*)
+               end subroutine ztzrzf
+#else 
+               module procedure stdlib_ztzrzf
+#endif
+          end interface tzrzf
+
+
+
+          !> UNBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+          !> partitioned unitary matrix X:
+          !> [ B11 | B12 0  0 ]
+          !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
+          !> X = [-----------] = [---------] [----------------] [---------]   .
+          !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+          !> [  0  |  0  0  I ]
+          !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+          !> not the case, then X must be transposed and/or permuted. This can be
+          !> done in constant time using the TRANS and SIGNS options. See CUNCSD
+          !> for details.)
+          !> The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+          !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+          !> represented implicitly by Householder vectors.
+          !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface unbdb
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                         ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: signs,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldx11,ldx12,ldx21,ldx22,lwork,m,p,q
+                    real(sp), intent(out) :: phi(*),theta(*)
+                    complex(sp), intent(out) :: taup1(*),taup2(*),tauq1(*),tauq2(*),work(*)
+                              
+                    complex(sp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(&
+                              ldx22,*)
+               end subroutine cunbdb
+#else 
+               module procedure stdlib_cunbdb
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                         ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: signs,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldx11,ldx12,ldx21,ldx22,lwork,m,p,q
+                    real(dp), intent(out) :: phi(*),theta(*)
+                    complex(dp), intent(out) :: taup1(*),taup2(*),tauq1(*),tauq2(*),work(*)
+                              
+                    complex(dp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(&
+                              ldx22,*)
+               end subroutine zunbdb
+#else 
+               module procedure stdlib_zunbdb
+#endif
+          end interface unbdb
+
+
+
+          !> UNBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+          !> M-P, or M-Q. Routines CUNBDB2, CUNBDB3, and CUNBDB4 handle cases in
+          !> which Q is not the minimum dimension.
+          !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+          !> angles THETA, PHI.
+          interface unbdb1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*)
+                    complex(sp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine cunbdb1
+#else 
+               module procedure stdlib_cunbdb1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*)
+                    complex(dp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine zunbdb1
+#else 
+               module procedure stdlib_zunbdb1
+#endif
+          end interface unbdb1
+
+
+
+          !> UNBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+          !> Q, or M-Q. Routines CUNBDB1, CUNBDB3, and CUNBDB4 handle cases in
+          !> which P is not the minimum dimension.
+          !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+          !> angles THETA, PHI.
+          interface unbdb2
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*)
+                    complex(sp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine cunbdb2
+#else 
+               module procedure stdlib_cunbdb2
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb2
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*)
+                    complex(dp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine zunbdb2
+#else 
+               module procedure stdlib_zunbdb2
+#endif
+          end interface unbdb2
+
+
+
+          !> UNBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+          !> Q, or M-Q. Routines CUNBDB1, CUNBDB2, and CUNBDB4 handle cases in
+          !> which M-P is not the minimum dimension.
+          !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface unbdb3
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*)
+                    complex(sp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine cunbdb3
+#else 
+               module procedure stdlib_cunbdb3
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb3
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*)
+                    complex(dp), intent(out) :: taup1(*),taup2(*),tauq1(*),work(*)
+                    complex(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine zunbdb3
+#else 
+               module procedure stdlib_zunbdb3
+#endif
+          end interface unbdb3
+
+
+
+          !> UNBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+          !> matrix X with orthonomal columns:
+          !> [ B11 ]
+          !> [ X11 ]   [ P1 |    ] [  0  ]
+          !> [-----] = [---------] [-----] Q1**T .
+          !> [ X21 ]   [    | P2 ] [ B21 ]
+          !> [  0  ]
+          !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+          !> M-P, or Q. Routines CUNBDB1, CUNBDB2, and CUNBDB3 handle cases in
+          !> which M-Q is not the minimum dimension.
+          !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+          !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+          !> Householder vectors.
+          !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+          !> implicitly by angles THETA, PHI.
+          interface unbdb4
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, phantom, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(sp), intent(out) :: phi(*),theta(*)
+                    complex(sp), intent(out) :: phantom(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    complex(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine cunbdb4
+#else 
+               module procedure stdlib_cunbdb4
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb4
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, &
+                         tauq1, phantom, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lwork,m,p,q,ldx11,ldx21
+                    real(dp), intent(out) :: phi(*),theta(*)
+                    complex(dp), intent(out) :: phantom(*),taup1(*),taup2(*),tauq1(*),work(*)
+                              
+                    complex(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine zunbdb4
+#else 
+               module procedure stdlib_zunbdb4
+#endif
+          end interface unbdb4
+
+
+
+          !> UNBDB5: orthogonalizes the column vector
+          !> X = [ X1 ]
+          !> [ X2 ]
+          !> with respect to the columns of
+          !> Q = [ Q1 ] .
+          !> [ Q2 ]
+          !> The columns of Q must be orthonormal.
+          !> If the projection is zero according to Kahan's "twice is enough"
+          !> criterion, then some other vector from the orthogonal complement
+          !> is returned. This vector is chosen in an arbitrary but deterministic
+          !> way.
+          interface unbdb5
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x1(*),x2(*)
+               end subroutine cunbdb5
+#else 
+               module procedure stdlib_cunbdb5
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb5
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x1(*),x2(*)
+               end subroutine zunbdb5
+#else 
+               module procedure stdlib_zunbdb5
+#endif
+          end interface unbdb5
+
+
+
+          !> UNBDB6: orthogonalizes the column vector
+          !> X = [ X1 ]
+          !> [ X2 ]
+          !> with respect to the columns of
+          !> Q = [ Q1 ] .
+          !> [ Q2 ]
+          !> The columns of Q must be orthonormal.
+          !> If the projection is zero according to Kahan's "twice is enough"
+          !> criterion, then the zero vector is returned.
+          interface unbdb6
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    complex(sp), intent(out) :: work(*)
+                    complex(sp), intent(inout) :: x1(*),x2(*)
+               end subroutine cunbdb6
+#else 
+               module procedure stdlib_cunbdb6
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunbdb6
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: incx1,incx2,ldq1,ldq2,lwork,m1,m2,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(in) :: q1(ldq1,*),q2(ldq2,*)
+                    complex(dp), intent(out) :: work(*)
+                    complex(dp), intent(inout) :: x1(*),x2(*)
+               end subroutine zunbdb6
+#else 
+               module procedure stdlib_zunbdb6
+#endif
+          end interface unbdb6
+
+
+
+          !> UNCSD: computes the CS decomposition of an M-by-M partitioned
+          !> unitary matrix X:
+          !> [  I  0  0 |  0  0  0 ]
+          !> [  0  C  0 |  0 -S  0 ]
+          !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
+          !> X = [-----------] = [---------] [---------------------] [---------]   .
+          !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+          !> [  0  S  0 |  0  C  0 ]
+          !> [  0  0  I |  0  0  0 ]
+          !> X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
+          !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+          !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+          !> which R = MIN(P,M-P,Q,M-Q).
+          interface uncsd
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine cuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, &
+               x11, ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, &
+                         ldv1t, v2t,ldv2t, work, lwork, rwork, lrwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,signs,trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,ldx11,ldx12,ldx21,ldx22,&
+                              lrwork,lwork,m,p,q
+                    real(sp), intent(out) :: theta(*),rwork(*)
+                    complex(sp), intent(out) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(ldv2t,*),&
+                              work(*)
+                    complex(sp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(&
+                              ldx22,*)
+               end subroutine cuncsd
+#else 
+               module procedure stdlib_cuncsd
+#endif
+#:if WITH_QP
+               module procedure stdlib_wuncsd
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               recursive subroutine zuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, &
+               x11, ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, &
+                         ldv1t, v2t,ldv2t, work, lwork, rwork, lrwork,iwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t,jobv2t,signs,trans
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,ldv2t,ldx11,ldx12,ldx21,ldx22,&
+                              lrwork,lwork,m,p,q
+                    real(dp), intent(out) :: theta(*),rwork(*)
+                    complex(dp), intent(out) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),v2t(ldv2t,*),&
+                              work(*)
+                    complex(dp), intent(inout) :: x11(ldx11,*),x12(ldx12,*),x21(ldx21,*),x22(&
+                              ldx22,*)
+               end subroutine zuncsd
+#else 
+               module procedure stdlib_zuncsd
+#endif
+          end interface uncsd
+
+
+
+          !> UNCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+          !> orthonormal columns that has been partitioned into a 2-by-1 block
+          !> structure:
+          !> [  I1 0  0 ]
+          !> [  0  C  0 ]
+          !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+          !> X = [-----] = [---------] [----------] V1**T .
+          !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+          !> [  0  S  0 ]
+          !> [  0  0  I2]
+          !> X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
+          !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+          !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+          !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+          !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+          interface uncsd2by1
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine cuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta,&
+                          u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,lwork,ldx11,ldx21,m,p,q,&
+                              lrwork
+                    real(sp), intent(out) :: rwork(*),theta(*)
+                    complex(sp), intent(out) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),work(*)
+                    complex(sp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine cuncsd2by1
+#else 
+               module procedure stdlib_cuncsd2by1
+#endif
+#:if WITH_QP
+               module procedure stdlib_wuncsd2by1
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               subroutine zuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta,&
+                          u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: jobu1,jobu2,jobv1t
+                    integer(ilp), intent(out) :: info,iwork(*)
+                    integer(ilp), intent(in) :: ldu1,ldu2,ldv1t,lwork,ldx11,ldx21,m,p,q,&
+                              lrwork
+                    real(dp), intent(out) :: rwork(*),theta(*)
+                    complex(dp), intent(out) :: u1(ldu1,*),u2(ldu2,*),v1t(ldv1t,*),work(*)
+                    complex(dp), intent(inout) :: x11(ldx11,*),x21(ldx21,*)
+               end subroutine zuncsd2by1
+#else 
+               module procedure stdlib_zuncsd2by1
+#endif
+          end interface uncsd2by1
+
+
+
+          !> UNG2L: generates an m by n complex matrix Q with orthonormal columns,
+          !> which is defined as the last n columns of a product of k elementary
+          !> reflectors of order m
+          !> Q  =  H(k) . . . H(2) H(1)
+          !> as returned by CGEQLF.
+          interface ung2l
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cung2l( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cung2l
+#else 
+               module procedure stdlib_cung2l
+#endif
+#:if WITH_QP
+               module procedure stdlib_wung2l
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zung2l( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zung2l
+#else 
+               module procedure stdlib_zung2l
+#endif
+          end interface ung2l
+
+
+
+          !> UNG2R: generates an m by n complex matrix Q with orthonormal columns,
+          !> which is defined as the first n columns of a product of k elementary
+          !> reflectors of order m
+          !> Q  =  H(1) H(2) . . . H(k)
+          !> as returned by CGEQRF.
+          interface ung2r
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cung2r( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cung2r
+#else 
+               module procedure stdlib_cung2r
+#endif
+#:if WITH_QP
+               module procedure stdlib_wung2r
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zung2r( m, n, k, a, lda, tau, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zung2r
+#else 
+               module procedure stdlib_zung2r
+#endif
+          end interface ung2r
+
+
+
+          !> UNGBR: generates one of the complex unitary matrices Q or P**H
+          !> determined by CGEBRD when reducing a complex matrix A to bidiagonal
+          !> form: A = Q * B * P**H.  Q and P**H are defined as products of
+          !> elementary reflectors H(i) or G(i) respectively.
+          !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+          !> is of order M:
+          !> if m >= k, Q = H(1) H(2) . . . H(k) and UNGBR returns the first n
+          !> columns of Q, where m >= n >= k;
+          !> if m < k, Q = H(1) H(2) . . . H(m-1) and UNGBR returns Q as an
+          !> M-by-M matrix.
+          !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+          !> is of order N:
+          !> if k < n, P**H = G(k) . . . G(2) G(1) and UNGBR returns the first m
+          !> rows of P**H, where n >= m >= k;
+          !> if k >= n, P**H = G(n-1) . . . G(2) G(1) and UNGBR returns P**H as
+          !> an N-by-N matrix.
+          interface ungbr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungbr
+#else 
+               module procedure stdlib_cungbr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungbr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungbr
+#else 
+               module procedure stdlib_zungbr
+#endif
+          end interface ungbr
+
+
+
+          !> UNGHR: generates a complex unitary matrix Q which is defined as the
+          !> product of IHI-ILO elementary reflectors of order N, as returned by
+          !> CGEHRD:
+          !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+          interface unghr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunghr
+#else 
+               module procedure stdlib_cunghr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunghr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(in) :: ihi,ilo,lda,lwork,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunghr
+#else 
+               module procedure stdlib_zunghr
+#endif
+          end interface unghr
+
+
+
+          !> UNGLQ: generates an M-by-N complex matrix Q with orthonormal rows,
+          !> which is defined as the first M rows of a product of K elementary
+          !> reflectors of order N
+          !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+          !> as returned by CGELQF.
+          interface unglq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunglq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunglq
+#else 
+               module procedure stdlib_cunglq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunglq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunglq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunglq
+#else 
+               module procedure stdlib_zunglq
+#endif
+          end interface unglq
+
+
+
+          !> UNGQL: generates an M-by-N complex matrix Q with orthonormal columns,
+          !> which is defined as the last N columns of a product of K elementary
+          !> reflectors of order M
+          !> Q  =  H(k) . . . H(2) H(1)
+          !> as returned by CGEQLF.
+          interface ungql
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungql( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungql
+#else 
+               module procedure stdlib_cungql
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungql
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungql( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungql
+#else 
+               module procedure stdlib_zungql
+#endif
+          end interface ungql
+
+
+
+          !> UNGQR: generates an M-by-N complex matrix Q with orthonormal columns,
+          !> which is defined as the first N columns of a product of K elementary
+          !> reflectors of order M
+          !> Q  =  H(1) H(2) . . . H(k)
+          !> as returned by CGEQRF.
+          interface ungqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungqr( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungqr
+#else 
+               module procedure stdlib_cungqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungqr( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungqr
+#else 
+               module procedure stdlib_zungqr
+#endif
+          end interface ungqr
+
+
+
+          !> UNGRQ: generates an M-by-N complex matrix Q with orthonormal rows,
+          !> which is defined as the last M rows of a product of K elementary
+          !> reflectors of order N
+          !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+          !> as returned by CGERQF.
+          interface ungrq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungrq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungrq
+#else 
+               module procedure stdlib_cungrq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungrq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungrq( m, n, k, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungrq
+#else 
+               module procedure stdlib_zungrq
+#endif
+          end interface ungrq
+
+
+
+          !> UNGTR: generates a complex unitary matrix Q which is defined as the
+          !> product of n-1 elementary reflectors of order N, as returned by
+          !> CHETRD:
+          !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+          interface ungtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungtr( uplo, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungtr
+#else 
+               module procedure stdlib_cungtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungtr( uplo, n, a, lda, tau, work, lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,lwork,n
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungtr
+#else 
+               module procedure stdlib_zungtr
+#endif
+          end interface ungtr
+
+
+
+          !> UNGTSQR: generates an M-by-N complex matrix Q_out with orthonormal
+          !> columns, which are the first N columns of a product of comlpex unitary
+          !> matrices of order M which are returned by CLATSQR
+          !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+          !> See the documentation for CLATSQR.
+          interface ungtsqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: t(ldt,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungtsqr
+#else 
+               module procedure stdlib_cungtsqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungtsqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: t(ldt,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungtsqr
+#else 
+               module procedure stdlib_zungtsqr
+#endif
+          end interface ungtsqr
+
+
+
+          !> UNGTSQR_ROW: generates an M-by-N complex matrix Q_out with
+          !> orthonormal columns from the output of CLATSQR. These N orthonormal
+          !> columns are the first N columns of a product of complex unitary
+          !> matrices Q(k)_in of order M, which are returned by CLATSQR in
+          !> a special format.
+          !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+          !> The input matrices Q(k)_in are stored in row and column blocks in A.
+          !> See the documentation of CLATSQR for more details on the format of
+          !> Q(k)_in, where each Q(k)_in is represented by block Householder
+          !> transformations. This routine calls an auxiliary routine CLARFB_GETT,
+          !> where the computation is performed on each individual block. The
+          !> algorithm first sweeps NB-sized column blocks from the right to left
+          !> starting in the bottom row block and continues to the top row block
+          !> (hence _ROW in the routine name). This sweep is in reverse order of
+          !> the order in which CLATSQR generates the output blocks.
+          interface ungtsqr_row
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cungtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(in) :: t(ldt,*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cungtsqr_row
+#else 
+               module procedure stdlib_cungtsqr_row
+#endif
+#:if WITH_QP
+               module procedure stdlib_wungtsqr_row
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zungtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,lwork,m,n,mb,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(in) :: t(ldt,*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zungtsqr_row
+#else 
+               module procedure stdlib_zungtsqr_row
+#endif
+          end interface ungtsqr_row
+
+
+
+          !> UNHR_COL: takes an M-by-N complex matrix Q_in with orthonormal columns
+          !> as input, stored in A, and performs Householder Reconstruction (HR),
+          !> i.e. reconstructs Householder vectors V(i) implicitly representing
+          !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+          !> where S is an N-by-N diagonal matrix with diagonal entries
+          !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+          !> stored in A on output, and the diagonal entries of S are stored in D.
+          !> Block reflectors are also returned in T
+          !> (same output format as CGEQRT).
+          interface unhr_col
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunhr_col( m, n, nb, a, lda, t, ldt, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    complex(sp), intent(inout) :: a(lda,*)
+                    complex(sp), intent(out) :: d(*),t(ldt,*)
+               end subroutine cunhr_col
+#else 
+               module procedure stdlib_cunhr_col
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunhr_col
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunhr_col( m, n, nb, a, lda, t, ldt, d, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldt,m,n,nb
+                    complex(dp), intent(inout) :: a(lda,*)
+                    complex(dp), intent(out) :: d(*),t(ldt,*)
+               end subroutine zunhr_col
+#else 
+               module procedure stdlib_zunhr_col
+#endif
+          end interface unhr_col
+
+
+
+          !> UNM2L: overwrites the general complex m-by-n matrix C with
+          !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+          !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+          !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+          !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k) . . . H(2) H(1)
+          !> as returned by CGEQLF. Q is of order m if SIDE = 'L' and of order n
+          !> if SIDE = 'R'.
+          interface unm2l
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunm2l
+#else 
+               module procedure stdlib_cunm2l
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunm2l
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunm2l
+#else 
+               module procedure stdlib_zunm2l
+#endif
+          end interface unm2l
+
+
+
+          !> UNM2R: overwrites the general complex m-by-n matrix C with
+          !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+          !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+          !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+          !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by CGEQRF. Q is of order m if SIDE = 'L' and of order n
+          !> if SIDE = 'R'.
+          interface unm2r
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunm2r
+#else 
+               module procedure stdlib_cunm2r
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunm2r
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunm2r
+#else 
+               module procedure stdlib_zunm2r
+#endif
+          end interface unm2r
+
+
+
+          !> If VECT = 'Q', UNMBR: overwrites the general complex M-by-N matrix C
+          !> with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> If VECT = 'P', UNMBR overwrites the general complex M-by-N matrix C
+          !> with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      P * C          C * P
+          !> TRANS = 'C':      P**H * C       C * P**H
+          !> Here Q and P**H are the unitary matrices determined by CGEBRD when
+          !> reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+          !> and P**H are defined as products of elementary reflectors H(i) and
+          !> G(i) respectively.
+          !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+          !> order of the unitary matrix Q or P**H that is applied.
+          !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+          !> if nq >= k, Q = H(1) H(2) . . . H(k);
+          !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+          !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+          !> if k < nq, P = G(1) G(2) . . . G(k);
+          !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+          interface unmbr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmbr
+#else 
+               module procedure stdlib_cunmbr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmbr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,vect
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmbr
+#else 
+               module procedure stdlib_zunmbr
+#endif
+          end interface unmbr
+
+
+
+          !> UNMHR: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> IHI-ILO elementary reflectors, as returned by CGEHRD:
+          !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+          interface unmhr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldc,lwork,m,n
+                    integer(ilp), intent(out) :: info
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmhr
+#else 
+               module procedure stdlib_cunmhr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmhr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, &
+                         lwork, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(in) :: ihi,ilo,lda,ldc,lwork,m,n
+                    integer(ilp), intent(out) :: info
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmhr
+#else 
+               module procedure stdlib_zunmhr
+#endif
+          end interface unmhr
+
+
+
+          !> UNMLQ: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k)**H . . . H(2)**H H(1)**H
+          !> as returned by CGELQF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface unmlq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmlq
+#else 
+               module procedure stdlib_cunmlq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmlq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmlq
+#else 
+               module procedure stdlib_zunmlq
+#endif
+          end interface unmlq
+
+
+
+          !> UNMQL: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(k) . . . H(2) H(1)
+          !> as returned by CGEQLF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface unmql
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmql
+#else 
+               module procedure stdlib_cunmql
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmql
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmql
+#else 
+               module procedure stdlib_zunmql
+#endif
+          end interface unmql
+
+
+
+          !> UNMQR: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface unmqr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmqr
+#else 
+               module procedure stdlib_cunmqr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmqr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmqr
+#else 
+               module procedure stdlib_zunmqr
+#endif
+          end interface unmqr
+
+
+
+          !> UNMRQ: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1)**H H(2)**H . . . H(k)**H
+          !> as returned by CGERQF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface unmrq
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmrq
+#else 
+               module procedure stdlib_cunmrq
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmrq
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmrq
+#else 
+               module procedure stdlib_zunmrq
+#endif
+          end interface unmrq
+
+
+
+          !> UNMRZ: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix defined as the product of k
+          !> elementary reflectors
+          !> Q = H(1) H(2) . . . H(k)
+          !> as returned by CTZRZF. Q is of order M if SIDE = 'L' and of order N
+          !> if SIDE = 'R'.
+          interface unmrz
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,l,lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmrz
+#else 
+               module procedure stdlib_cunmrz
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmrz
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: k,l,lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmrz
+#else 
+               module procedure stdlib_zunmrz
+#endif
+          end interface unmrz
+
+
+
+          !> UNMTR: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> nq-1 elementary reflectors, as returned by CHETRD:
+          !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+          interface unmtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cunmtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldc,lwork,m,n
+                    complex(sp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cunmtr
+#else 
+               module procedure stdlib_cunmtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wunmtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zunmtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+                         info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: lda,ldc,lwork,m,n
+                    complex(dp), intent(inout) :: a(lda,*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zunmtr
+#else 
+               module procedure stdlib_zunmtr
+#endif
+          end interface unmtr
+
+
+
+          !> UPGTR: generates a complex unitary matrix Q which is defined as the
+          !> product of n-1 elementary reflectors H(i) of order n, as returned by
+          !> CHPTRD using packed storage:
+          !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+          interface upgtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cupgtr( uplo, n, ap, tau, q, ldq, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,n
+                    complex(sp), intent(in) :: ap(*),tau(*)
+                    complex(sp), intent(out) :: q(ldq,*),work(*)
+               end subroutine cupgtr
+#else 
+               module procedure stdlib_cupgtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wupgtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zupgtr( uplo, n, ap, tau, q, ldq, work, info )
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldq,n
+                    complex(dp), intent(in) :: ap(*),tau(*)
+                    complex(dp), intent(out) :: q(ldq,*),work(*)
+               end subroutine zupgtr
+#else 
+               module procedure stdlib_zupgtr
+#endif
+          end interface upgtr
+
+
+
+          !> UPMTR: overwrites the general complex M-by-N matrix C with
+          !> SIDE = 'L'     SIDE = 'R'
+          !> TRANS = 'N':      Q * C          C * Q
+          !> TRANS = 'C':      Q**H * C       C * Q**H
+          !> where Q is a complex unitary matrix of order nq, with nq = m if
+          !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+          !> nq-1 elementary reflectors, as returned by CHPTRD using packed
+          !> storage:
+          !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+          !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+          interface upmtr
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine cupmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,m,n
+                    complex(sp), intent(inout) :: ap(*),c(ldc,*)
+                    complex(sp), intent(in) :: tau(*)
+                    complex(sp), intent(out) :: work(*)
+               end subroutine cupmtr
+#else 
+               module procedure stdlib_cupmtr
+#endif
+#:if WITH_QP
+               module procedure stdlib_wupmtr
+#:endif
+#ifdef STDLIB_EXTERNAL_LAPACK
+               pure subroutine zupmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+                         
+                    import sp,dp,qp,ilp,lk 
+                    implicit none(type,external) 
+                    character, intent(in) :: side,trans,uplo
+                    integer(ilp), intent(out) :: info
+                    integer(ilp), intent(in) :: ldc,m,n
+                    complex(dp), intent(inout) :: ap(*),c(ldc,*)
+                    complex(dp), intent(in) :: tau(*)
+                    complex(dp), intent(out) :: work(*)
+               end subroutine zupmtr
+#else 
+               module procedure stdlib_zupmtr
+#endif
+          end interface upmtr
+
+
+
+
+
+end module stdlib_linalg_lapack
diff --git a/src/stdlib_linalg_lapack_aux.fypp b/src/stdlib_linalg_lapack_aux.fypp
new file mode 100644
index 000000000..f01fb9df8
--- /dev/null
+++ b/src/stdlib_linalg_lapack_aux.fypp
@@ -0,0 +1,1961 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack_aux
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_chla_transtype
+     public :: stdlib_droundup_lwork
+     public :: stdlib_icmax1
+     public :: stdlib_ieeeck
+     public :: stdlib_ilaclc
+     public :: stdlib_ilaclr
+     public :: stdlib_iladiag
+     public :: stdlib_iladlc
+     public :: stdlib_iladlr
+     public :: stdlib_ilaenv
+     public :: stdlib_ilaenv2stage
+     public :: stdlib_ilaprec
+     public :: stdlib_ilaslc
+     public :: stdlib_ilaslr
+     public :: stdlib_ilatrans
+     public :: stdlib_ilauplo
+     public :: stdlib_ilazlc
+     public :: stdlib_ilazlr
+     public :: stdlib_iparam2stage
+     public :: stdlib_iparmq
+     public :: stdlib_izmax1
+     public :: stdlib_lsamen
+     public :: stdlib_sroundup_lwork
+     public :: stdlib_xerbla
+     public :: stdlib_xerbla_array
+#:if WITH_QP 
+     public :: stdlib_qroundup_lwork
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_ilaqiag
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_ilaqlc
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_ilaqlr
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_ilawlc
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_ilawlr
+#:endif 
+#:if WITH_QP 
+     public :: stdlib_iwmax1
+#:endif 
+     public :: stdlib_selctg_s
+     public :: stdlib_select_s
+     public :: stdlib_selctg_d
+     public :: stdlib_select_d
+#:if WITH_QP
+     public :: stdlib_selctg_q
+     public :: stdlib_select_q
+#:endif
+     public :: stdlib_selctg_c
+     public :: stdlib_select_c
+     public :: stdlib_selctg_z
+     public :: stdlib_select_z
+#:if WITH_QP
+     public :: stdlib_selctg_w
+     public :: stdlib_select_w
+#:endif
+     ! SELCTG is a LOGICAL FUNCTION of three DOUBLE PRECISION arguments 
+     ! used to select eigenvalues to sort to the top left of the Schur form. 
+     ! An eigenvalue (ALPHAR(j)+ALPHAI(j))/BETA(j) is selected if SELCTG is true, i.e., 
+     abstract interface 
+        pure logical(lk) function stdlib_selctg_s(alphar,alphai,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(sp), intent(in) :: alphar,alphai,beta 
+        end function stdlib_selctg_s 
+        pure logical(lk) function stdlib_select_s(alphar,alphai) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(sp), intent(in) :: alphar,alphai 
+        end function stdlib_select_s 
+        pure logical(lk) function stdlib_selctg_d(alphar,alphai,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(dp), intent(in) :: alphar,alphai,beta 
+        end function stdlib_selctg_d 
+        pure logical(lk) function stdlib_select_d(alphar,alphai) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(dp), intent(in) :: alphar,alphai 
+        end function stdlib_select_d
+#:if WITH_QP 
+        pure logical(lk) function stdlib_selctg_q(alphar,alphai,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(qp), intent(in) :: alphar,alphai,beta 
+        end function stdlib_selctg_q 
+        pure logical(lk) function stdlib_select_q(alphar,alphai) 
+            import sp,dp,qp,lk 
+            implicit none 
+            real(qp), intent(in) :: alphar,alphai 
+        end function stdlib_select_q 
+#:endif
+        pure logical(lk) function stdlib_selctg_c(alpha,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(sp), intent(in) :: alpha,beta 
+        end function stdlib_selctg_c 
+        pure logical(lk) function stdlib_select_c(alpha) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(sp), intent(in) :: alpha 
+        end function stdlib_select_c 
+        pure logical(lk) function stdlib_selctg_z(alpha,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(dp), intent(in) :: alpha,beta 
+        end function stdlib_selctg_z 
+        pure logical(lk) function stdlib_select_z(alpha) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(dp), intent(in) :: alpha 
+        end function stdlib_select_z 
+#:if WITH_QP
+        pure logical(lk) function stdlib_selctg_w(alpha,beta) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(qp), intent(in) :: alpha,beta 
+        end function stdlib_selctg_w 
+        pure logical(lk) function stdlib_select_w(alpha) 
+            import sp,dp,qp,lk 
+            implicit none 
+            complex(qp), intent(in) :: alpha 
+        end function stdlib_select_w
+#:endif 
+     end interface 
+
+
+
+     contains
+
+     !> This subroutine translates from a BLAST-specified integer constant to
+     !> the character string specifying a transposition operation.
+     !> CHLA_TRANSTYPE: returns an CHARACTER*1.  If CHLA_TRANSTYPE: is 'X',
+     !> then input is not an integer indicating a transposition operator.
+     !> Otherwise CHLA_TRANSTYPE returns the constant value corresponding to
+     !> TRANS.
+
+     pure character function stdlib_chla_transtype( trans )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: trans
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_no_trans = 111
+           integer(ilp), parameter :: blas_trans = 112
+           integer(ilp), parameter :: blas_conj_trans = 113
+           
+           ! Executable Statements 
+           if( trans==blas_no_trans ) then
+              stdlib_chla_transtype = 'N'
+           else if( trans==blas_trans ) then
+              stdlib_chla_transtype = 'T'
+           else if( trans==blas_conj_trans ) then
+              stdlib_chla_transtype = 'C'
+           else
+              stdlib_chla_transtype = 'X'
+           end if
+           return
+     end function stdlib_chla_transtype
+
+     !> DROUNDUP_LWORK: deals with a subtle bug with returning LWORK as a Float.
+     !> This routine guarantees it is rounded up instead of down by
+     !> multiplying LWORK by 1+eps when it is necessary, where eps is the relative machine precision.
+     !> E.g.,
+     !> float( 9007199254740993            ) == 9007199254740992
+     !> float( 9007199254740993 ) * (1.+eps) == 9007199254740994
+     !> \return DROUNDUP_LWORK
+     !>
+     !> DROUNDUP_LWORK >= LWORK.
+     !> DROUNDUP_LWORK is guaranteed to have zero decimal part.
+
+     pure real(dp) function stdlib_droundup_lwork( lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lwork
+       ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: epsilon,real,int
+           ! Executable Statements 
+           stdlib_droundup_lwork = real( lwork,KIND=dp)
+           if( int( stdlib_droundup_lwork,KIND=ilp) < lwork ) then
+               ! force round up of lwork
+               stdlib_droundup_lwork = stdlib_droundup_lwork * ( 1.0e+0_dp + epsilon(0.0e+0_dp) )
+                         
+           endif
+           return
+     end function stdlib_droundup_lwork
+
+     !> ICMAX1: finds the index of the first vector element of maximum absolute value.
+     !> Based on ICAMAX from Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure integer(ilp) function stdlib_icmax1( n, cx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: smax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_icmax1 = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_icmax1 = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              smax = abs(cx(1))
+              do i = 2,n
+                 if (abs(cx(i))>smax) then
+                    stdlib_icmax1 = i
+                    smax = abs(cx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              smax = abs(cx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(cx(ix))>smax) then
+                    stdlib_icmax1 = i
+                    smax = abs(cx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_icmax1
+
+     !> IEEECK: is called from the ILAENV to verify that Infinity and
+     !> possibly NaN arithmetic is safe (i.e. will not trap).
+
+     pure integer(ilp)          function stdlib_ieeeck( ispec, zero, one )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ispec
+           real(sp), intent(in) :: one, zero
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: nan1, nan2, nan3, nan4, nan5, nan6, neginf, negzro, newzro, posinf
+           ! Executable Statements 
+           stdlib_ieeeck = 1
+           posinf = one / zero
+           if( posinf<=one ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           neginf = -one / zero
+           if( neginf>=zero ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           negzro = one / ( neginf+one )
+           if( negzro/=zero ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           neginf = one / negzro
+           if( neginf>=zero ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           newzro = negzro + zero
+           if( newzro/=zero ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           posinf = one / newzro
+           if( posinf<=one ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           neginf = neginf*posinf
+           if( neginf>=zero ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           posinf = posinf*posinf
+           if( posinf<=one ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           ! return if we were only asked to check infinity arithmetic
+           if( ispec==0 )return
+           nan1 = posinf + neginf
+           nan2 = posinf / neginf
+           nan3 = posinf / posinf
+           nan4 = posinf*zero
+           nan5 = neginf*negzro
+           nan6 = nan5*zero
+           if( nan1==nan1 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           if( nan2==nan2 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           if( nan3==nan3 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           if( nan4==nan4 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           if( nan5==nan5 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           if( nan6==nan6 ) then
+              stdlib_ieeeck = 0
+              return
+           end if
+           return
+     end function stdlib_ieeeck
+
+     !> ILACLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_ilaclc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(sp), parameter :: zero = (0.0e+0,0.0e+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_ilaclc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaclc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_ilaclc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_ilaclc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_ilaclc
+
+     !> ILACLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_ilaclr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(sp), parameter :: zero = (0.0e+0,0.0e+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_ilaclr = m
+           else if( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaclr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_ilaclr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_ilaclr = max( stdlib_ilaclr, i )
+              end do
+           end if
+           return
+     end function stdlib_ilaclr
+
+     !> This subroutine translated from a character string specifying if a
+     !> matrix has unit diagonal or not to the relevant BLAST-specified
+     !> integer constant.
+     !> ILADIAG: returns an INTEGER.  If ILADIAG: < 0, then the input is not a
+     !> character indicating a unit or non-unit diagonal.  Otherwise ILADIAG
+     !> returns the constant value corresponding to DIAG.
+
+     integer(ilp) function stdlib_iladiag( diag )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character :: diag
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_non_unit_diag = 131
+           integer(ilp), parameter :: blas_unit_diag = 132
+           
+           ! Executable Statements 
+           if( stdlib_lsame( diag, 'N' ) ) then
+              stdlib_iladiag = blas_non_unit_diag
+           else if( stdlib_lsame( diag, 'U' ) ) then
+              stdlib_iladiag = blas_unit_diag
+           else
+              stdlib_iladiag = -1
+           end if
+           return
+     end function stdlib_iladiag
+
+     !> ILADLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_iladlc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: zero = 0.0d+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_iladlc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_iladlc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_iladlc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_iladlc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_iladlc
+
+     !> ILADLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_iladlr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: zero = 0.0d+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_iladlr = m
+           else if( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_iladlr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_iladlr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_iladlr = max( stdlib_iladlr, i )
+              end do
+           end if
+           return
+     end function stdlib_iladlr
+
+     !> This subroutine translated from a character string specifying an
+     !> intermediate precision to the relevant BLAST-specified integer
+     !> constant.
+     !> ILAPREC: returns an INTEGER.  If ILAPREC: < 0, then the input is not a
+     !> character indicating a supported intermediate precision.  Otherwise
+     !> ILAPREC returns the constant value corresponding to PREC.
+
+     integer(ilp) function stdlib_ilaprec( prec )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character :: prec
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_prec_single = 211
+           integer(ilp), parameter :: blas_prec_double = 212
+           integer(ilp), parameter :: blas_prec_indigenous = 213
+           integer(ilp), parameter :: blas_prec_extra = 214
+           
+           ! Executable Statements 
+           if( stdlib_lsame( prec, 'S' ) ) then
+              stdlib_ilaprec = blas_prec_single
+           else if( stdlib_lsame( prec, 'D' ) ) then
+              stdlib_ilaprec = blas_prec_double
+           else if( stdlib_lsame( prec, 'I' ) ) then
+              stdlib_ilaprec = blas_prec_indigenous
+           else if( stdlib_lsame( prec, 'X' ) .or. stdlib_lsame( prec, 'E' ) ) then
+              stdlib_ilaprec = blas_prec_extra
+           else
+              stdlib_ilaprec = -1
+           end if
+           return
+     end function stdlib_ilaprec
+
+     !> ILASLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_ilaslc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: zero = 0.0e+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_ilaslc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaslc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_ilaslc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_ilaslc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_ilaslc
+
+     !> ILASLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_ilaslr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: zero = 0.0e+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_ilaslr = m
+           elseif( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaslr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_ilaslr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_ilaslr = max( stdlib_ilaslr, i )
+              end do
+           end if
+           return
+     end function stdlib_ilaslr
+
+     !> This subroutine translates from a character string specifying a
+     !> transposition operation to the relevant BLAST-specified integer
+     !> constant.
+     !> ILATRANS: returns an INTEGER.  If ILATRANS: < 0, then the input is not
+     !> a character indicating a transposition operator.  Otherwise ILATRANS
+     !> returns the constant value corresponding to TRANS.
+
+     integer(ilp) function stdlib_ilatrans( trans )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character :: trans
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_no_trans = 111
+           integer(ilp), parameter :: blas_trans = 112
+           integer(ilp), parameter :: blas_conj_trans = 113
+           
+           ! Executable Statements 
+           if( stdlib_lsame( trans, 'N' ) ) then
+              stdlib_ilatrans = blas_no_trans
+           else if( stdlib_lsame( trans, 'T' ) ) then
+              stdlib_ilatrans = blas_trans
+           else if( stdlib_lsame( trans, 'C' ) ) then
+              stdlib_ilatrans = blas_conj_trans
+           else
+              stdlib_ilatrans = -1
+           end if
+           return
+     end function stdlib_ilatrans
+
+     !> This subroutine translated from a character string specifying a
+     !> upper- or lower-triangular matrix to the relevant BLAST-specified
+     !> integer constant.
+     !> ILAUPLO: returns an INTEGER.  If ILAUPLO: < 0, then the input is not
+     !> a character indicating an upper- or lower-triangular matrix.
+     !> Otherwise ILAUPLO returns the constant value corresponding to UPLO.
+
+     integer(ilp) function stdlib_ilauplo( uplo )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character :: uplo
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_upper = 121
+           integer(ilp), parameter :: blas_lower = 122
+           
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              stdlib_ilauplo = blas_upper
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              stdlib_ilauplo = blas_lower
+           else
+              stdlib_ilauplo = -1
+           end if
+           return
+     end function stdlib_ilauplo
+
+     !> ILAZLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_ilazlc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(dp), parameter :: zero = (0.0d+0,0.0d+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_ilazlc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilazlc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_ilazlc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_ilazlc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_ilazlc
+
+     !> ILAZLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_ilazlr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(dp), parameter :: zero = (0.0d+0,0.0d+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_ilazlr = m
+           else if( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilazlr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_ilazlr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_ilazlr = max( stdlib_ilazlr, i )
+              end do
+           end if
+           return
+     end function stdlib_ilazlr
+
+     !> This program sets problem and machine dependent parameters
+     !> useful for xHSEQR and related subroutines for eigenvalue
+     !> problems. It is called whenever
+     !> IPARMQ: is called with 12 <= ISPEC <= 16
+
+     pure integer(ilp) function stdlib_iparmq( ispec, name, opts, n, ilo, ihi, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ispec, lwork, n
+           character, intent(in) :: name*(*), opts*(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inmin = 12
+           integer(ilp), parameter :: inwin = 13
+           integer(ilp), parameter :: inibl = 14
+           integer(ilp), parameter :: ishfts = 15
+           integer(ilp), parameter :: iacc22 = 16
+           integer(ilp), parameter :: icost = 17
+           integer(ilp), parameter :: nmin = 75
+           integer(ilp), parameter :: k22min = 14
+           integer(ilp), parameter :: kacmin = 14
+           integer(ilp), parameter :: nibble = 14
+           integer(ilp), parameter :: knwswp = 500
+           integer(ilp), parameter :: rcost = 10
+           real(sp), parameter :: two = 2.0
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: nh, ns
+           integer(ilp) :: i, ic, iz
+           character :: subnam*6
+           ! Intrinsic Functions 
+           intrinsic :: log,max,mod,nint,real
+           ! Executable Statements 
+           if( ( ispec==ishfts ) .or. ( ispec==inwin ) .or.( ispec==iacc22 ) ) then
+              ! ==== set the number simultaneous shifts ====
+              nh = ihi - ilo + 1
+              ns = 2
+              if( nh>=30 )ns = 4
+              if( nh>=60 )ns = 10
+              if( nh>=150 )ns = max( 10, nh / nint( log( real( nh,KIND=dp) ) / log( two ),&
+                        KIND=ilp) )
+              if( nh>=590 )ns = 64
+              if( nh>=3000 )ns = 128
+              if( nh>=6000 )ns = 256
+              ns = max( 2, ns-mod( ns, 2 ) )
+           end if
+           if( ispec==inmin ) then
+              ! ===== matrices of order smaller than nmin get sent
+              ! .     to xlahqr, the classic double shift algorithm.
+              ! .     this must be at least 11. ====
+              stdlib_iparmq = nmin
+           else if( ispec==inibl ) then
+              ! ==== inibl: skip a multi-shift qr iteration and
+              ! .    whenever aggressive early deflation finds
+              ! .    at least (nibble*(window size)/100) deflations. ====
+              stdlib_iparmq = nibble
+           else if( ispec==ishfts ) then
+              ! ==== nshfts: the number of simultaneous shifts =====
+              stdlib_iparmq = ns
+           else if( ispec==inwin ) then
+              ! ==== nw: deflation window size.  ====
+              if( nh<=knwswp ) then
+                 stdlib_iparmq = ns
+              else
+                 stdlib_iparmq = 3*ns / 2
+              end if
+           else if( ispec==iacc22 ) then
+              ! ==== iacc22: whether to accumulate reflections
+              ! .     before updating the far-from-diagonal elements
+              ! .     and whether to use 2-by-2 block structure while
+              ! .     doing it.  a small amount of work could be saved
+              ! .     by making this choice dependent also upon the
+              ! .     nh=ihi-ilo+1.
+              ! convert name to upper case if the first character is lower case.
+              stdlib_iparmq = 0
+              subnam = name
+              ic = ichar( subnam( 1: 1 ) )
+              iz = ichar( 'Z' )
+              if( iz==90 .or. iz==122 ) then
+                 ! ascii character set
+                 if( ic>=97 .and. ic<=122 ) then
+                    subnam( 1: 1 ) = char( ic-32 )
+                    do i = 2, 6
+                       ic = ichar( subnam( i: i ) )
+                       if( ic>=97 .and. ic<=122 )subnam( i: i ) = char( ic-32 )
+                    end do
+                 end if
+              else if( iz==233 .or. iz==169 ) then
+                 ! ebcdic character set
+                 if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 .and. &
+                           ic<=169 ) ) then
+                    subnam( 1: 1 ) = char( ic+64 )
+                    do i = 2, 6
+                       ic = ichar( subnam( i: i ) )
+                       if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 &
+                                 .and. ic<=169 ) )subnam( i:i ) = char( ic+64 )
+                    end do
+                 end if
+              else if( iz==218 .or. iz==250 ) then
+                 ! prime machines:  ascii+128
+                 if( ic>=225 .and. ic<=250 ) then
+                    subnam( 1: 1 ) = char( ic-32 )
+                    do i = 2, 6
+                       ic = ichar( subnam( i: i ) )
+                       if( ic>=225 .and. ic<=250 )subnam( i: i ) = char( ic-32 )
+                    end do
+                 end if
+              end if
+              if( subnam( 2:6 )=='GGHRD' .or.subnam( 2:6 )=='GGHD3' ) then
+                 stdlib_iparmq = 1
+                 if( nh>=k22min )stdlib_iparmq = 2
+              else if ( subnam( 4:6 )=='EXC' ) then
+                 if( nh>=kacmin )stdlib_iparmq = 1
+                 if( nh>=k22min )stdlib_iparmq = 2
+              else if ( subnam( 2:6 )=='HSEQR' .or.subnam( 2:5 )=='LAQR' ) then
+                 if( ns>=kacmin )stdlib_iparmq = 1
+                 if( ns>=k22min )stdlib_iparmq = 2
+              end if
+           else if( ispec==icost ) then
+              ! === relative cost of near-the-diagonal chase vs
+                  ! blas updates ===
+              stdlib_iparmq = rcost
+           else
+              ! ===== invalid value of ispec =====
+              stdlib_iparmq = -1
+           end if
+     end function stdlib_iparmq
+
+     !> IZMAX1: finds the index of the first vector element of maximum absolute value.
+     !> Based on IZAMAX from Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure integer(ilp) function stdlib_izmax1( n, zx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: dmax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_izmax1 = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_izmax1 = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = abs(zx(1))
+              do i = 2,n
+                 if (abs(zx(i))>dmax) then
+                    stdlib_izmax1 = i
+                    dmax = abs(zx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = abs(zx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(zx(ix))>dmax) then
+                    stdlib_izmax1 = i
+                    dmax = abs(zx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_izmax1
+
+     !> LSAMEN:  tests if the first N letters of CA are the same as the
+     !> first N letters of CB, regardless of case.
+     !> LSAMEN returns .TRUE. if CA and CB are equivalent except for case
+     !> and .FALSE. otherwise.  LSAMEN also returns .FALSE. if LEN( CA )
+     !> or LEN( CB ) is less than N.
+
+     pure logical(lk)          function stdlib_lsamen( n, ca, cb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character(len=*), intent(in) :: ca, cb
+           integer(ilp), intent(in) :: n
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: len
+           ! Executable Statements 
+           stdlib_lsamen = .false.
+           if( len( ca )<n .or. len( cb )<n )go to 20
+           ! do for each character in the two strings.
+           do i = 1, n
+              ! test if the characters are equal using stdlib_lsame.
+              if( .not.stdlib_lsame( ca( i: i ), cb( i: i ) ) )go to 20
+           end do
+           stdlib_lsamen = .true.
+           20 continue
+           return
+     end function stdlib_lsamen
+
+     !> SROUNDUP_LWORK: deals with a subtle bug with returning LWORK as a Float.
+     !> This routine guarantees it is rounded up instead of down by
+     !> multiplying LWORK by 1+eps when it is necessary, where eps is the relative machine precision.
+     !> E.g.,
+     !> float( 16777217            ) == 16777216
+     !> float( 16777217 ) * (1.+eps) == 16777218
+     !> \return SROUNDUP_LWORK
+     !>
+     !> SROUNDUP_LWORK >= LWORK.
+     !> SROUNDUP_LWORK is guaranteed to have zero decimal part.
+
+     pure real(sp)             function stdlib_sroundup_lwork( lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lwork
+       ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: epsilon,real,int
+           ! Executable Statements 
+           stdlib_sroundup_lwork = real( lwork,KIND=sp)
+           if( int( stdlib_sroundup_lwork,KIND=ilp) < lwork ) then
+               ! force round up of lwork
+               stdlib_sroundup_lwork = stdlib_sroundup_lwork * ( 1.0e+0_sp + epsilon(0.0e+0_sp) )
+                         
+           endif
+           return
+     end function stdlib_sroundup_lwork
+
+#:if WITH_QP 
+
+     !> DROUNDUP_LWORK: deals with a subtle bug with returning LWORK as a Float.
+     !> This routine guarantees it is rounded up instead of down by
+     !> multiplying LWORK by 1+eps when it is necessary, where eps is the relative machine precision.
+     !> E.g.,
+     !> float( 9007199254740993            ) == 9007199254740992
+     !> float( 9007199254740993 ) * (1.+eps) == 9007199254740994
+     !> \return DROUNDUP_LWORK
+     !>
+     !> DROUNDUP_LWORK >= LWORK.
+     !> DROUNDUP_LWORK is guaranteed to have zero decimal part.
+
+     pure real(qp) function stdlib_qroundup_lwork( lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lwork
+       ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: epsilon,real,int
+           ! Executable Statements 
+           stdlib_qroundup_lwork = real( lwork,KIND=qp)
+           if( int( stdlib_qroundup_lwork,KIND=ilp) < lwork ) then
+               ! force round up of lwork
+               stdlib_qroundup_lwork = stdlib_qroundup_lwork * ( 1.0e+0_qp + epsilon(0.0e+0_qp) )
+                         
+           endif
+           return
+     end function stdlib_qroundup_lwork
+#:endif 
+
+#:if WITH_QP 
+
+     !> This subroutine translated from a character string specifying if a
+     !> matrix has unit diagonal or not to the relevant BLAST-specified
+     !> integer constant.
+     !> ILADIAG: returns an INTEGER.  If ILADIAG: < 0, then the input is not a
+     !> character indicating a unit or non-unit diagonal.  Otherwise ILADIAG
+     !> returns the constant value corresponding to DIAG.
+
+     integer(ilp) function stdlib_ilaqiag( diag )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character :: diag
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blas_non_unit_qiag = 131
+           integer(ilp), parameter :: blas_unit_qiag = 132
+           
+           ! Executable Statements 
+           if( stdlib_lsame( diag, 'N' ) ) then
+              stdlib_ilaqiag = blas_non_unit_qiag
+           else if( stdlib_lsame( diag, 'U' ) ) then
+              stdlib_ilaqiag = blas_unit_qiag
+           else
+              stdlib_ilaqiag = -1
+           end if
+           return
+     end function stdlib_ilaqiag
+#:endif 
+
+#:if WITH_QP 
+
+     !> ILADLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_ilaqlc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: zero = 0.0d+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_ilaqlc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaqlc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_ilaqlc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_ilaqlc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_ilaqlc
+#:endif 
+
+#:if WITH_QP 
+
+     !> ILADLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_ilaqlr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: zero = 0.0d+0
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_ilaqlr = m
+           else if( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilaqlr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_ilaqlr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_ilaqlr = max( stdlib_ilaqlr, i )
+              end do
+           end if
+           return
+     end function stdlib_ilaqlr
+#:endif 
+
+#:if WITH_QP 
+
+     !> ILAZLC: scans A for its last non-zero column.
+
+     pure integer(ilp) function stdlib_ilawlc( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(qp), parameter :: zero = (0.0d+0,0.0d+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( n==0 ) then
+              stdlib_ilawlc = n
+           else if( a(1, n)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilawlc = n
+           else
+           ! now scan each column from the end, returning with the first non-zero.
+              do stdlib_ilawlc = n, 1, -1
+                 do i = 1, m
+                    if( a(i, stdlib_ilawlc)/=zero ) return
+                 end do
+              end do
+           end if
+           return
+     end function stdlib_ilawlc
+#:endif 
+
+#:if WITH_QP 
+
+     !> ILAZLR: scans A for its last non-zero row.
+
+     pure integer(ilp) function stdlib_ilawlr( m, n, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: m, n, lda
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           complex(qp), parameter :: zero = (0.0d+0,0.0d+0)
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick test for the common case where one corner is non-zero.
+           if( m==0 ) then
+              stdlib_ilawlr = m
+           else if( a(m, 1)/=zero .or. a(m, n)/=zero ) then
+              stdlib_ilawlr = m
+           else
+           ! scan up each column tracking the last zero row seen.
+              stdlib_ilawlr = 0
+              do j = 1, n
+                 i=m
+                 do while((a(max(i,1),j)==zero).and.(i>=1))
+                    i=i-1
+                 enddo
+                 stdlib_ilawlr = max( stdlib_ilawlr, i )
+              end do
+           end if
+           return
+     end function stdlib_ilawlr
+#:endif 
+
+#:if WITH_QP 
+
+     !> IZMAX1: finds the index of the first vector element of maximum absolute value.
+     !> Based on IZAMAX from Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure integer(ilp) function stdlib_iwmax1( n, zx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: zx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: dmax
+           integer(ilp) :: i, ix
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_iwmax1 = 0
+           if (n<1 .or. incx<=0) return
+           stdlib_iwmax1 = 1
+           if (n==1) return
+           if (incx==1) then
+              ! code for increment equal to 1
+              dmax = abs(zx(1))
+              do i = 2,n
+                 if (abs(zx(i))>dmax) then
+                    stdlib_iwmax1 = i
+                    dmax = abs(zx(i))
+                 end if
+              end do
+           else
+              ! code for increment not equal to 1
+              ix = 1
+              dmax = abs(zx(1))
+              ix = ix + incx
+              do i = 2,n
+                 if (abs(zx(ix))>dmax) then
+                    stdlib_iwmax1 = i
+                    dmax = abs(zx(ix))
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end function stdlib_iwmax1
+#:endif 
+
+     !> ILAENV: is called from the LAPACK routines to choose problem-dependent
+     !> parameters for the local environment.  See ISPEC for a description of
+     !> the parameters.
+     !> ILAENV returns an INTEGER
+     !> if ILAENV >= 0: ILAENV returns the value of the parameter specified by ISPEC
+     !> if ILAENV < 0:  if ILAENV = -k, the k-th argument had an illegal value.
+     !> This version provides a set of parameters which should give good,
+     !> but not optimal, performance on many of the currently available
+     !> computers.  Users are encouraged to modify this subroutine to set
+     !> the tuning parameters for their particular machine using the option
+     !> and problem size information in the arguments.
+     !> This routine will not function correctly if it is converted to all
+     !> lower case.  Converting it to all upper case is allowed.
+
+     pure integer(ilp) function stdlib_ilaenv( ispec, name, opts, n1, n2, n3, n4 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character(len=*), intent(in) :: name, opts
+           integer(ilp), intent(in) :: ispec, n1, n2, n3, n4
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, iz, nb, nbmin, nx
+           logical(lk) :: cname, sname, twostage
+           character :: c1*1, c2*2, c4*2, c3*3, subnam*16
+           ! Intrinsic Functions 
+           intrinsic :: char,ichar,int,min,real
+           ! Executable Statements 
+           go to ( 10, 10, 10, 80, 90, 100, 110, 120,130, 140, 150, 160, 160, 160, 160, 160, 160)&
+                     ispec
+           ! invalid value for ispec
+           stdlib_ilaenv = -1
+           return
+           10 continue
+           ! convert name to upper case if the first character is lower case.
+           stdlib_ilaenv = 1
+           subnam = name
+           ic = ichar( subnam( 1: 1 ) )
+           iz = ichar( 'Z' )
+           if( iz==90 .or. iz==122 ) then
+              ! ascii character set
+              if( ic>=97 .and. ic<=122 ) then
+                 subnam( 1: 1 ) = char( ic-32 )
+                 do i = 2, 6
+                    ic = ichar( subnam( i: i ) )
+                    if( ic>=97 .and. ic<=122 )subnam( i: i ) = char( ic-32 )
+                 end do
+              end if
+           else if( iz==233 .or. iz==169 ) then
+              ! ebcdic character set
+              if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 .and. &
+                        ic<=169 ) ) then
+                 subnam( 1: 1 ) = char( ic+64 )
+                 do i = 2, 6
+                    ic = ichar( subnam( i: i ) )
+                    if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 &
+                              .and. ic<=169 ) )subnam( i:i ) = char( ic+64 )
+                 end do
+              end if
+           else if( iz==218 .or. iz==250 ) then
+              ! prime machines:  ascii+128
+              if( ic>=225 .and. ic<=250 ) then
+                 subnam( 1: 1 ) = char( ic-32 )
+                 do i = 2, 6
+                    ic = ichar( subnam( i: i ) )
+                    if( ic>=225 .and. ic<=250 )subnam( i: i ) = char( ic-32 )
+                 end do
+              end if
+           end if
+           c1 = subnam( 1: 1 )
+           sname = c1=='S' .or. c1=='D'
+           cname = c1=='C' .or. c1=='Z'
+           if( .not.( cname .or. sname ) )return
+           c2 = subnam( 2: 3 )
+           c3 = subnam( 4: 6 )
+           c4 = c3( 2: 3 )
+           twostage = len( subnam )>=11.and. subnam( 11: 11 )=='2'
+           go to ( 50, 60, 70 )ispec
+           50 continue
+           ! ispec = 1:  block size
+           ! in these examples, separate code is provided for setting nb for
+           ! real and complex.  we assume that nb will take the same value in
+           ! single or double precision.
+           nb = 1
+           if( subnam(2:6)=='LAORH' ) then
+              ! this is for *laorhr_getrfnp routine
+              if( sname ) then
+                  nb = 32
+              else
+                  nb = 32
+              end if
+           else if( c2=='GE' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              else if( c3=='QRF' .or. c3=='RQF' .or. c3=='LQF' .or.c3=='QLF' ) then
+                 if( sname ) then
+                    nb = 32
+                 else
+                    nb = 32
+                 end if
+              else if( c3=='QR ') then
+                 if( n3 == 1) then
+                    if( sname ) then
+           ! m*n
+                       if ((n1*n2<=131072).or.(n1<=8192)) then
+                          nb = n1
+                       else
+                          nb = 32768/n2
+                       end if
+                    else
+                       if ((n1*n2<=131072).or.(n1<=8192)) then
+                          nb = n1
+                       else
+                          nb = 32768/n2
+                       end if
+                    end if
+                 else
+                    if( sname ) then
+                       nb = 1
+                    else
+                       nb = 1
+                    end if
+                 end if
+              else if( c3=='LQ ') then
+                 if( n3 == 2) then
+                    if( sname ) then
+           ! m*n
+                       if ((n1*n2<=131072).or.(n1<=8192)) then
+                          nb = n1
+                       else
+                          nb = 32768/n2
+                       end if
+                    else
+                       if ((n1*n2<=131072).or.(n1<=8192)) then
+                          nb = n1
+                       else
+                          nb = 32768/n2
+                       end if
+                    end if
+                 else
+                    if( sname ) then
+                       nb = 1
+                    else
+                       nb = 1
+                    end if
+                 end if
+              else if( c3=='HRD' ) then
+                 if( sname ) then
+                    nb = 32
+                 else
+                    nb = 32
+                 end if
+              else if( c3=='BRD' ) then
+                 if( sname ) then
+                    nb = 32
+                 else
+                    nb = 32
+                 end if
+              else if( c3=='TRI' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              end if
+           else if( c2=='PO' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              end if
+           else if( c2=='SY' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    if( twostage ) then
+                       nb = 192
+                    else
+                       nb = 64
+                    end if
+                 else
+                    if( twostage ) then
+                       nb = 192
+                    else
+                       nb = 64
+                    end if
+                 end if
+              else if( sname .and. c3=='TRD' ) then
+                 nb = 32
+              else if( sname .and. c3=='GST' ) then
+                 nb = 64
+              end if
+           else if( cname .and. c2=='HE' ) then
+              if( c3=='TRF' ) then
+                 if( twostage ) then
+                    nb = 192
+                 else
+                    nb = 64
+                 end if
+              else if( c3=='TRD' ) then
+                 nb = 32
+              else if( c3=='GST' ) then
+                 nb = 64
+              end if
+           else if( sname .and. c2=='OR' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nb = 32
+                 end if
+              else if( c3( 1: 1 )=='M' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nb = 32
+                 end if
+              end if
+           else if( cname .and. c2=='UN' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nb = 32
+                 end if
+              else if( c3( 1: 1 )=='M' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nb = 32
+                 end if
+              end if
+           else if( c2=='GB' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    if( n4<=64 ) then
+                       nb = 1
+                    else
+                       nb = 32
+                    end if
+                 else
+                    if( n4<=64 ) then
+                       nb = 1
+                    else
+                       nb = 32
+                    end if
+                 end if
+              end if
+           else if( c2=='PB' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    if( n2<=64 ) then
+                       nb = 1
+                    else
+                       nb = 32
+                    end if
+                 else
+                    if( n2<=64 ) then
+                       nb = 1
+                    else
+                       nb = 32
+                    end if
+                 end if
+              end if
+           else if( c2=='TR' ) then
+              if( c3=='TRI' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              else if ( c3=='EVC' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              end if
+           else if( c2=='LA' ) then
+              if( c3=='UUM' ) then
+                 if( sname ) then
+                    nb = 64
+                 else
+                    nb = 64
+                 end if
+              end if
+           else if( sname .and. c2=='ST' ) then
+              if( c3=='EBZ' ) then
+                 nb = 1
+              end if
+           else if( c2=='GG' ) then
+              nb = 32
+              if( c3=='HD3' ) then
+                 if( sname ) then
+                    nb = 32
+                 else
+                    nb = 32
+                 end if
+              end if
+           end if
+           stdlib_ilaenv = nb
+           return
+           60 continue
+           ! ispec = 2:  minimum block size
+           nbmin = 2
+           if( c2=='GE' ) then
+              if( c3=='QRF' .or. c3=='RQF' .or. c3=='LQF' .or. c3=='QLF' ) then
+                 if( sname ) then
+                    nbmin = 2
+                 else
+                    nbmin = 2
+                 end if
+              else if( c3=='HRD' ) then
+                 if( sname ) then
+                    nbmin = 2
+                 else
+                    nbmin = 2
+                 end if
+              else if( c3=='BRD' ) then
+                 if( sname ) then
+                    nbmin = 2
+                 else
+                    nbmin = 2
+                 end if
+              else if( c3=='TRI' ) then
+                 if( sname ) then
+                    nbmin = 2
+                 else
+                    nbmin = 2
+                 end if
+              end if
+           else if( c2=='SY' ) then
+              if( c3=='TRF' ) then
+                 if( sname ) then
+                    nbmin = 8
+                 else
+                    nbmin = 8
+                 end if
+              else if( sname .and. c3=='TRD' ) then
+                 nbmin = 2
+              end if
+           else if( cname .and. c2=='HE' ) then
+              if( c3=='TRD' ) then
+                 nbmin = 2
+              end if
+           else if( sname .and. c2=='OR' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nbmin = 2
+                 end if
+              else if( c3( 1: 1 )=='M' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nbmin = 2
+                 end if
+              end if
+           else if( cname .and. c2=='UN' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nbmin = 2
+                 end if
+              else if( c3( 1: 1 )=='M' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nbmin = 2
+                 end if
+              end if
+           else if( c2=='GG' ) then
+              nbmin = 2
+              if( c3=='HD3' ) then
+                 nbmin = 2
+              end if
+           end if
+           stdlib_ilaenv = nbmin
+           return
+           70 continue
+           ! ispec = 3:  crossover point
+           nx = 0
+           if( c2=='GE' ) then
+              if( c3=='QRF' .or. c3=='RQF' .or. c3=='LQF' .or. c3=='QLF' ) then
+                 if( sname ) then
+                    nx = 128
+                 else
+                    nx = 128
+                 end if
+              else if( c3=='HRD' ) then
+                 if( sname ) then
+                    nx = 128
+                 else
+                    nx = 128
+                 end if
+              else if( c3=='BRD' ) then
+                 if( sname ) then
+                    nx = 128
+                 else
+                    nx = 128
+                 end if
+              end if
+           else if( c2=='SY' ) then
+              if( sname .and. c3=='TRD' ) then
+                 nx = 32
+              end if
+           else if( cname .and. c2=='HE' ) then
+              if( c3=='TRD' ) then
+                 nx = 32
+              end if
+           else if( sname .and. c2=='OR' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nx = 128
+                 end if
+              end if
+           else if( cname .and. c2=='UN' ) then
+              if( c3( 1: 1 )=='G' ) then
+                 if( c4=='QR' .or. c4=='RQ' .or. c4=='LQ' .or. c4=='QL' .or. c4=='HR' .or. &
+                           c4=='TR' .or. c4=='BR' )then
+                    nx = 128
+                 end if
+              end if
+           else if( c2=='GG' ) then
+              nx = 128
+              if( c3=='HD3' ) then
+                 nx = 128
+              end if
+           end if
+           stdlib_ilaenv = nx
+           return
+           80 continue
+           ! ispec = 4:  number of shifts (used by xhseqr)
+           stdlib_ilaenv = 6
+           return
+           90 continue
+           ! ispec = 5:  minimum column dimension (not used)
+           stdlib_ilaenv = 2
+           return
+           100 continue
+           ! ispec = 6:  crossover point for svd (used by xgelss and xgesvd)
+           stdlib_ilaenv = int( real( min( n1, n2 ),KIND=dp)*1.6e0,KIND=ilp)
+           return
+           110 continue
+           ! ispec = 7:  number of processors (not used)
+           stdlib_ilaenv = 1
+           return
+           120 continue
+           ! ispec = 8:  crossover point for multishift (used by xhseqr)
+           stdlib_ilaenv = 50
+           return
+           130 continue
+           ! ispec = 9:  maximum size of the subproblems at the bottom of the
+                       ! computation tree in the divide-and-conquer algorithm
+                       ! (used by xgelsd and xgesdd)
+           stdlib_ilaenv = 25
+           return
+           140 continue
+           ! ispec = 10: ieee and infinity nan arithmetic can be trusted not to trap
+           ! stdlib_ilaenv = 0
+           stdlib_ilaenv = 1
+           if( stdlib_ilaenv==1 ) then
+              stdlib_ilaenv = stdlib_ieeeck( 1, 0.0, 1.0 )
+           end if
+           return
+           150 continue
+           ! ispec = 11: ieee infinity arithmetic can be trusted not to trap
+           ! stdlib_ilaenv = 0
+           stdlib_ilaenv = 1
+           if( stdlib_ilaenv==1 ) then
+              stdlib_ilaenv = stdlib_ieeeck( 0, 0.0, 1.0 )
+           end if
+           return
+           160 continue
+           ! 12 <= ispec <= 17: xhseqr or related subroutines.
+           stdlib_ilaenv = stdlib_iparmq( ispec, name, opts, n1, n2, n3, n4 )
+           return
+     end function stdlib_ilaenv
+
+     !> This program sets problem and machine dependent parameters
+     !> useful for xHETRD_2STAGE, xHETRD_HE2HB, xHETRD_HB2ST,
+     !> xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD
+     !> and related subroutines for eigenvalue problems.
+     !> It is called whenever ILAENV is called with 17 <= ISPEC <= 21.
+     !> It is called whenever ILAENV2STAGE is called with 1 <= ISPEC <= 5
+     !> with a direct conversion ISPEC + 16.
+
+     pure integer(ilp) function stdlib_iparam2stage( ispec, name, opts,ni, nbi, ibi, nxi )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character(len=*), intent(in) :: name, opts
+           integer(ilp), intent(in) :: ispec, ni, nbi, ibi, nxi
+        ! ================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, iz, kd, ib, lhous, lwork, nthreads, factoptnb, qroptnb, &
+                     lqoptnb
+           logical(lk) :: rprec, cprec
+           character :: prec*1, algo*3, stag*5, subnam*12, vect*1
+           ! Intrinsic Functions 
+           intrinsic :: char,ichar,max
+           ! Executable Statements 
+           ! invalid value for ispec
+           if( (ispec<17).or.(ispec>21) ) then
+               stdlib_iparam2stage = -1
+               return
+           endif
+           ! get the number of threads
+           nthreads = 1
+           !$ nthreads = omp_get_num_threads()
+
+           ! write(*,*) 'iparam voici nthreads ispec ',nthreads, ispec
+           if( ispec /= 19 ) then
+              ! convert name to upper case if the first character is lower case.
+              stdlib_iparam2stage = -1
+              subnam = name
+              ic = ichar( subnam( 1: 1 ) )
+              iz = ichar( 'Z' )
+              if( iz==90 .or. iz==122 ) then
+                 ! ascii character set
+                 if( ic>=97 .and. ic<=122 ) then
+                    subnam( 1: 1 ) = char( ic-32 )
+                    do i = 2, 12
+                       ic = ichar( subnam( i: i ) )
+                       if( ic>=97 .and. ic<=122 )subnam( i: i ) = char( ic-32 )
+                    end do
+                 end if
+              else if( iz==233 .or. iz==169 ) then
+                 ! ebcdic character set
+                 if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 .and. &
+                           ic<=169 ) ) then
+                    subnam( 1: 1 ) = char( ic+64 )
+                    do i = 2, 12
+                       ic = ichar( subnam( i: i ) )
+                       if( ( ic>=129 .and. ic<=137 ) .or.( ic>=145 .and. ic<=153 ) .or.( ic>=162 &
+                                 .and. ic<=169 ) )subnam( i:i ) = char( ic+64 )
+                    end do
+                 end if
+              else if( iz==218 .or. iz==250 ) then
+                 ! prime machines:  ascii+128
+                 if( ic>=225 .and. ic<=250 ) then
+                    subnam( 1: 1 ) = char( ic-32 )
+                    do i = 2, 12
+                      ic = ichar( subnam( i: i ) )
+                      if( ic>=225 .and. ic<=250 )subnam( i: i ) = char( ic-32 )
+                    end do
+                 end if
+              end if
+              prec  = subnam( 1: 1 )
+              algo  = subnam( 4: 6 )
+              stag  = subnam( 8:12 )
+              rprec = prec=='S' .or. prec=='D'
+              cprec = prec=='C' .or. prec=='Z'
+              ! invalid value for precision
+              if( .not.( rprec .or. cprec ) ) then
+                  stdlib_iparam2stage = -1
+                  return
+              endif
+           endif
+            ! write(*,*),'rprec,cprec ',rprec,cprec,
+           ! $           '   algo ',algo,'    stage ',stag
+           if (( ispec == 17 ) .or. ( ispec == 18 )) then
+           ! ispec = 17, 18:  block size kd, ib
+           ! could be also dependent from n but for now it
+           ! depend only on sequential or parallel
+              if( nthreads>4 ) then
+                 if( cprec ) then
+                    kd = 128
+                    ib = 32
+                 else
+                    kd = 160
+                    ib = 40
+                 endif
+              else if( nthreads>1 ) then
+                 if( cprec ) then
+                    kd = 64
+                    ib = 32
+                 else
+                    kd = 64
+                    ib = 32
+                 endif
+              else
+                 if( cprec ) then
+                    kd = 16
+                    ib = 16
+                 else
+                    kd = 32
+                    ib = 16
+                 endif
+              endif
+              if( ispec==17 ) stdlib_iparam2stage = kd
+              if( ispec==18 ) stdlib_iparam2stage = ib
+           else if ( ispec == 19 ) then
+           ! ispec = 19:
+           ! lhous length of the houselholder representation
+           ! matrix (v,t) of the second stage. should be >= 1.
+           ! will add the vect option here next release
+              vect  = opts(1:1)
+              if( vect=='N' ) then
+                 lhous = max( 1, 4*ni )
+              else
+                 ! this is not correct, it need to call the algo and the stage2
+                 lhous = max( 1, 4*ni ) + ibi
+              endif
+              if( lhous>=0 ) then
+                 stdlib_iparam2stage = lhous
+              else
+                 stdlib_iparam2stage = -1
+              endif
+           else if ( ispec == 20 ) then
+           ! ispec = 20: (21 for future use)
+           ! lwork length of the workspace for
+           ! either or both stages for trd and brd. should be >= 1.
+           ! trd:
+           ! trd_stage 1: = lt + lw + ls1 + ls2
+                        ! = ldt*kd + n*kd + n*max(kd,factoptnb) + lds2*kd
+                          ! where ldt=lds2=kd
+                        ! = n*kd + n*max(kd,factoptnb) + 2*kd*kd
+           ! trd_stage 2: = (2nb+1)*n + kd*nthreads
+           ! trd_both   : = max(stage1,stage2) + ab ( ab=(kd+1)*n )
+                        ! = n*kd + n*max(kd+1,factoptnb)
+                          ! + max(2*kd*kd, kd*nthreads)
+                          ! + (kd+1)*n
+              lwork        = -1
+              subnam(1:1)  = prec
+              subnam(2:6)  = 'GEQRF'
+              qroptnb      = stdlib_ilaenv( 1, subnam, ' ', ni, nbi, -1, -1 )
+              subnam(2:6)  = 'GELQF'
+              lqoptnb      = stdlib_ilaenv( 1, subnam, ' ', nbi, ni, -1, -1 )
+              ! could be qr or lq for trd and the max for brd
+              factoptnb    = max(qroptnb, lqoptnb)
+              if( algo=='TRD' ) then
+                 if( stag=='2STAG' ) then
+                    lwork = ni*nbi + ni*max(nbi+1,factoptnb)+ max(2*nbi*nbi, nbi*nthreads)+ (nbi+&
+                              1)*ni
+                 else if( (stag=='HE2HB').or.(stag=='SY2SB') ) then
+                    lwork = ni*nbi + ni*max(nbi,factoptnb) + 2*nbi*nbi
+                 else if( (stag=='HB2ST').or.(stag=='SB2ST') ) then
+                    lwork = (2*nbi+1)*ni + nbi*nthreads
+                 endif
+              else if( algo=='BRD' ) then
+                 if( stag=='2STAG' ) then
+                    lwork = 2*ni*nbi + ni*max(nbi+1,factoptnb)+ max(2*nbi*nbi, nbi*nthreads)+ (&
+                              nbi+1)*ni
+                 else if( stag=='GE2GB' ) then
+                    lwork = ni*nbi + ni*max(nbi,factoptnb) + 2*nbi*nbi
+                 else if( stag=='GB2BD' ) then
+                    lwork = (3*nbi+1)*ni + nbi*nthreads
+                 endif
+              endif
+              lwork = max ( 1, lwork )
+              if( lwork>0 ) then
+                 stdlib_iparam2stage = lwork
+              else
+                 stdlib_iparam2stage = -1
+              endif
+           else if ( ispec == 21 ) then
+           ! ispec = 21 for future use
+              stdlib_iparam2stage = nxi
+           endif
+     end function stdlib_iparam2stage
+
+     !> ILAENV2STAGE: is called from the LAPACK routines to choose problem-dependent
+     !> parameters for the local environment.  See ISPEC for a description of
+     !> the parameters.
+     !> It sets problem and machine dependent parameters useful for *_2STAGE and
+     !> related subroutines.
+     !> ILAENV2STAGE returns an INTEGER
+     !> if ILAENV2STAGE >= 0: ILAENV2STAGE returns the value of the parameter
+     !> specified by ISPEC
+     !> if ILAENV2STAGE < 0:  if ILAENV2STAGE = -k, the k-th argument had an
+     !> illegal value.
+     !> This version provides a set of parameters which should give good,
+     !> but not optimal, performance on many of the currently available
+     !> computers for the 2-stage solvers. Users are encouraged to modify this
+     !> subroutine to set the tuning parameters for their particular machine using
+     !> the option and problem size information in the arguments.
+     !> This routine will not function correctly if it is converted to all
+     !> lower case.  Converting it to all upper case is allowed.
+
+     pure integer(ilp) function stdlib_ilaenv2stage( ispec, name, opts, n1, n2, n3, n4 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! july 2017
+           ! Scalar Arguments 
+           character(len=*), intent(in) :: name, opts
+           integer(ilp), intent(in) :: ispec, n1, n2, n3, n4
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: iispec
+           ! Executable Statements 
+           go to ( 10, 10, 10, 10, 10 )ispec
+           ! invalid value for ispec
+           stdlib_ilaenv2stage = -1
+           return
+           10 continue
+           ! 2stage eigenvalues and svd or related subroutines.
+           iispec = 16 + ispec
+           stdlib_ilaenv2stage = stdlib_iparam2stage( iispec, name, opts,n1, n2, n3, n4 )
+           return
+     end function stdlib_ilaenv2stage
+
+
+
+end module stdlib_linalg_lapack_aux
diff --git a/src/stdlib_linalg_lapack_c.fypp b/src/stdlib_linalg_lapack_c.fypp
new file mode 100644
index 000000000..39aaa2f84
--- /dev/null
+++ b/src/stdlib_linalg_lapack_c.fypp
@@ -0,0 +1,80743 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack_c
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_cbbcsd
+     public :: stdlib_cbdsqr
+     public :: stdlib_cgbbrd
+     public :: stdlib_cgbcon
+     public :: stdlib_cgbequ
+     public :: stdlib_cgbequb
+     public :: stdlib_cgbrfs
+     public :: stdlib_cgbsv
+     public :: stdlib_cgbsvx
+     public :: stdlib_cgbtf2
+     public :: stdlib_cgbtrf
+     public :: stdlib_cgbtrs
+     public :: stdlib_cgebak
+     public :: stdlib_cgebal
+     public :: stdlib_cgebd2
+     public :: stdlib_cgebrd
+     public :: stdlib_cgecon
+     public :: stdlib_cgeequ
+     public :: stdlib_cgeequb
+     public :: stdlib_cgees
+     public :: stdlib_cgeesx
+     public :: stdlib_cgeev
+     public :: stdlib_cgeevx
+     public :: stdlib_cgehd2
+     public :: stdlib_cgehrd
+     public :: stdlib_cgejsv
+     public :: stdlib_cgelq
+     public :: stdlib_cgelq2
+     public :: stdlib_cgelqf
+     public :: stdlib_cgelqt
+     public :: stdlib_cgelqt3
+     public :: stdlib_cgels
+     public :: stdlib_cgelsd
+     public :: stdlib_cgelss
+     public :: stdlib_cgelsy
+     public :: stdlib_cgemlq
+     public :: stdlib_cgemlqt
+     public :: stdlib_cgemqr
+     public :: stdlib_cgemqrt
+     public :: stdlib_cgeql2
+     public :: stdlib_cgeqlf
+     public :: stdlib_cgeqp3
+     public :: stdlib_cgeqr
+     public :: stdlib_cgeqr2
+     public :: stdlib_cgeqr2p
+     public :: stdlib_cgeqrf
+     public :: stdlib_cgeqrfp
+     public :: stdlib_cgeqrt
+     public :: stdlib_cgeqrt2
+     public :: stdlib_cgeqrt3
+     public :: stdlib_cgerfs
+     public :: stdlib_cgerq2
+     public :: stdlib_cgerqf
+     public :: stdlib_cgesc2
+     public :: stdlib_cgesdd
+     public :: stdlib_cgesv
+     public :: stdlib_cgesvd
+     public :: stdlib_cgesvdq
+     public :: stdlib_cgesvj
+     public :: stdlib_cgesvx
+     public :: stdlib_cgetc2
+     public :: stdlib_cgetf2
+     public :: stdlib_cgetrf
+     public :: stdlib_cgetrf2
+     public :: stdlib_cgetri
+     public :: stdlib_cgetrs
+     public :: stdlib_cgetsls
+     public :: stdlib_cgetsqrhrt
+     public :: stdlib_cggbak
+     public :: stdlib_cggbal
+     public :: stdlib_cgges
+     public :: stdlib_cgges3
+     public :: stdlib_cggesx
+     public :: stdlib_cggev
+     public :: stdlib_cggev3
+     public :: stdlib_cggevx
+     public :: stdlib_cggglm
+     public :: stdlib_cgghd3
+     public :: stdlib_cgghrd
+     public :: stdlib_cgglse
+     public :: stdlib_cggqrf
+     public :: stdlib_cggrqf
+     public :: stdlib_cgsvj0
+     public :: stdlib_cgsvj1
+     public :: stdlib_cgtcon
+     public :: stdlib_cgtrfs
+     public :: stdlib_cgtsv
+     public :: stdlib_cgtsvx
+     public :: stdlib_cgttrf
+     public :: stdlib_cgttrs
+     public :: stdlib_cgtts2
+     public :: stdlib_chb2st_kernels
+     public :: stdlib_chbev
+     public :: stdlib_chbevd
+     public :: stdlib_chbevx
+     public :: stdlib_chbgst
+     public :: stdlib_chbgv
+     public :: stdlib_chbgvd
+     public :: stdlib_chbgvx
+     public :: stdlib_chbtrd
+     public :: stdlib_checon
+     public :: stdlib_checon_rook
+     public :: stdlib_cheequb
+     public :: stdlib_cheev
+     public :: stdlib_cheevd
+     public :: stdlib_cheevr
+     public :: stdlib_cheevx
+     public :: stdlib_chegs2
+     public :: stdlib_chegst
+     public :: stdlib_chegv
+     public :: stdlib_chegvd
+     public :: stdlib_chegvx
+     public :: stdlib_cherfs
+     public :: stdlib_chesv
+     public :: stdlib_chesv_aa
+     public :: stdlib_chesv_rk
+     public :: stdlib_chesv_rook
+     public :: stdlib_chesvx
+     public :: stdlib_cheswapr
+     public :: stdlib_chetd2
+     public :: stdlib_chetf2
+     public :: stdlib_chetf2_rk
+     public :: stdlib_chetf2_rook
+     public :: stdlib_chetrd
+     public :: stdlib_chetrd_hb2st
+     public :: stdlib_chetrd_he2hb
+     public :: stdlib_chetrf
+     public :: stdlib_chetrf_aa
+     public :: stdlib_chetrf_rk
+     public :: stdlib_chetrf_rook
+     public :: stdlib_chetri
+     public :: stdlib_chetri_rook
+     public :: stdlib_chetrs
+     public :: stdlib_chetrs2
+     public :: stdlib_chetrs_3
+     public :: stdlib_chetrs_aa
+     public :: stdlib_chetrs_rook
+     public :: stdlib_chfrk
+     public :: stdlib_chgeqz
+     public :: stdlib_chpcon
+     public :: stdlib_chpev
+     public :: stdlib_chpevd
+     public :: stdlib_chpevx
+     public :: stdlib_chpgst
+     public :: stdlib_chpgv
+     public :: stdlib_chpgvd
+     public :: stdlib_chpgvx
+     public :: stdlib_chprfs
+     public :: stdlib_chpsv
+     public :: stdlib_chpsvx
+     public :: stdlib_chptrd
+     public :: stdlib_chptrf
+     public :: stdlib_chptri
+     public :: stdlib_chptrs
+     public :: stdlib_chsein
+     public :: stdlib_chseqr
+     public :: stdlib_cla_gbamv
+     public :: stdlib_cla_gbrcond_c
+     public :: stdlib_cla_gbrpvgrw
+     public :: stdlib_cla_geamv
+     public :: stdlib_cla_gercond_c
+     public :: stdlib_cla_gerpvgrw
+     public :: stdlib_cla_heamv
+     public :: stdlib_cla_hercond_c
+     public :: stdlib_cla_herpvgrw
+     public :: stdlib_cla_lin_berr
+     public :: stdlib_cla_porcond_c
+     public :: stdlib_cla_porpvgrw
+     public :: stdlib_cla_syamv
+     public :: stdlib_cla_syrcond_c
+     public :: stdlib_cla_syrpvgrw
+     public :: stdlib_cla_wwaddw
+     public :: stdlib_clabrd
+     public :: stdlib_clacgv
+     public :: stdlib_clacn2
+     public :: stdlib_clacon
+     public :: stdlib_clacp2
+     public :: stdlib_clacpy
+     public :: stdlib_clacrm
+     public :: stdlib_clacrt
+     public :: stdlib_cladiv
+     public :: stdlib_claed0
+     public :: stdlib_claed7
+     public :: stdlib_claed8
+     public :: stdlib_claein
+     public :: stdlib_claesy
+     public :: stdlib_claev2
+     public :: stdlib_clag2z
+     public :: stdlib_clags2
+     public :: stdlib_clagtm
+     public :: stdlib_clahef
+     public :: stdlib_clahef_aa
+     public :: stdlib_clahef_rk
+     public :: stdlib_clahef_rook
+     public :: stdlib_clahqr
+     public :: stdlib_clahr2
+     public :: stdlib_claic1
+     public :: stdlib_clals0
+     public :: stdlib_clalsa
+     public :: stdlib_clalsd
+     public :: stdlib_clamswlq
+     public :: stdlib_clamtsqr
+     public :: stdlib_clangb
+     public :: stdlib_clange
+     public :: stdlib_clangt
+     public :: stdlib_clanhb
+     public :: stdlib_clanhe
+     public :: stdlib_clanhf
+     public :: stdlib_clanhp
+     public :: stdlib_clanhs
+     public :: stdlib_clanht
+     public :: stdlib_clansb
+     public :: stdlib_clansp
+     public :: stdlib_clansy
+     public :: stdlib_clantb
+     public :: stdlib_clantp
+     public :: stdlib_clantr
+     public :: stdlib_clapll
+     public :: stdlib_clapmr
+     public :: stdlib_clapmt
+     public :: stdlib_claqgb
+     public :: stdlib_claqge
+     public :: stdlib_claqhb
+     public :: stdlib_claqhe
+     public :: stdlib_claqhp
+     public :: stdlib_claqp2
+     public :: stdlib_claqps
+     public :: stdlib_claqr0
+     public :: stdlib_claqr1
+     public :: stdlib_claqr2
+     public :: stdlib_claqr3
+     public :: stdlib_claqr4
+     public :: stdlib_claqr5
+     public :: stdlib_claqsb
+     public :: stdlib_claqsp
+     public :: stdlib_claqsy
+     public :: stdlib_claqz0
+     public :: stdlib_claqz1
+     public :: stdlib_claqz2
+     public :: stdlib_claqz3
+     public :: stdlib_clar1v
+     public :: stdlib_clar2v
+     public :: stdlib_clarcm
+     public :: stdlib_clarf
+     public :: stdlib_clarfb
+     public :: stdlib_clarfb_gett
+     public :: stdlib_clarfg
+     public :: stdlib_clarfgp
+     public :: stdlib_clarft
+     public :: stdlib_clarfx
+     public :: stdlib_clarfy
+     public :: stdlib_clargv
+     public :: stdlib_clarnv
+     public :: stdlib_clarrv
+     public :: stdlib_clartg
+     public :: stdlib_clartv
+     public :: stdlib_clarz
+     public :: stdlib_clarzb
+     public :: stdlib_clarzt
+     public :: stdlib_clascl
+     public :: stdlib_claset
+     public :: stdlib_clasr
+     public :: stdlib_classq
+     public :: stdlib_claswlq
+     public :: stdlib_claswp
+     public :: stdlib_clasyf
+     public :: stdlib_clasyf_aa
+     public :: stdlib_clasyf_rk
+     public :: stdlib_clasyf_rook
+     public :: stdlib_clatbs
+     public :: stdlib_clatdf
+     public :: stdlib_clatps
+     public :: stdlib_clatrd
+     public :: stdlib_clatrs
+     public :: stdlib_clatrz
+     public :: stdlib_clatsqr
+     public :: stdlib_claunhr_col_getrfnp
+     public :: stdlib_claunhr_col_getrfnp2
+     public :: stdlib_clauu2
+     public :: stdlib_clauum
+     public :: stdlib_cpbcon
+     public :: stdlib_cpbequ
+     public :: stdlib_cpbrfs
+     public :: stdlib_cpbstf
+     public :: stdlib_cpbsv
+     public :: stdlib_cpbsvx
+     public :: stdlib_cpbtf2
+     public :: stdlib_cpbtrf
+     public :: stdlib_cpbtrs
+     public :: stdlib_cpftrf
+     public :: stdlib_cpftri
+     public :: stdlib_cpftrs
+     public :: stdlib_cpocon
+     public :: stdlib_cpoequ
+     public :: stdlib_cpoequb
+     public :: stdlib_cporfs
+     public :: stdlib_cposv
+     public :: stdlib_cposvx
+     public :: stdlib_cpotf2
+     public :: stdlib_cpotrf
+     public :: stdlib_cpotrf2
+     public :: stdlib_cpotri
+     public :: stdlib_cpotrs
+     public :: stdlib_cppcon
+     public :: stdlib_cppequ
+     public :: stdlib_cpprfs
+     public :: stdlib_cppsv
+     public :: stdlib_cppsvx
+     public :: stdlib_cpptrf
+     public :: stdlib_cpptri
+     public :: stdlib_cpptrs
+     public :: stdlib_cpstf2
+     public :: stdlib_cpstrf
+     public :: stdlib_cptcon
+     public :: stdlib_cpteqr
+     public :: stdlib_cptrfs
+     public :: stdlib_cptsv
+     public :: stdlib_cptsvx
+     public :: stdlib_cpttrf
+     public :: stdlib_cpttrs
+     public :: stdlib_cptts2
+     public :: stdlib_crot
+     public :: stdlib_cspcon
+     public :: stdlib_cspmv
+     public :: stdlib_cspr
+     public :: stdlib_csprfs
+     public :: stdlib_cspsv
+     public :: stdlib_cspsvx
+     public :: stdlib_csptrf
+     public :: stdlib_csptri
+     public :: stdlib_csptrs
+     public :: stdlib_csrscl
+     public :: stdlib_cstedc
+     public :: stdlib_cstegr
+     public :: stdlib_cstein
+     public :: stdlib_cstemr
+     public :: stdlib_csteqr
+     public :: stdlib_csycon
+     public :: stdlib_csycon_rook
+     public :: stdlib_csyconv
+     public :: stdlib_csyconvf
+     public :: stdlib_csyconvf_rook
+     public :: stdlib_csyequb
+     public :: stdlib_csymv
+     public :: stdlib_csyr
+     public :: stdlib_csyrfs
+     public :: stdlib_csysv
+     public :: stdlib_csysv_aa
+     public :: stdlib_csysv_rk
+     public :: stdlib_csysv_rook
+     public :: stdlib_csysvx
+     public :: stdlib_csyswapr
+     public :: stdlib_csytf2
+     public :: stdlib_csytf2_rk
+     public :: stdlib_csytf2_rook
+     public :: stdlib_csytrf
+     public :: stdlib_csytrf_aa
+     public :: stdlib_csytrf_rk
+     public :: stdlib_csytrf_rook
+     public :: stdlib_csytri
+     public :: stdlib_csytri_rook
+     public :: stdlib_csytrs
+     public :: stdlib_csytrs2
+     public :: stdlib_csytrs_3
+     public :: stdlib_csytrs_aa
+     public :: stdlib_csytrs_rook
+     public :: stdlib_ctbcon
+     public :: stdlib_ctbrfs
+     public :: stdlib_ctbtrs
+     public :: stdlib_ctfsm
+     public :: stdlib_ctftri
+     public :: stdlib_ctfttp
+     public :: stdlib_ctfttr
+     public :: stdlib_ctgevc
+     public :: stdlib_ctgex2
+     public :: stdlib_ctgexc
+     public :: stdlib_ctgsen
+     public :: stdlib_ctgsja
+     public :: stdlib_ctgsna
+     public :: stdlib_ctgsy2
+     public :: stdlib_ctgsyl
+     public :: stdlib_ctpcon
+     public :: stdlib_ctplqt
+     public :: stdlib_ctplqt2
+     public :: stdlib_ctpmlqt
+     public :: stdlib_ctpmqrt
+     public :: stdlib_ctpqrt
+     public :: stdlib_ctpqrt2
+     public :: stdlib_ctprfb
+     public :: stdlib_ctprfs
+     public :: stdlib_ctptri
+     public :: stdlib_ctptrs
+     public :: stdlib_ctpttf
+     public :: stdlib_ctpttr
+     public :: stdlib_ctrcon
+     public :: stdlib_ctrevc
+     public :: stdlib_ctrevc3
+     public :: stdlib_ctrexc
+     public :: stdlib_ctrrfs
+     public :: stdlib_ctrsen
+     public :: stdlib_ctrsna
+     public :: stdlib_ctrsyl
+     public :: stdlib_ctrti2
+     public :: stdlib_ctrtri
+     public :: stdlib_ctrtrs
+     public :: stdlib_ctrttf
+     public :: stdlib_ctrttp
+     public :: stdlib_ctzrzf
+     public :: stdlib_cunbdb
+     public :: stdlib_cunbdb1
+     public :: stdlib_cunbdb2
+     public :: stdlib_cunbdb3
+     public :: stdlib_cunbdb4
+     public :: stdlib_cunbdb5
+     public :: stdlib_cunbdb6
+     public :: stdlib_cuncsd
+     public :: stdlib_cuncsd2by1
+     public :: stdlib_cung2l
+     public :: stdlib_cung2r
+     public :: stdlib_cungbr
+     public :: stdlib_cunghr
+     public :: stdlib_cungl2
+     public :: stdlib_cunglq
+     public :: stdlib_cungql
+     public :: stdlib_cungqr
+     public :: stdlib_cungr2
+     public :: stdlib_cungrq
+     public :: stdlib_cungtr
+     public :: stdlib_cungtsqr
+     public :: stdlib_cungtsqr_row
+     public :: stdlib_cunhr_col
+     public :: stdlib_cunm22
+     public :: stdlib_cunm2l
+     public :: stdlib_cunm2r
+     public :: stdlib_cunmbr
+     public :: stdlib_cunmhr
+     public :: stdlib_cunml2
+     public :: stdlib_cunmlq
+     public :: stdlib_cunmql
+     public :: stdlib_cunmqr
+     public :: stdlib_cunmr2
+     public :: stdlib_cunmr3
+     public :: stdlib_cunmrq
+     public :: stdlib_cunmrz
+     public :: stdlib_cunmtr
+     public :: stdlib_cupgtr
+     public :: stdlib_cupmtr
+
+     ! 32-bit real constants 
+     real(sp),    parameter, private ::     negone = -1.00_sp
+     real(sp),    parameter, private ::       zero = 0.00_sp
+     real(sp),    parameter, private ::       half = 0.50_sp
+     real(sp),    parameter, private ::        one = 1.00_sp
+     real(sp),    parameter, private ::        two = 2.00_sp
+     real(sp),    parameter, private ::      three = 3.00_sp
+     real(sp),    parameter, private ::       four = 4.00_sp
+     real(sp),    parameter, private ::      eight = 8.00_sp
+     real(sp),    parameter, private ::        ten = 10.00_sp
+
+     ! 32-bit complex constants 
+     complex(sp), parameter, private :: czero   = ( 0.0_sp,0.0_sp)
+     complex(sp), parameter, private :: chalf   = ( 0.5_sp,0.0_sp)
+     complex(sp), parameter, private :: cone    = ( 1.0_sp,0.0_sp)
+     complex(sp), parameter, private :: cnegone = (-1.0_sp,0.0_sp)
+
+     ! 32-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(sp),    parameter, private :: rradix = real(radix(zero),sp) 
+     real(sp),    parameter, private :: ulp    = epsilon(zero) 
+     real(sp),    parameter, private :: eps    = ulp*half 
+     real(sp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(sp),    parameter, private :: safmax = one/safmin 
+     real(sp),    parameter, private :: smlnum = safmin/ulp 
+     real(sp),    parameter, private :: bignum = safmax*ulp 
+     real(sp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(sp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 32-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(sp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(sp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(sp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(sp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> CGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_cgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*), r(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: bignum, rcmax, rcmin, smlnum
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_cgbequ
+
+     !> CGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from CGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_cgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*), r(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_slamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_cgbequb
+
+     !> CGBTF2: computes an LU factorization of a complex m-by-n band matrix
+     !> A using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_cgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to czero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = czero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to czero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = czero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_icamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=czero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_cswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_cscal( km, cone / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_cgeru( km, ju-j, -cone, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is czero, set info to the index of the pivot
+                 ! unless a czero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_cgbtf2
+
+     !> CGEBAK: forms the right or left eigenvectors of a complex general
+     !> matrix by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by CGEBAL.
+
+     pure subroutine stdlib_cgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: scale(*)
+           complex(sp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(sp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_csscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_csscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_cgebak
+
+     !> CGEBAL: balances a general complex matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_cgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(out) :: scale(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sclfac = 2.0e+0_sp
+           real(sp), parameter :: factor = 0.95e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(sp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_cswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_cswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( real( a( j, i ),KIND=sp)/=zero .or. aimag( a( j, i ) )/=zero )cycle &
+                           loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( real( a( i, j ),KIND=sp)/=zero .or. aimag( a( i, j ) )/=zero )cycle &
+                           loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_scnrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_scnrm2( l-k+1, a( i , k ), lda )
+              ica = stdlib_icamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_icamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+                 if( stdlib_sisnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'CGEBAL', -info )
+                 return
+              end if
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_csscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_csscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_cgebal
+
+     !> CGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_cgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*), r(*)
+           complex(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, rcmax, rcmin, smlnum
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_cgeequ
+
+     !> CGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from CGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_cgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*), r(*)
+           complex(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_slamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log(r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_cgeequb
+
+     !> CGETC2: computes an LU factorization, using complete pivoting, of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is a level 1 BLAS version of the algorithm.
+
+     pure subroutine stdlib_cgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(sp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = cmplx( smlnum, zero,KIND=sp)
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_cswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_cswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = cmplx( smin, zero,KIND=sp)
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_cgeru( n-i, n-i, -cmplx( one,KIND=sp), a( i+1, i ), 1,a( i, i+1 ), lda, &
+                        a( i+1, i+1 ), lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = cmplx( smin, zero,KIND=sp)
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_cgetc2
+
+     !> CGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_cgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_icamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=czero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_cswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_cscal( m-j, cone / a( j, j ), a( j+1, j ), 1 )
+                    else
+                       do i = 1, m-j
+                          a( j+i, j ) = a( j+i, j ) / a( j, j )
+                       end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_cgeru( m-j, n-j, -cone, a( j+1, j ), 1, a( j, j+1 ),lda, a( j+1, j+1 &
+                           ), lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_cgetf2
+
+     !> CGGBAK: forms the right or left eigenvectors of a complex generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> CGGBAL.
+
+     pure subroutine stdlib_cggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: lscale(*), rscale(*)
+           complex(sp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_csscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_csscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = rscale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = rscale( i )
+                    if( k==i )cycle loop_60
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = lscale( i )
+                    if( k==i )cycle loop_80
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = lscale( i )
+                    if( k==i )cycle loop_100
+                    call stdlib_cswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_cggbak
+
+     !> CGGBAL: balances a pair of general complex matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_cggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(sp), intent(out) :: lscale(*), rscale(*), work(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sclfac = 1.0e+1_sp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(sp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,int,log10,max,min,real,sign
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = one
+           lscale( 1 ) = one
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_cswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_cswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_cswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_cswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 if( a( i, j )==czero ) then
+                    ta = zero
+                    go to 210
+                 end if
+                 ta = log10( cabs1( a( i, j ) ) ) / basl
+                 210 continue
+                 if( b( i, j )==czero ) then
+                    tb = zero
+                    go to 220
+                 end if
+                 tb = log10( cabs1( b( i, j ) ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=sp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_sdot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_sdot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_sscal( nr, beta, work( ilo ), 1 )
+           call stdlib_sscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_saxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_saxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==czero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==czero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=sp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==czero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==czero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=sp)*work( j ) + sum
+           end do
+           sum = stdlib_sdot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_sdot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_saxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_saxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_slamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_icamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_icamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = lscale( i ) + sign( half, lscale( i ) )
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_icamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_icamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = rscale( i ) + sign( half, rscale( i ) )
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_csscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_csscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_csscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_csscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_cggbal
+
+     !> CGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an N-by-N tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T *X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_cgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k
+           complex(sp) :: mult, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           loop_30: do k = 1, n - 1
+              if( dl( k )==czero ) then
+                 ! subdiagonal is czero, no elimination is required.
+                 if( d( k )==czero ) then
+                    ! diagonal is czero: set info = k and return; a unique
+                    ! solution can not be found.
+                    info = k
+                    return
+                 end if
+              else if( cabs1( d( k ) )>=cabs1( dl( k ) ) ) then
+                 ! no row interchange required
+                 mult = dl( k ) / d( k )
+                 d( k+1 ) = d( k+1 ) - mult*du( k )
+                 do j = 1, nrhs
+                    b( k+1, j ) = b( k+1, j ) - mult*b( k, j )
+                 end do
+                 if( k<( n-1 ) )dl( k ) = czero
+              else
+                 ! interchange rows k and k+1
+                 mult = d( k ) / dl( k )
+                 d( k ) = dl( k )
+                 temp = d( k+1 )
+                 d( k+1 ) = du( k ) - mult*temp
+                 if( k<( n-1 ) ) then
+                    dl( k ) = du( k+1 )
+                    du( k+1 ) = -mult*dl( k )
+                 end if
+                 du( k ) = temp
+                 do j = 1, nrhs
+                    temp = b( k, j )
+                    b( k, j ) = b( k+1, j )
+                    b( k+1, j ) = temp - mult*b( k+1, j )
+                 end do
+              end if
+           end do loop_30
+           if( d( n )==czero ) then
+              info = n
+              return
+           end if
+           ! back solve with the matrix u from the factorization.
+           do j = 1, nrhs
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do k = n - 2, 1, -1
+                 b( k, j ) = ( b( k, j )-du( k )*b( k+1, j )-dl( k )*b( k+2, j ) ) / d( k )
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_cgtsv
+
+     !> CGTTRF: computes an LU factorization of a complex tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_cgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: d(*), dl(*), du(*)
+           complex(sp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: fact, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'CGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( cabs1( d( i ) )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_cgttrf
+
+     !> CGTTS2: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by CGTTRF.
+
+     pure subroutine stdlib_cgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           complex(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    if( ipiv( i )==i ) then
+                       b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                    else
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - dl( i )*b( i, j )
+                    end if
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                 ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else if( itrans==1 ) then
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 70 continue
+                 ! solve u**t * x = b.
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - dl( i )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**t * x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                 ! solve l**t * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 130 continue
+                 ! solve u**h * x = b.
+                 b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) ) /conjg( d( 2 ) )
+                           
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )*b( &
+                              i-2, j ) ) /conjg( d( i ) )
+                 end do
+                 ! solve l**h * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 130
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**h * x = b.
+                    b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) ) /conjg( d( 2 ) )
+                              
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )&
+                                 *b( i-2, j ) ) / conjg( d( i ) )
+                    end do
+                 ! solve l**h * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_cgtts2
+
+     !> CHESWAPR: applies an elementary permutation on the rows and the columns of
+     !> a hermitian matrix.
+
+     pure subroutine stdlib_cheswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(sp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_cswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=conjg(a(i1+i,i2))
+                 a(i1+i,i2)=conjg(tmp)
+              end do
+               a(i1,i2)=conjg(a(i1,i2))
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from 1 to i1-1
+              call stdlib_cswap ( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=conjg(a(i2,i1+i))
+                  a(i2,i1+i)=conjg(tmp)
+               end do
+               a(i2,i1)=conjg(a(i2,i1))
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_cheswapr
+
+     !> CHETF2: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_chetf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(sp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(sp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_sisnan(absakk) ) then
+                 ! column k is or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=sp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=sp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( a( k, k ),KIND=sp)
+                    call stdlib_cher( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_csscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_slapy2( real( a( k-1, k ),KIND=sp),aimag( a( k-1, k ) ) )
+                                 
+                       d22 = real( a( k-1, k-1 ),KIND=sp) / d
+                       d11 = real( a( k, k ),KIND=sp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = a( k-1, k ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = d*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k-1 )*conjg( &
+                                       wkm1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              50 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_sisnan(absakk) ) then
+                 ! column k is zero or underflow, contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=sp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=sp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( a( k, k ),KIND=sp)
+                       call stdlib_cher( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_csscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_slapy2( real( a( k+1, k ),KIND=sp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=sp) / d
+                       d22 = real( a( k, k ),KIND=sp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = a( k+1, k ) / d
+                       d =  tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = d*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k+1 )*conjg( &
+                                       wkp1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 50
+           end if
+           90 continue
+           return
+     end subroutine stdlib_chetf2
+
+     !> CHETF2_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_chetf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*)
+        ! ======================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(sp) :: absakk, alpha, colmax, d, d11, d22, r1, stemp, rowmax, tt, sfmin
+           complex(sp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_cswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=sp)
+                    a( k, k ) = real( a( p, p ),KIND=sp)
+                    a( p, p ) = r1
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_cswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=sp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=sp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=sp)
+                          call stdlib_cher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_csscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=sp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_cher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_slapy2( real( a( k-1, k ),KIND=sp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=sp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=sp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_cswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=sp)
+                    a( k, k ) = real( a( p, p ),KIND=sp)
+                    a( p, p ) = r1
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_cswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=sp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=sp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=sp)
+                          call stdlib_cher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_csscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=sp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_cher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_slapy2( real( a( k+1, k ),KIND=sp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=sp) / d
+                       d22 = real( a( k, k ),KIND=sp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_chetf2_rk
+
+     !> CHETF2_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_chetf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! ======================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(sp) :: absakk, alpha, colmax, d, d11, d22, r1, stemp, rowmax, tt, sfmin
+           complex(sp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_cswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=sp)
+                    a( k, k ) = real( a( p, p ),KIND=sp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=sp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=sp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=sp)
+                          call stdlib_cher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_csscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=sp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_cher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_slapy2( real( a( k-1, k ),KIND=sp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=sp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=sp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_cswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=sp)
+                    a( k, k ) = real( a( p, p ),KIND=sp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=sp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=sp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=sp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=sp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=sp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=sp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=sp)
+                          call stdlib_cher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_csscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=sp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_cher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_slapy2( real( a( k+1, k ),KIND=sp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=sp) / d
+                       d22 = real( a( k, k ),KIND=sp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=sp), zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_chetf2_rook
+
+     !> CHETRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> CHETRF.
+
+     pure subroutine stdlib_chetri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(sp) :: ak, akp1, d, t
+           complex(sp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=sp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=sp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=sp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_cdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_ccopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_cdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=sp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 do j = kp + 1, k - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=sp) / t
+                 akp1 = real( a( k, k ),KIND=sp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=sp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_cdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_ccopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_cdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=sp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 do j = k + 1, kp - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_chetri
+
+     !> CHETRI_ROOK: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> CHETRF_ROOK.
+
+     pure subroutine stdlib_chetri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(sp) :: ak, akp1, d, t
+           complex(sp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 70
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=sp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=sp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=sp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_cdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_ccopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_chemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_cdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=sp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k,1:k)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1) in the leading submatrix a(k+1:n,k+1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k+1 and -ipiv(k+1)
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              70 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              80 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 120
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=sp) / t
+                 akp1 = real( a( k, k ),KIND=sp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_cdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=sp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_cdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_ccopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_chemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_cdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=sp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k:n,k:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k-1 and -ipiv(k-1)
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 80
+              120 continue
+           end if
+           return
+     end subroutine stdlib_chetri_rook
+
+     !> CHETRS_3: solves a system of linear equations A * X = B with a complex
+     !> Hermitian matrix A using the factorization computed
+     !> by CHETRF_RK or CHETRF_BK:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_chetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), e(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(sp) :: s
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_ctrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=sp) / real( a( i, i ),KIND=sp)
+                    call stdlib_csscal( nrhs, s, b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / conjg( akm1k )
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / conjg( akm1k )
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_ctrsm( 'L', 'U', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_ctrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=sp) / real( a( i, i ),KIND=sp)
+                    call stdlib_csscal( nrhs, s, b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / conjg( akm1k )
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / conjg( akm1k )
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+              call stdlib_ctrsm('L', 'L', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_chetrs_3
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> CHFRK: performs one of the Hermitian rank--k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n Hermitian
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_chfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           complex(sp) :: calpha, cbeta
+           ! Intrinsic Functions 
+           intrinsic :: max,cmplx
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_cherk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = czero
+              end do
+              return
+           end if
+           calpha = cmplx( alpha, zero,KIND=sp)
+           cbeta = cmplx( beta, zero,KIND=sp)
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_cherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_cherk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_cgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_cherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_cherk( 'U', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_cgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_cherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_cherk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_cgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_cherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_cherk( 'U', 'C', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_cgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 'c'
+                 if( lower ) then
+                    ! n is odd, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_cherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_cherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_cgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n1+1, 1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_cherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_cherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_cgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n1+1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_cherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_cherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_cgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_cherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_cherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_cgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_cherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_cherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_cgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_cherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_cherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_cgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_cherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_cherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_cgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_cherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_cherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_cgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 'c'
+                 if( lower ) then
+                    ! n is even, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_cherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_cherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_cgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_cherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_cherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_cgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_cherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_cherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_cgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_cherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_cherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_cgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_chfrk
+
+     !> CHPGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by CPPTRF.
+
+     pure subroutine stdlib_chpgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(sp) :: ajj, akk, bjj, bkk
+           complex(sp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    ap( jj ) = real( ap( jj ),KIND=sp)
+                    bjj = real( bp( jj ),KIND=sp)
+                    call stdlib_ctpsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', j,bp, ap( j1 ), 1 &
+                              )
+                    call stdlib_chpmv( uplo, j-1, -cone, ap, bp( j1 ), 1, cone,ap( j1 ), 1 )
+                              
+                    call stdlib_csscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_cdotc( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( ap( kk ),KIND=sp)
+                    bkk = real( bp( kk ),KIND=sp)
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_csscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_caxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_chpr2( uplo, n-k, -cone, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 &
+                                 ) )
+                       call stdlib_caxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_ctpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( ap( kk ),KIND=sp)
+                    bkk = real( bp( kk ),KIND=sp)
+                    call stdlib_ctpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_caxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_chpr2( uplo, k-1, cone, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_caxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_csscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = real( ap( jj ),KIND=sp)
+                    bjj = real( bp( jj ),KIND=sp)
+                    ap( jj ) = ajj*bjj + stdlib_cdotc( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_csscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_chpmv( uplo, n-j, cone, ap( j1j1 ), bp( jj+1 ), 1,cone, ap( jj+1 )&
+                              , 1 )
+                    call stdlib_ctpmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j+1, bp( jj ), &
+                              ap( jj ), 1 )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_chpgst
+
+     !> CHPTRF: computes the factorization of a complex Hermitian packed
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_chptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(sp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(sp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc+k-1 ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=sp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc+imax-1 ),KIND=sp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_cswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = conjg( ap( knc+j-1 ) )
+                       ap( knc+j-1 ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( kx+kk-1 ) = conjg( ap( kx+kk-1 ) )
+                    r1 = real( ap( knc+kk-1 ),KIND=sp)
+                    ap( knc+kk-1 ) = real( ap( kpc+kp-1 ),KIND=sp)
+                    ap( kpc+kp-1 ) = r1
+                    if( kstep==2 ) then
+                       ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=sp)
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 else
+                    ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=sp)
+                    if( kstep==2 )ap( kc-1 ) = real( ap( kc-1 ),KIND=sp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( ap( kc+k-1 ),KIND=sp)
+                    call stdlib_chpr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_csscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_slapy2( real( ap( k-1+( k-1 )*k / 2 ),KIND=sp),aimag( ap( k-1+( &
+                                 k-1 )*k / 2 ) ) )
+                       d22 = real( ap( k-1+( k-2 )*( k-1 ) / 2 ),KIND=sp) / d
+                       d11 = real( ap( k+( k-1 )*k / 2 ),KIND=sp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = ap( k-1+( k-1 )*k / 2 ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-conjg( d12 )*ap( j+( k-1 )*k &
+                                    / 2 ) )
+                          wk = d*( d22*ap( j+( k-1 )*k / 2 )-d12*ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *conjg( wk ) -ap( i+( k-2 )*( k-1 ) / 2 )*conjg( wkm1 )
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                          ap( j+( j-1 )*j / 2 ) = cmplx( real( ap( j+( j-1 )*j / 2 ),KIND=sp), &
+                                    zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc ) = real( ap( kc ),KIND=sp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc ),KIND=sp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = conjg( ap( knc+j-kk ) )
+                       ap( knc+j-kk ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( knc+kp-kk ) = conjg( ap( knc+kp-kk ) )
+                    r1 = real( ap( knc ),KIND=sp)
+                    ap( knc ) = real( ap( kpc ),KIND=sp)
+                    ap( kpc ) = r1
+                    if( kstep==2 ) then
+                       ap( kc ) = real( ap( kc ),KIND=sp)
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 else
+                    ap( kc ) = real( ap( kc ),KIND=sp)
+                    if( kstep==2 )ap( knc ) = real( ap( knc ),KIND=sp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( ap( kc ),KIND=sp)
+                       call stdlib_chpr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_csscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_slapy2( real( ap( k+1+( k-1 )*( 2*n-k ) / 2 ),KIND=sp),aimag( &
+                                 ap( k+1+( k-1 )*( 2*n-k ) / 2 ) ) )
+                       d11 = real( ap( k+1+k*( 2*n-k-1 ) / 2 ),KIND=sp) / d
+                       d22 = real( ap( k+( k-1 )*( 2*n-k ) / 2 ),KIND=sp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 ) / d
+                       d = tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-d21*ap( j+k*( 2*n-k-1 ) / 2 )&
+                                     )
+                          wkp1 = d*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-conjg( d21 )*ap( j+( k-1 )*( &
+                                    2*n-k ) / 2 ) )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                             i+( k-1 )*( 2*n-k ) /2 )*conjg( wk ) - ap( i+k*( 2*n-k-1 ) / 2 )&
+                                       *conjg( wkp1 )
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                          ap( j+( j-1 )*( 2*n-j ) / 2 )= cmplx( real( ap( j+( j-1 )*( 2*n-j ) / 2 &
+                                    ),KIND=sp),zero,KIND=sp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_chptrf
+
+     !> CHPTRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHPTRF.
+
+     pure subroutine stdlib_chptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(sp) :: ak, akp1, d, t
+           complex(sp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / real( ap( kc+k-1 ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_cdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = real( ap( kc+k-1 ),KIND=sp) / t
+                 akp1 = real( ap( kcnext+k ),KIND=sp) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_cdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=sp)
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_cdotc( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_ccopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -real( stdlib_cdotc( k-1, work, 1, ap( kcnext &
+                              ),1 ),KIND=sp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_cswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = conjg( ap( kc+j-1 ) )
+                    ap( kc+j-1 ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-1 ) = conjg( ap( kc+kp-1 ) )
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / real( ap( kc ),KIND=sp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_cdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=sp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = real( ap( kcnext ),KIND=sp) / t
+                 akp1 = real( ap( kc ),KIND=sp) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_cdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=sp)
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_cdotc( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_ccopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_chpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -real( stdlib_cdotc( n-k, work, 1, ap( kcnext+2 ),&
+                              1 ),KIND=sp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_cswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = conjg( ap( kc+j-k ) )
+                    ap( kc+j-k ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-k ) = conjg( ap( kc+kp-k ) )
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_chptri
+
+     !> CLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_cla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           complex(sp), intent(in) :: ab(ldab,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'CLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cla_gbamv
+
+     !> CLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(sp) function stdlib_cla_gbrpvgrw( n, kl, ku, ncols, ab, ldab, afb,ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           complex(sp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: amax, umax, rpvgrw
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( cabs1( ab( kd+i-j, j ) ), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( cabs1( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= 0.0_sp ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_cla_gbrpvgrw = rpvgrw
+     end function stdlib_cla_gbrpvgrw
+
+     !> CLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_cla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           integer(ilp), intent(in) :: trans
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'CLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == 0.0_sp ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == 0.0_sp ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= 0.0_sp ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cla_geamv
+
+     !> CLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(sp) function stdlib_cla_gerpvgrw( n, ncols, a, lda, af, ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: amax, umax, rpvgrw
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,min,abs,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( cabs1( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( cabs1( af( i, j ) ), umax )
+              end do
+              if ( umax /= 0.0_sp ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_cla_gerpvgrw = rpvgrw
+     end function stdlib_cla_gerpvgrw
+
+     !> CLA_SYAMV  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_cla_heamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'CHEMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cla_heamv
+
+     !> CLA_LIN_BERR: computes componentwise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the componentwise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_cla_lin_berr( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ayb(n,nrhs)
+           real(sp), intent(out) :: berr(nrhs)
+           complex(sp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: tmp,safe1
+           integer(ilp) :: i, j
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           complex(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the cla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= 0.0_sp) then
+                    tmp = (safe1 + cabs1(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_sp (and if computed by cla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_cla_lin_berr
+
+     !> CLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(sp) function stdlib_cla_porpvgrw( uplo, ncols, a, lda, af, ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_spotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_cla_porpvgrw = rpvgrw
+     end function stdlib_cla_porpvgrw
+
+     !> CLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_cla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           integer(ilp), intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'CLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cla_syamv
+
+     !> CLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_cla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: x(*), y(*)
+           complex(sp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(sp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_cla_wwaddw
+
+     !> CLACGV: conjugates a complex vector of length N.
+
+     pure subroutine stdlib_clacgv( n, x, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: x(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ioff
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( incx==1 ) then
+              do i = 1, n
+                 x( i ) = conjg( x( i ) )
+              end do
+           else
+              ioff = 1
+              if( incx<0 )ioff = 1 - ( n-1 )*incx
+              do i = 1, n
+                 x( ioff ) = conjg( x( ioff ) )
+                 ioff = ioff + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_clacgv
+
+     !> CLACN2: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_clacn2( n, v, x, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(sp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: isave(3)
+           complex(sp), intent(out) :: v(*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(sp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,real
+           ! Executable Statements 
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=sp),KIND=sp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_scsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=sp) / absxi,aimag( x( i ) ) / absxi,KIND=sp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_icmax1( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( isave( 2 ) ) = cone
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_ccopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_scsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=sp) / absxi,aimag( x( i ) ) / absxi,KIND=sp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_icmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) &
+                     then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one + real( i-1,KIND=sp) / real( n-1,KIND=sp) ),KIND=sp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_scsum1( n, x, 1 ) / real( 3*n,KIND=sp) )
+           if( temp>est ) then
+              call stdlib_ccopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_clacn2
+
+     !> CLACON: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_clacon( n, v, x, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(sp), intent(inout) :: est
+           ! Array Arguments 
+           complex(sp), intent(out) :: v(n)
+           complex(sp), intent(inout) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(sp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,real
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=sp),KIND=sp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_scsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=sp) / absxi,aimag( x( i ) ) / absxi,KIND=sp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           j = stdlib_icmax1( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( j ) = cone
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_ccopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_scsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=sp) / absxi,aimag( x( i ) ) / absxi,KIND=sp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = j
+           j = stdlib_icmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( j ) ) ) .and.( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one+real( i-1,KIND=sp) / real( n-1,KIND=sp) ),KIND=sp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_scsum1( n, x, 1 ) / real( 3*n,KIND=sp) )
+           if( temp>est ) then
+              call stdlib_ccopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_clacon
+
+     !> CLACP2: copies all or part of a real two-dimensional matrix A to a
+     !> complex matrix B.
+
+     pure subroutine stdlib_clacp2( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_clacp2
+
+     !> CLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_clacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_clacpy
+
+     !> CLACRM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by N and complex; B is N by N and real;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_clacrm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: b(ldb,*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,real
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( a( i, j ),KIND=sp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_sgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( a( i, j ) )
+              end do
+           end do
+           call stdlib_sgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=sp),rwork( l+( j-1 )*m+i-1 ),KIND=sp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_clacrm
+
+     !> CLACRT: performs the operation
+     !> (  c  s )( x )  ==> ( x )
+     !> ( -s  c )( y )      ( y )
+     !> where c and s are complex and the vectors x and y are complex.
+
+     pure subroutine stdlib_clacrt( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(sp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(sp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              ctemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - s*cx( ix )
+              cx( ix ) = ctemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              ctemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - s*cx( i )
+              cx( i ) = ctemp
+           end do
+           return
+     end subroutine stdlib_clacrt
+
+     !> CLADIV: := X / Y, where X and Y are complex.  The computation of X / Y
+     !> will not overflow on an intermediary step unless the results
+     !> overflows.
+
+     pure complex(sp) function stdlib_cladiv( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: x, y
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: zi, zr
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,real
+           ! Executable Statements 
+           call stdlib_sladiv( real( x,KIND=sp), aimag( x ), real( y,KIND=sp), aimag( y ), zr,zi )
+                     
+           stdlib_cladiv = cmplx( zr, zi,KIND=sp)
+           return
+     end function stdlib_cladiv
+
+     !> CLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_claed8( k, n, qsiz, q, ldq, d, rho, cutpnt, z, dlamda,q2, ldq2, w, &
+               indxp, indx, indxq, perm, givptr,givcol, givnum, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(sp), intent(inout) :: d(*), z(*)
+           real(sp), intent(out) :: dlamda(*), givnum(2,*), w(*)
+           complex(sp), intent(inout) :: q(ldq,*)
+           complex(sp), intent(out) :: q2(ldq2,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: mone = -1.0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(sp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -5
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -8
+           else if( ldq2<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_sscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_sscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_slamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_isamax( n, z, 1 )
+           jmax = stdlib_isamax( n, d, 1 )
+           eps = stdlib_slamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! -- except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              do j = 1, n
+                 perm( j ) = indxq( indx( j ) )
+                 call stdlib_ccopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+              call stdlib_clacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ), ldq )
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jlam = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_slapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 call stdlib_csrot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j ) &
+                           ) ), 1, c, s )
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 80 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 80
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 70
+           90 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           100 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           do j = 1, n
+              jp = indxp( j )
+              dlamda( j ) = d( jp )
+              perm( j ) = indxq( indx( jp ) )
+              call stdlib_ccopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_scopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_clacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2, q( 1, k+1 ),ldq )
+           end if
+           return
+     end subroutine stdlib_claed8
+
+     !> CLAESY: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> ( ( A, B );( B, C ) )
+     !> provided the norm of the matrix of eigenvectors is larger than
+     !> some threshold value.
+     !> RT1 is the eigenvalue of larger absolute value, and RT2 of
+     !> smaller absolute value.  If the eigenvectors are computed, then
+     !> on return ( CS1, SN1 ) is the unit eigenvector for RT1, hence
+     !> [  CS1     SN1   ] . [ A  B ] . [ CS1    -SN1   ] = [ RT1  0  ]
+     !> [ -SN1     CS1   ]   [ B  C ]   [ SN1     CS1   ]   [  0  RT2 ]
+
+     pure subroutine stdlib_claesy( a, b, c, rt1, rt2, evscal, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: a, b, c
+           complex(sp), intent(out) :: cs1, evscal, rt1, rt2, sn1
+       ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1_sp
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: babs, evnorm, tabs, z
+           complex(sp) :: s, t, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! special case:  the matrix is actually diagonal.
+           ! to avoid divide by zero later, we treat this case separately.
+           if( abs( b )==zero ) then
+              rt1 = a
+              rt2 = c
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+                 cs1 = zero
+                 sn1 = one
+              else
+                 cs1 = one
+                 sn1 = zero
+              end if
+           else
+              ! compute the eigenvalues and eigenvectors.
+              ! the characteristic equation is
+                 ! lambda **2 - (a+c) lambda + (a*c - b*b)
+              ! and we solve it using the quadratic formula.
+              s = ( a+c )*half
+              t = ( a-c )*half
+              ! take the square root carefully to avoid over/under flow.
+              babs = abs( b )
+              tabs = abs( t )
+              z = max( babs, tabs )
+              if( z>zero )t = z*sqrt( ( t / z )**2+( b / z )**2 )
+              ! compute the two eigenvalues.  rt1 and rt2 are exchanged
+              ! if necessary so that rt1 will have the greater magnitude.
+              rt1 = s + t
+              rt2 = s - t
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+              end if
+              ! choose cs1 = 1 and sn1 to satisfy the first equation, then
+              ! scale the components of this eigenvector so that the matrix
+              ! of eigenvectors x satisfies  x * x**t = i .  (no scaling is
+              ! done if the norm of the eigenvalue matrix is less than thresh.)
+              sn1 = ( rt1-a ) / b
+              tabs = abs( sn1 )
+              if( tabs>one ) then
+                 t = tabs*sqrt( ( one / tabs )**2+( sn1 / tabs )**2 )
+              else
+                 t = sqrt( cone+sn1*sn1 )
+              end if
+              evnorm = abs( t )
+              if( evnorm>=thresh ) then
+                 evscal = cone / t
+                 cs1 = evscal
+                 sn1 = sn1*evscal
+              else
+                 evscal = zero
+              end if
+           end if
+           return
+     end subroutine stdlib_claesy
+
+     !> CLAEV2: computes the eigendecomposition of a 2-by-2 Hermitian matrix
+     !> [  A         B  ]
+     !> [  CONJG(B)  C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  CONJG(SN1) ] [    A     B ] [ CS1 -CONJG(SN1) ] = [ RT1  0  ]
+     !> [-SN1     CS1     ] [ CONJG(B) C ] [ SN1     CS1     ]   [  0  RT2 ].
+
+     pure subroutine stdlib_claev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(out) :: cs1, rt1, rt2
+           complex(sp), intent(in) :: a, b, c
+           complex(sp), intent(out) :: sn1
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: t
+           complex(sp) :: w
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,real
+           ! Executable Statements 
+           if( abs( b )==zero ) then
+              w = one
+           else
+              w = conjg( b ) / abs( b )
+           end if
+           call stdlib_slaev2( real( a,KIND=sp), abs( b ), real( c,KIND=sp), rt1, rt2, cs1, t )
+                     
+           sn1 = w*t
+           return
+     end subroutine stdlib_claev2
+
+     !> CLAG2Z: converts a COMPLEX matrix, SA, to a COMPLEX*16 matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_clag2z( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: sa(ldsa,*)
+           complex(dp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_clag2z
+
+     !> CLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_clagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b + a**t * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b + a**h * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j ) +conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) + conjg( du( n-1 ) )*x( n-1, j ) + conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + conjg( du( i-1 ) )*x( i-1, j ) + conjg( d( i ) )&
+                                    *x( i, j ) + conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b - a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b - a**h*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j ) -conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) - conjg( du( n-1 ) )*x( n-1, j ) - conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - conjg( du( i-1 ) )*x( i-1, j ) - conjg( d( i ) )&
+                                    *x( i, j ) - conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_clagtm
+
+     !> CLAHEF: computes a partial factorization of a complex Hermitian
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method. The
+     !> partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> CLAHEF is an auxiliary routine called by CHETRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clahef( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(sp) :: absakk, alpha, colmax, r1, rowmax, t
+           complex(sp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              ! copy column k of a to column kw of w and update it
+              call stdlib_ccopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=sp)
+              if( k<n ) then
+                 call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=sp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_ccopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    w( imax, kw-1 ) = real( a( imax, imax ),KIND=sp)
+                    call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    call stdlib_clacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                    if( k<n ) then
+                       call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( imax,&
+                                  kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=sp)
+                    end if
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, kw-1 ),KIND=sp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_clacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_ccopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=sp)
+                       call stdlib_csscal( k-1, r1, a( 1, k ), 1 )
+                       ! (2) conjugate column w(kw)
+                       call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = conjg( d21 )*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_clacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in of rows in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows j and jp
+                 ! at each step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_cswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              ! copy column k of a to column k of w and update it
+              w( k, k ) = real( a( k, k ),KIND=sp)
+              if( k<n )call stdlib_ccopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              w( k, k ) = real( w( k, k ),KIND=sp)
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=sp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_clacgv( imax-k, w( k, k+1 ), 1 )
+                    w( imax, k+1 ) = real( a( imax, imax ),KIND=sp)
+                    if( imax<n )call stdlib_ccopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 ), &
+                              1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    w( imax, k+1 ) = real( w( imax, k+1 ),KIND=sp)
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, k+1 ),KIND=sp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_clacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_ccopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=sp)
+                       call stdlib_csscal( n-k, r1, a( k+1, k ), 1 )
+                       ! (2) conjugate column w(k)
+                       call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) )
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = conjg( d21 )*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_clacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows j and jp
+                 ! at each step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_cswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clahef
+
+     !> CLAHEF_RK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> CLAHEF_RK is an auxiliary routine called by CHETRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clahef_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: w(ldw,*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, k, kk, kkw, kp, kstep, kw, p
+           real(sp) :: absakk, alpha, colmax, stemp, r1, rowmax, t, sfmin
+           complex(sp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,aimag,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_ccopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=sp)
+              if( k<n ) then
+                 call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=sp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=sp)
+                 if( k>1 )call stdlib_ccopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_ccopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=sp)
+                       call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_clacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=sp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = cabs1( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=sp)
+                    call stdlib_ccopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_clacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_ccopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_clacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_ccopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=sp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_csscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_clacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=sp)
+              if( k<n )call stdlib_ccopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=sp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=sp)
+                 if( k<n )call stdlib_ccopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_clacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=sp)
+                       if( imax<n )call stdlib_ccopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=sp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = cabs1( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=sp)
+                    call stdlib_ccopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_clacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_ccopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_cswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_clacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_ccopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=sp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_csscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_clacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clahef_rk
+
+     !> CLAHEF_ROOK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting
+     !> method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> CLAHEF_ROOK is an auxiliary routine called by CHETRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clahef_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, jp1, jp2, k, kk, kkw, kp, kstep, kw, &
+                     p
+           real(sp) :: absakk, alpha, colmax, stemp, r1, rowmax, t, sfmin
+           complex(sp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,aimag,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_ccopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=sp)
+              if( k<n ) then
+                 call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=sp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=sp)
+                 if( k>1 )call stdlib_ccopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_ccopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=sp)
+                       call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_clacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=sp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = cabs1( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=sp)
+                    call stdlib_ccopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_clacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_ccopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_clacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_ccopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=sp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_csscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_clacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_clacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in of rows in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j+1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_cswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = jj + 1
+                 if( kstep==2 .and. jp1/=jj .and. j<=n )call stdlib_cswap( n-j+1, a( jp1, j ), &
+                           lda, a( jj, j ), lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=sp)
+              if( k<n )call stdlib_ccopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=sp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=sp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=sp)
+                 if( k<n )call stdlib_ccopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_clacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=sp)
+                       if( imax<n )call stdlib_ccopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=sp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = cabs1( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=sp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=sp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=sp)
+                    call stdlib_ccopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_clacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_ccopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_cswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=sp)
+                    call stdlib_ccopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_clacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_ccopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=sp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=sp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_csscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=sp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_clacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_clacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=sp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j-1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_cswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = jj -1
+                 if( kstep==2 .and. jp1/=jj .and. j>=1 )call stdlib_cswap( j, a( jp1, 1 ), lda, a(&
+                            jj, 1 ), lda )
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clahef_rook
+
+     !> CLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then CLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L      0  ]
+     !> Lhat = [ w**H gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**H and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ conjg(alpha) ]
+     !> [ conjg(gamma) ]
+     !> where  alpha =  x**H*w.
+
+     pure subroutine stdlib_claic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(sp), intent(in) :: sest
+           real(sp), intent(out) :: sestpr
+           complex(sp), intent(out) :: c, s
+           complex(sp), intent(in) :: gamma
+           ! Array Arguments 
+           complex(sp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: absalp, absest, absgam, b, eps, norma, s1, s2, scl, t, test, tmp, zeta1, &
+                     zeta2
+           complex(sp) :: alpha, cosine, sine
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,sqrt
+           ! Executable Statements 
+           eps = stdlib_slamch( 'EPSILON' )
+           alpha = stdlib_cdotc( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=sp)
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s2*scl
+                    s = ( alpha / s2 ) / scl
+                    c = ( gamma / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s1*scl
+                    s = ( alpha / s1 ) / scl
+                    c = ( gamma / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = real( c / ( b+sqrt( b*b+c ) ),KIND=sp)
+                 else
+                    t = real( sqrt( b*b+c ) - b,KIND=sp)
+                 end if
+                 sine = -( alpha / absest ) / t
+                 cosine = -( gamma / absest ) / ( one+t )
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=sp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -conjg( gamma )
+                    cosine = conjg( alpha )
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=sp)
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / scl )
+                    s = -( conjg( gamma ) / s2 ) / scl
+                    c = ( conjg( alpha ) / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest / scl
+                    s = -( conjg( gamma ) / s1 ) / scl
+                    c = ( conjg( alpha ) / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 norma = max( one+zeta1*zeta1+zeta1*zeta2,zeta1*zeta2+zeta2*zeta2 )
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = real( c / ( b+sqrt( abs( b*b-c ) ) ),KIND=sp)
+                    sine = ( alpha / absest ) / ( one-t )
+                    cosine = -( gamma / absest ) / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = real( -c / ( b+sqrt( b*b+c ) ),KIND=sp)
+                    else
+                       t = real( b - sqrt( b*b+c ),KIND=sp)
+                    end if
+                    sine = -( alpha / absest ) / t
+                    cosine = -( gamma / absest ) / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=sp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_claic1
+
+     !> CLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_clapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           complex(sp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_clapmr
+
+     !> CLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_clapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, in
+           complex(sp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 100
+                 k( i ) = -k( i )
+                 j = k( i )
+                 80 continue
+                 if( j==i )go to 100
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 80
+                 100 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_clapmt
+
+     !> CLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_claqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*), r(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_claqgb
+
+     !> CLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_claqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*), r(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_claqge
+
+     !> CLAQHB: equilibrates an Hermitian band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_claqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(out) :: s(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j - 1
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                    ab( kd+1, j ) = cj*cj*real( ab( kd+1, j ),KIND=sp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    ab( 1, j ) = cj*cj*real( ab( 1, j ),KIND=sp)
+                    do i = j + 1, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqhb
+
+     !> CLAQHE: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_claqhe( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=sp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=sp)
+                    do i = j + 1, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqhe
+
+     !> CLAQHP: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_claqhp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    ap( jc+j-1 ) = cj*cj*real( ap( jc+j-1 ),KIND=sp)
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    ap( jc ) = cj*cj*real( ap( jc ),KIND=sp)
+                    do i = j + 1, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqhp
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, CLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - s1*I)*(H - s2*I)
+     !> scaling to avoid overflows and most underflows.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_claqr1( n, h, ldh, s1, s2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(sp), intent(in) :: s1, s2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: h(ldh,*)
+           complex(sp), intent(out) :: v(*)
+        ! ================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0_sp
+           
+           
+           ! Local Scalars 
+           complex(sp) :: cdum, h21s, h31s
+           real(sp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) )
+              if( s==rzero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 )
+              end if
+           else
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) ) +cabs1( h( 3, 1 ) )
+              if( s==czero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+                 v( 3 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s ) +h( 1, 2 )*h21s + h( 1, 3 )&
+                           *h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 ) + h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-s1-s2 ) + h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_claqr1
+
+     !> CLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_claqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqsb
+
+     !> CLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_claqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqsp
+
+     !> CLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_claqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_claqsy
+
+     !> CLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_clar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(sp), intent(in) :: gaptol, lambda, pivmin
+           real(sp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(sp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(sp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real
+           ! Executable Statements 
+           eps = stdlib_slamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_sisnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_sisnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_sisnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = cone
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + real( z( i )*z( i ),KIND=sp)
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + real( z( i )*z( i ),KIND=sp)
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=sp)
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=sp)
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_clar1v
+
+     !> CLAR2V: applies a vector of complex plane rotations with real cosines
+     !> from both sides to a sequence of 2-by-2 complex Hermitian matrices,
+     !> defined by the elements of the vectors x, y and z. For i = 1,2,...,n
+     !> (       x(i)  z(i) ) :=
+     !> ( conjg(z(i)) y(i) )
+     !> (  c(i) conjg(s(i)) ) (       x(i)  z(i) ) ( c(i) -conjg(s(i)) )
+     !> ( -s(i)       c(i)  ) ( conjg(z(i)) y(i) ) ( s(i)        c(i)  )
+
+     pure subroutine stdlib_clar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*)
+           complex(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(sp) :: ci, sii, sir, t1i, t1r, t5, t6, xi, yi, zii, zir
+           complex(sp) :: si, t2, t3, t4, zi
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,conjg,real
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = real( x( ix ),KIND=sp)
+              yi = real( y( ix ),KIND=sp)
+              zi = z( ix )
+              zir = real( zi,KIND=sp)
+              zii = aimag( zi )
+              ci = c( ic )
+              si = s( ic )
+              sir = real( si,KIND=sp)
+              sii = aimag( si )
+              t1r = sir*zir - sii*zii
+              t1i = sir*zii + sii*zir
+              t2 = ci*zi
+              t3 = t2 - conjg( si )*xi
+              t4 = conjg( t2 ) + si*yi
+              t5 = ci*xi + t1r
+              t6 = ci*yi - t1r
+              x( ix ) = ci*t5 + ( sir*real( t4,KIND=sp)+sii*aimag( t4 ) )
+              y( ix ) = ci*t6 - ( sir*real( t3,KIND=sp)-sii*aimag( t3 ) )
+              z( ix ) = ci*t3 + conjg( si )*cmplx( t6, t1i,KIND=sp)
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_clar2v
+
+     !> CLARCM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by M and real; B is M by N and complex;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_clarcm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: b(ldb,*)
+           complex(sp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,real
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( b( i, j ),KIND=sp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_sgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( b( i, j ) )
+              end do
+           end do
+           call stdlib_sgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=sp),rwork( l+( j-1 )*m+i-1 ),KIND=sp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_clarcm
+
+     !> CLARF: applies a complex elementary reflector H to a complex M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+     !> tau.
+
+     pure subroutine stdlib_clarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           complex(sp), intent(in) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(in) :: v(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=czero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-czero row in v.
+              do while( lastv>0 .and. v( i )==czero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-czero column in c(1:lastv,:).
+                 lastc = stdlib_ilaclc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-czero row in c(:,1:lastv).
+                 lastc = stdlib_ilaclr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_sp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**h * v(1:lastv,1)
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', lastv, lastc, cone,c, ldc, v, incv, &
+                           czero, work, 1 )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**h
+                 call stdlib_cgerc( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_cgemv( 'NO TRANSPOSE', lastc, lastv, cone, c, ldc,v, incv, czero, &
+                           work, 1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**h
+                 call stdlib_cgerc( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_clarf
+
+     !> CLARFB: applies a complex block reflector H or its transpose H**H to a
+     !> complex M-by-N matrix C, from either the left or the right.
+
+     pure subroutine stdlib_clarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_ccopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_clacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h *v2
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c( k+1, 1 ), ldc,v( k+1, 1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v( k+1, 1 ), ldv, work,ldwork, cone, c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_ccopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v( k+1, 1 ),ldv, cone, c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                        ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_ccopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_clacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( m-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v, ldv, work, ldwork,cone, c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( m-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_ccopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( n-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c, ldc, &
+                                 v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v, ldv, cone,c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( n-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_ccopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_clacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h * v2**h
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone,c( k+1, 1 ), ldc, v( 1, k+1 ), ldv, cone,work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2**h * w**h
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone,v( 1, k+1 ), ldv, work, ldwork, cone,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_ccopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c( 1, k+1 ), ldc,v( 1, k+1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v( 1, k+1 ), ldv, cone,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_ccopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_clacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( 1, m-k+1 ), ldv, work,ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1**h
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone, c,ldc, v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1**h * w**h
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone, v,ldv, work, ldwork, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_ccopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( 1, n-k+1 ), ldv, work,ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**h
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v, ldv, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_clarfb
+
+     !> CLARFB_GETT: applies a complex Householder block reflector H from the
+     !> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_clarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(in) :: t(ldt,*)
+           complex(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_ccopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**h) * w2 = (a1**h) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_ctrmm( 'L', 'L', 'C', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**h) * b2 = w2 + (b1**h) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_cgemm( 'C', 'N', k, n-k, m, cone, b, ldb,b( 1, k+1 ), ldb, cone, &
+                           work, ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_ctrmm( 'L', 'U', 'N', 'N', k, n-k, cone, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_cgemm( 'N', 'N', m, n-k, k, -cone, b, ldb,work, ldwork, cone, b( 1, &
+                           k+1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_ctrmm( 'L', 'L', 'N', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_ccopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = czero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**h) * w1 = (a1**h) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_ctrmm( 'L', 'L', 'C', 'U', k, k, cone, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_ctrmm( 'L', 'U', 'N', 'N', k, k, cone, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', m, k, -cone, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_ctrmm( 'L', 'L', 'N', 'U', k, k, cone, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_clarfb_gett
+
+     !> CLARFG: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, with beta real, and x is an
+     !> (n-1)-element complex vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+     !> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+
+     pure subroutine stdlib_clarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(sp), intent(inout) :: alpha
+           complex(sp), intent(out) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(sp) :: alphi, alphr, beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,real,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_scnrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=sp)
+           alphi = aimag( alpha )
+           if( xnorm==zero .and. alphi==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_slapy3( alphr, alphi, xnorm ), alphr )
+              safmin = stdlib_slamch( 'S' ) / stdlib_slamch( 'E' )
+              rsafmn = one / safmin
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_csscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alphi = alphi*rsafmn
+                 alphr = alphr*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_scnrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=sp)
+                 beta = -sign( stdlib_slapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              tau = cmplx( ( beta-alphr ) / beta, -alphi / beta,KIND=sp)
+              alpha = stdlib_cladiv( cmplx( one,KIND=sp), alpha-beta )
+              call stdlib_cscal( n-1, alpha, x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_clarfg
+
+     !> CLARFGP: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is real and non-negative, and
+     !> x is an (n-1)-element complex vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+
+     subroutine stdlib_clarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(sp), intent(inout) :: alpha
+           complex(sp), intent(out) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(sp) :: alphi, alphr, beta, bignum, smlnum, xnorm
+           complex(sp) :: savealpha
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,real,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_scnrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=sp)
+           alphi = aimag( alpha )
+           if( xnorm==zero ) then
+              ! h  =  [1-alpha/abs(alpha) 0; 0 i], sign chosen so alpha >= 0.
+              if( alphi==zero ) then
+                 if( alphr>=zero ) then
+                    ! when tau.eq.zero, the vector is special-cased to be
+                    ! all zeros in the application routines.  we do not need
+                    ! to clear it.
+                    tau = zero
+                 else
+                    ! however, the application routines rely on explicit
+                    ! zero checks when tau.ne.zero, and we must clear x.
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    alpha = -alpha
+                 end if
+              else
+                 ! only "reflecting" the diagonal entry to be real and non-negative.
+                 xnorm = stdlib_slapy2( alphr, alphi )
+                 tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=sp)
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = zero
+                 end do
+                 alpha = xnorm
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_slapy3( alphr, alphi, xnorm ), alphr )
+              smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'E' )
+              bignum = one / smlnum
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_csscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alphi = alphi*bignum
+                 alphr = alphr*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_scnrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=sp)
+                 beta = sign( stdlib_slapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alphr = alphi * (alphi/real( alpha,KIND=sp))
+                 alphr = alphr + xnorm * (xnorm/real( alpha,KIND=sp))
+                 tau = cmplx( alphr/beta, -alphi/beta,KIND=sp)
+                 alpha = cmplx( -alphr, alphi,KIND=sp)
+              end if
+              alpha = stdlib_cladiv( cmplx( one,KIND=sp), alpha )
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero (or two or whatever makes a nonnegative real number for beta).
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 alphr = real( savealpha,KIND=sp)
+                 alphi = aimag( savealpha )
+                 if( alphi==zero ) then
+                    if( alphr>=zero ) then
+                       tau = zero
+                    else
+                       tau = two
+                       do j = 1, n-1
+                          x( 1 + (j-1)*incx ) = zero
+                       end do
+                       beta = real( -savealpha,KIND=sp)
+                    end if
+                 else
+                    xnorm = stdlib_slapy2( alphr, alphi )
+                    tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=sp)
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    beta = xnorm
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_cscal( n-1, alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_clarfgp
+
+     !> CLARFT: forms the triangular factor T of a complex block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+
+     pure subroutine stdlib_clarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(sp), intent(out) :: t(ldt,*)
+           complex(sp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( prevlastv, i )
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * conjg( v( i , j ) )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**h * v(i:j,i)
+                       call stdlib_cgemv( 'CONJUGATE TRANSPOSE', j-i, i-1,-tau( i ), v( i+1, 1 ), &
+                                 ldv,v( i+1, i ), 1,cone, t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**h
+                       call stdlib_cgemm( 'N', 'C', i-1, 1, j-i, -tau( i ),v( 1, i+1 ), ldv, v( i,&
+                                  i+1 ), ldv,cone, t( 1, i ), ldt )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_ctrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                 end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * conjg( v( n-k+i , j ) )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**h * v(j:n-k+i,i)
+                          call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k+i-j, k-i,-tau( i ), v( j, &
+                                    i+1 ), ldv, v( j, i ),1, cone, t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**h
+                          call stdlib_cgemm( 'N', 'C', k-i, 1, n-k+i-j, -tau( i ),v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,cone, t( i+1, i ), ldt )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_ctrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_clarft
+
+     !> CLARFX: applies a complex elementary reflector H to a complex m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_clarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           complex(sp), intent(in) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(in) :: v(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           complex(sp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, &
+                     v6, v7, v8, v9
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( tau==czero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_clarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              v10 = conjg( v( 10 ) )
+              t10 = tau*conjg( v10 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_clarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              v10 = v( 10 )
+              t10 = tau*conjg( v10 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+       410 return
+     end subroutine stdlib_clarfx
+
+     !> CLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n Hermitian matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_clarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           complex(sp), intent(in) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(in) :: v(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: alpha
+           ! Executable Statements 
+           if( tau==czero )return
+           ! form  w:= c * v
+           call stdlib_chemv( uplo, n, cone, c, ldc, v, incv, czero, work, 1 )
+           alpha = -chalf*tau*stdlib_cdotc( n, work, 1, v, incv )
+           call stdlib_caxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_cher2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_clarfy
+
+     !> CLARNV: returns a vector of n random complex numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_clarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           complex(sp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(sp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, iv
+           ! Local Arrays 
+           real(sp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,exp,log,min,sqrt
+           ! Executable Statements 
+           do 60 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              ! call stdlib_slaruv to generate 2*il realnumbers from a uniform (0,1,KIND=sp)
+              ! distribution (2*il <= lv)
+              call stdlib_slaruv( iseed, 2*il, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( u( 2*i-1 ), u( 2*i ),KIND=sp)
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( two*u( 2*i-1 )-one,two*u( 2*i )-one,KIND=sp)
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*exp( cmplx( zero, twopi*u( 2*i ),&
+                              KIND=sp) )
+                 end do
+              else if( idist==4 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit disk
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( u( 2*i-1 ) )*exp( cmplx( zero, twopi*u( 2*i ),KIND=sp) )
+                              
+                 end do
+              else if( idist==5 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit circle
+                 do i = 1, il
+                    x( iv+i-1 ) = exp( cmplx( zero, twopi*u( 2*i ),KIND=sp) )
+                 end do
+              end if
+              60 continue
+           return
+     end subroutine stdlib_clarnv
+
+     !> !
+     !>
+     !> CLARTG: generates a plane rotation so that
+     !> [  C         S  ] . [ F ]  =  [ R ]
+     !> [ -conjg(S)  C  ]   [ G ]     [ 0 ]
+     !> where C is real and C**2 + |S|**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> sgn(x) = {  x / |x|,   x != 0
+     !> {  1,         x = 0
+     !> R = sgn(F) * sqrt(|F|**2 + |G|**2)
+     !> C = |F| / sqrt(|F|**2 + |G|**2)
+     !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2)
+     !> When F and G are real, the formulas simplify to C = F/R and
+     !> S = G/R, and the returned values of C, S, and R should be
+     !> identical to those returned by CLARTG.
+     !> The algorithm used to compute these quantities incorporates scaling
+     !> to avoid overflow or underflow in computing the square root of the
+     !> sum of squares.
+     !> This is a faster version of the BLAS1 routine CROTG, except for
+     !> the following differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0, then C=0 and S is chosen so that R is real.
+     !> Below, wp=>sp stands for single precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_clartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(sp), intent(out) :: c
+        complex(sp), intent(in) :: f, g
+        complex(sp), intent(out) :: r, s
+        ! Local Scalars 
+        real(sp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(sp) :: fs, gs, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(sp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=sp)**2 + aimag( t )**2
+        ! Executable Statements 
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=sp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=sp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=sp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        return
+     end subroutine stdlib_clartg
+
+     !> CLARTV: applies a vector of complex plane rotations with real cosines
+     !> to elements of the complex vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (        c(i)   s(i) ) ( x(i) )
+     !> ( y(i) )    ( -conjg(s(i))  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_clartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*)
+           complex(sp), intent(in) :: s(*)
+           complex(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           complex(sp) :: xi, yi
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - conjg( s( ic ) )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_clartv
+
+     !> CLARZ: applies a complex elementary reflector H to a complex
+     !> M-by-N matrix C, from either the left or the right. H is represented
+     !> in the form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+     !> tau.
+     !> H is a product of k elementary reflectors as returned by CTZRZF.
+
+     pure subroutine stdlib_clarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           complex(sp), intent(in) :: tau
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(in) :: v(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=czero ) then
+                 ! w( 1:n ) = conjg( c( 1, 1:n ) )
+                 call stdlib_ccopy( n, c, ldc, work, 1 )
+                 call stdlib_clacgv( n, work, 1 )
+                 ! w( 1:n ) = conjg( w( 1:n ) + c( m-l+1:m, 1:n )**h * v( 1:l ) )
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', l, n, cone, c( m-l+1, 1 ),ldc, v, incv,&
+                            cone, work, 1 )
+                 call stdlib_clacgv( n, work, 1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_caxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**h
+                 call stdlib_cgeru( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=czero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_ccopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_cgemv( 'NO TRANSPOSE', m, l, cone, c( 1, n-l+1 ), ldc,v, incv, cone, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_caxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**h
+                 call stdlib_cgerc( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_clarz
+
+     !> CLARZB: applies a complex block reflector H or its transpose H**H
+     !> to a complex distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_clarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           complex(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**h * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**h
+              do j = 1, k
+                 call stdlib_ccopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**h * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_cgemm( 'TRANSPOSE', 'CONJUGATE TRANSPOSE', n, k, l,cone, c( m-&
+                        l+1, 1 ), ldc, v, ldv, cone, work,ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_ctrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, cone, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**h
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**h * w( 1:n, 1:k )**h
+              if( l>0 )call stdlib_cgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -cone, v, ldv,work, &
+                        ldwork, cone, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**h
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_ccopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**h
+              if( l>0 )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, cone,c( 1, n-l+1 )&
+                        , ldc, v, ldv, cone, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * conjg( t )  or
+                              ! w( 1:m, 1:k ) * t**h
+              do j = 1, k
+                 call stdlib_clacgv( k-j+1, t( j, j ), 1 )
+              end do
+              call stdlib_ctrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, cone, t,ldt, work, &
+                        ldwork )
+              do j = 1, k
+                 call stdlib_clacgv( k-j+1, t( j, j ), 1 )
+              end do
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * conjg( v( 1:k, 1:l ) )
+              do j = 1, l
+                 call stdlib_clacgv( k, v( 1, j ), 1 )
+              end do
+              if( l>0 )call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -cone,work, &
+                        ldwork, v, ldv, cone, c( 1, n-l+1 ), ldc )
+              do j = 1, l
+                 call stdlib_clacgv( k, v( 1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_clarzb
+
+     !> CLARZT: forms the triangular factor T of a complex block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_clarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(sp), intent(out) :: t(ldt,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==czero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = czero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**h
+                    call stdlib_clacgv( n, v( i, 1 ), ldv )
+                    call stdlib_cgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, czero,t( i+1, i ), 1 )
+                    call stdlib_clacgv( n, v( i, 1 ), ldv )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_ctrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_clarzt
+
+     !> CLASCL: multiplies the M by N complex matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_clascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(sp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(sp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_sisnan(cfrom) ) then
+              info = -4
+           else if( stdlib_sisnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower chalf of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper chalf of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_clascl
+
+     !> CLASET: initializes a 2-D array A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_claset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           complex(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(sp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the diagonal to beta and the strictly upper triangular
+              ! part of the array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the diagonal to beta and the strictly lower triangular
+              ! part of the array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else
+              ! set the array to beta on the diagonal and alpha on the
+              ! offdiagonal.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( m, n )
+                 a( i, i ) = beta
+              end do
+           end if
+           return
+     end subroutine stdlib_claset
+
+     !> CLASR: applies a sequence of real plane rotations to a complex matrix
+     !> A, from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_clasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*), s(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(sp) :: ctemp, stemp
+           complex(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_clasr
+
+     !> !
+     !>
+     !> CLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_classq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(sp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        complex(sp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(sp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=sp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_classq
+
+     !> CLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_claswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           complex(sp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_claswp
+
+     !> CLASYF: computes a partial factorization of a complex symmetric matrix
+     !> A using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) ( D    0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) ( 0   A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**T denotes the transpose of U.
+     !> CLASYF is an auxiliary routine called by CSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(sp) :: absakk, alpha, colmax, rowmax
+           complex(sp) :: d11, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_ccopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_ccopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w(&
+                               imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_ccopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_ccopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = cone / a( k, k )
+                    call stdlib_cscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       d21 = t / d21
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_cswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_ccopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_ccopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_ccopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_ccopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = cone / a( k, k )
+                       call stdlib_cscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_cswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clasyf
+
+     !> CLASYF_RK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> CLASYF_RK is an auxiliary routine called by CSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, rowmax, sfmin, stemp
+           complex(sp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_ccopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_ccopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = cabs1( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_ccopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_ccopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_cswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_cswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_ccopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_ccopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_cswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_cscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_ccopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_ccopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = cabs1( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_ccopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_ccopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_cswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_cswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_ccopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_ccopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_cswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_cscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clasyf_rk
+
+     !> CLASYF_ROOK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> CLASYF_ROOK is an auxiliary routine called by CSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_clasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(sp) :: absakk, alpha, colmax, rowmax, stemp, sfmin
+           complex(sp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_ccopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_ccopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_ccopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_cgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = cabs1( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_ccopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_ccopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_cswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_cswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_ccopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_ccopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_cswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_cswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_ccopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_cscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_cswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_cswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_ccopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_ccopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_ccopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_cgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = cabs1( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_ccopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_ccopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_ccopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_cswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_cswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_ccopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_ccopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_cswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_cswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_ccopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_cscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_cgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ), ldw,cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_cswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_cswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_clasyf_rook
+
+     !> CLATBS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine CTBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_clatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(inout) :: cnorm(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(sp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(sp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=sp) / 2. ) +abs( aimag( zdum ) / 2. )
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_scasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_scasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ctbsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ctbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_csscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 105
+                    end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_csscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       105 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_csscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_caxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_icamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_caxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_icamax( n-j, x( j+1 ), 1 )
+                       xmax = cabs1( x( i ) )
+                    end if
+                 end do loop_110
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_150: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotu to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_cdotu( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_cdotu( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 145
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          145 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_150
+              else
+                 ! solve a**h * x = b
+                 loop_190: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotc to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_cdotc( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_cdotc( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( kd+i-jlen, j ) )*uscal )*x( j-jlen-1+i )
+                                       
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( i+1, j ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 185
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          185 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_190
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_clatbs
+
+     !> CLATPS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, A**H denotes the conjugate transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine CTPSV is called. If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_clatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(inout) :: cnorm(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(sp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(sp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=sp) / 2. ) +abs( aimag( zdum ) / 2. )
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_scasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_scasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ctpsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ctpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_csscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 105
+                    end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_csscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       105 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_csscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_caxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_icamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_caxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_icamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_110
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_150: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_cdotu( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_cdotu( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 145
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          145 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_150
+              else
+                 ! solve a**h * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_190: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_cdotc( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_cdotc( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( ap( ip-j+i ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( conjg( ap( ip+i ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 185
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          185 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_190
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_clatps
+
+     !> CLATRD: reduces NB rows and columns of a complex Hermitian matrix A to
+     !> Hermitian tridiagonal form by a unitary similarity
+     !> transformation Q**H * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', CLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', CLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by CHETRD.
+
+     pure subroutine stdlib_clatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(sp), intent(out) :: e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min,real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    a( i, i ) = real( a( i, i ),KIND=sp)
+                    call stdlib_clacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_cgemv( 'NO TRANSPOSE', i, n-i, -cone, a( 1, i+1 ),lda, w( i, iw+1 &
+                              ), ldw, cone, a( 1, i ), 1 )
+                    call stdlib_clacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_cgemv( 'NO TRANSPOSE', i, n-i, -cone, w( 1, iw+1 ),ldw, a( i, i+1 &
+                              ), lda, cone, a( 1, i ), 1 )
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i ) = real( a( i, i ),KIND=sp)
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    alpha = a( i-1, i )
+                    call stdlib_clarfg( i-1, alpha, a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = real( alpha,KIND=sp)
+                    a( i-1, i ) = cone
+                    ! compute w(1:i-1,i)
+                    call stdlib_chemv( 'UPPER', i-1, cone, a, lda, a( 1, i ), 1,czero, w( 1, iw ),&
+                               1 )
+                    if( i<n ) then
+                       call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,w( 1, iw+1 ), ldw,&
+                                  a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_cgemv( 'NO TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                       call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,a( 1, i+1 ), lda, &
+                                 a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_cgemv( 'NO TRANSPOSE', i-1, n-i, -cone,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_cscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -chalf*tau( i-1 )*stdlib_cdotc( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_caxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              loop_20: do i = 1, nb
+                 ! update a(i:n,i)
+                 a( i, i ) = real( a( i, i ),KIND=sp)
+                 call stdlib_clacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, cone, a( i, i ), 1 )
+                 call stdlib_clacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, cone, a( i, i ), 1 )
+                 call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                 a( i, i ) = real( a( i, i ),KIND=sp)
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    alpha = a( i+1, i )
+                    call stdlib_clarfg( n-i, alpha, a( min( i+2, n ), i ), 1,tau( i ) )
+                    e( i ) = real( alpha,KIND=sp)
+                    a( i+1, i ) = cone
+                    ! compute w(i+1:n,i)
+                    call stdlib_chemv( 'LOWER', n-i, cone, a( i+1, i+1 ), lda,a( i+1, i ), 1, &
+                              czero, w( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,w( i+1, 1 ), ldw, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-i, i-1, -cone, a( i+1, 1 ),lda, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-i, i-1, -cone, w( i+1, 1 ),ldw, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_cscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -chalf*tau( i )*stdlib_cdotc( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_caxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_clatrd
+
+     !> CLATRS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, A**H denotes the
+     !> conjugate transpose of A, x and b are n-element vectors, and s is a
+     !> scaling factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> CTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_clatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(inout) :: cnorm(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(sp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(sp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=sp) / 2. ) +abs( aimag( zdum ) / 2. )
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_scasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_scasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ctrsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ctrsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_csscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 105
+                    end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_csscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          xj = cabs1( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       105 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_csscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_caxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_icamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_caxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_icamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    end if
+                 end do loop_110
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_150: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_cdotu( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_cdotu( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 145
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          145 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_150
+              else
+                 ! solve a**h * x = b
+                 loop_190: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = cabs1( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = stdlib_cladiv( uscal, tjjs )
+                          end if
+                       if( rec<one ) then
+                          call stdlib_csscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=sp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_cdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_cdotc( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_cdotc( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=sp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 185
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjj = cabs1( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_csscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_csscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = stdlib_cladiv( x( j ), tjjs )
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          185 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_cladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_190
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_clatrs
+
+     !> CLATRZ: factors the M-by-(M+L) complex upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z by means
+     !> of unitary transformations, where  Z is an (M+L)-by-(M+L) unitary
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_clatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_clacgv( l, a( i, n-l+1 ), lda )
+              alpha = conjg( a( i, i ) )
+              call stdlib_clarfg( l+1, alpha, a( i, n-l+1 ), lda, tau( i ) )
+              tau( i ) = conjg( tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_clarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,conjg( tau( i ) ), a( &
+                        1, i ), lda, work )
+              a( i, i ) = conjg( alpha )
+           end do
+           return
+     end subroutine stdlib_clatrz
+
+     !> CLAUNHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> CLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine CLAUNHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, CLAUNHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without CLAUNHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_claunhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           complex(sp) :: z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAUNHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=sp) ),KIND=sp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=sp) ),KIND=sp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_slamch('S')
+              ! construct the subdiagonal elements of l
+              if( cabs1( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_cscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_claunhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_ctrsm( 'R', 'U', 'N', 'N', m-n1, n1, cone, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_ctrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_cgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_claunhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_claunhr_col_getrfnp2
+
+     !> CLAUU2: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_clauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**h.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=sp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_cdotc( n-i, a( i, i+1 ), lda,a( i, i+1 ), &
+                              lda ),KIND=sp)
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_cgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, cmplx( aii,KIND=sp),a( 1, i ), 1 )
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                 else
+                    call stdlib_csscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**h * l.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=sp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_cdotc( n-i, a( i+1, i ), 1,a( i+1, i ), 1 )&
+                              ,KIND=sp)
+                    call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1,cmplx( aii,KIND=sp), a( i, 1 ), lda )
+                    call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                 else
+                    call stdlib_csscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_clauu2
+
+     !> CLAUUM: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_clauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_clauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**h.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', i-1, &
+                              ib, cone, a( i, i ), lda,a( 1, i ), lda )
+                    call stdlib_clauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',i-1, ib, n-i-ib+1,&
+                                  cone, a( 1, i+ib ),lda, a( i, i+ib ), lda, cone, a( 1, i ),lda )
+                       call stdlib_cherk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**h * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_ctrmm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', ib, i-1,&
+                               cone, a( i, i ), lda,a( i, 1 ), lda )
+                    call stdlib_clauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', ib,i-1, n-i-ib+1,&
+                                  cone, a( i+ib, i ), lda,a( i+ib, 1 ), lda, cone, a( i, 1 ), lda )
+                       call stdlib_cherk( 'LOWER', 'CONJUGATE TRANSPOSE', ib,n-i-ib+1, one, a( i+&
+                                 ib, i ), lda, one,a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_clauum
+
+     !> CPBEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_cpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(out) :: s(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ab( j, 1 ),KIND=sp)
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = real( ab( j, i ),KIND=sp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_cpbequ
+
+     !> CPBSTF: computes a split Cholesky factorization of a complex
+     !> Hermitian positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with CHBGST.
+     !> The factorization has the form  A = S**H*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_cpbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_csscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_cher( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_csscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_clacgv( km, ab( kd, j+1 ), kld )
+                    call stdlib_cher( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_clacgv( km, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_csscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_clacgv( km, ab( km+1, j-km ), kld )
+                 call stdlib_cher( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+                 call stdlib_clacgv( km, ab( km+1, j-km ), kld )
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_csscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_cher( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_cpbstf
+
+     !> CPBTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**H is the conjugate transpose
+     !> of U, and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_cpbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_csscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_clacgv( kn, ab( kd, j+1 ), kld )
+                    call stdlib_cher( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_clacgv( kn, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_csscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_cher( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_cpbtf2
+
+     !> CPBTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**H*U or A = L*L**H computed by CPBTRF.
+
+     pure subroutine stdlib_cpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              do j = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**h.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**h *x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_cpbtrs
+
+     !> CPOEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_cpoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(out) :: s(*)
+           complex(sp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=sp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=sp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_cpoequ
+
+     !> CPOEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from CPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_cpoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_slamch( 'B' )
+           tmp = -0.5_sp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=sp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=sp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_cpoequb
+
+     !> CPOTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_cpotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( real( a( j, j ),KIND=sp) - stdlib_cdotc( j-1, a( 1, j ), 1,a( 1, j ),&
+                            1 ),KIND=sp)
+                 if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_cgemv( 'TRANSPOSE', j-1, n-j, -cone, a( 1, j+1 ),lda, a( 1, j ), &
+                              1, cone, a( j, j+1 ), lda )
+                    call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_csscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( real( a( j, j ),KIND=sp) - stdlib_cdotc( j-1, a( j, 1 ), lda,a( j, 1 &
+                           ), lda ),KIND=sp)
+                 if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 )&
+                              , lda, cone, a( j+1, j ), 1 )
+                    call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_csscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_cpotf2
+
+     !> CPOTRF2: computes the Cholesky factorization of a Hermitian
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then calls itself to factor A22.
+
+     pure recursive subroutine stdlib_cpotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              ajj = real( a( 1, 1 ),KIND=sp)
+              if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( ajj )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_cpotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**h*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_ctrsm( 'L', 'U', 'C', 'N', n1, n2, cone,a( 1, 1 ), lda, a( 1, n1+1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_cherk( uplo, 'C', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_cpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**h
+              else
+                 ! update and scale a21
+                 call stdlib_ctrsm( 'R', 'L', 'C', 'N', n2, n1, cone,a( 1, 1 ), lda, a( n1+1, 1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_cherk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_cpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_cpotrf2
+
+     !> CPOTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H*U or A = L*L**H computed by CPOTRF.
+
+     pure subroutine stdlib_cpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              ! solve u**h *x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a*x = b where a = l*l**h.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+              ! solve l**h *x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+           end if
+           return
+     end subroutine stdlib_cpotrs
+
+     !> CPPEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_cppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(out) :: s(*)
+           complex(sp), intent(in) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ap( 1 ),KIND=sp)
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = real( ap( jj ),KIND=sp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = real( ap( jj ),KIND=sp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_cppequ
+
+     !> CPPTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_cpptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_ctpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',j-1, ap, &
+                           ap( jc ), 1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( real( ap( jj ),KIND=sp) - stdlib_cdotc( j-1,ap( jc ), 1, ap( jc ), 1 &
+                           ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l * l**h.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ap( jj ),KIND=sp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_csscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_chpr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_cpptrf
+
+     !> CPPTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**H*U or A = L*L**H computed by CPPTRF.
+
+     pure subroutine stdlib_cpptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h * u.
+              do i = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_ctpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ctpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**h.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_ctpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**h *x = y, overwriting b with x.
+                 call stdlib_ctpsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_cpptrs
+
+     !> CPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_cpstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           real(sp) :: ajj, sstop, stemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           do i = 1, n
+              work( i ) = real( a( i, i ),KIND=sp)
+           end do
+           pvt = maxloc( work( 1:n ), 1 )
+           ajj = real( a( pvt, pvt ),KIND=sp)
+           if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 200
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              sstop = n * stdlib_slamch( 'EPSILON' ) * ajj
+           else
+              sstop = tol
+           end if
+           ! set first chalf of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**h * u
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),KIND=sp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=sp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_cswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_cswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    do i = j + 1, pvt - 1
+                       ctemp = conjg( a( j, i ) )
+                       a( j, i ) = conjg( a( i, pvt ) )
+                       a( i, pvt ) = ctemp
+                    end do
+                    a( j, pvt ) = conjg( a( j, pvt ) )
+                    ! swap dot products and piv
+                    stemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = stemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_cgemv( 'TRANS', j-1, n-j, -cone, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              cone, a( j, j+1 ), lda )
+                    call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_csscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_150
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+              loop_180: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),KIND=sp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=sp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_cswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_cswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    do i = j + 1, pvt - 1
+                       ctemp = conjg( a( i, j ) )
+                       a( i, j ) = conjg( a( pvt, i ) )
+                       a( pvt, i ) = ctemp
+                    end do
+                    a( pvt, j ) = conjg( a( pvt, j ) )
+                    ! swap dot products and piv
+                    stemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = stemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_cgemv( 'NO TRANS', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 ), &
+                              lda, cone, a( j+1, j ), 1 )
+                    call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_csscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_180
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_cpstf2
+
+     !> CPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_cpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           real(sp) :: ajj, sstop, stemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min,real,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'CPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_cpstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 230
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              do i = 1, n
+                 work( i ) = real( a( i, i ),KIND=sp)
+              end do
+              pvt = maxloc( work( 1:n ), 1 )
+              ajj = real( a( pvt, pvt ),KIND=sp)
+              if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 230
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 sstop = n * stdlib_slamch( 'EPSILON' ) * ajj
+              else
+                 sstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**h * u
+                 loop_160: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_150: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),&
+                                       KIND=sp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=sp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_cswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_cswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          do i = j + 1, pvt - 1
+                             ctemp = conjg( a( j, i ) )
+                             a( j, i ) = conjg( a( i, pvt ) )
+                             a( i, pvt ) = ctemp
+                          end do
+                          a( j, pvt ) = conjg( a( j, pvt ) )
+                          ! swap dot products and piv
+                          stemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = stemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_cgemv( 'TRANS', j-k, n-j, -cone, a( k, j+1 ),lda, a( k, j ),&
+                                     1, cone, a( j, j+1 ),lda )
+                          call stdlib_clacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_csscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_150
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_cherk( 'UPPER', 'CONJ TRANS', n-j+1, jb, -one,a( k, j ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_160
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+                 loop_210: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_200: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),&
+                                       KIND=sp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=sp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_cswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_cswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          do i = j + 1, pvt - 1
+                             ctemp = conjg( a( i, j ) )
+                             a( i, j ) = conjg( a( pvt, i ) )
+                             a( pvt, i ) = ctemp
+                          end do
+                          a( pvt, j ) = conjg( a( pvt, j ) )
+                          ! swap dot products and piv
+                          stemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = stemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_cgemv( 'NO TRANS', n-j, j-k, -cone,a( j+1, k ), lda, a( j, &
+                                    k ), lda, cone,a( j+1, j ), 1 )
+                          call stdlib_clacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_csscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_200
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_cherk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_210
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 230
+           220 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           230 continue
+           return
+     end subroutine stdlib_cpstrf
+
+     !> CPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**H or A = U**H*D*U computed by
+     !> CPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cptcon( n, d, e, anorm, rcond, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(sp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+           ! solve m(l) * x = e.
+           rwork( 1 ) = one
+           do i = 2, n
+              rwork( i ) = one + rwork( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**h * x = b.
+           rwork( n ) = rwork( n ) / d( n )
+           do i = n - 1, 1, -1
+              rwork( i ) = rwork( i ) / d( i ) + rwork( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_isamax( n, rwork, 1 )
+           ainvnm = abs( rwork( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_cptcon
+
+     !> CPTTRF: computes the L*D*L**H factorization of a complex Hermitian
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**H *D*U.
+
+     pure subroutine stdlib_cpttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*)
+           complex(sp), intent(inout) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(sp) :: eii, eir, f, g
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,mod,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'CPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**h (or u**h *d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 20
+              end if
+              eir = real( e( i ),KIND=sp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=sp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+           end do
+           loop_110: do i = i4+1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 20
+              end if
+              ! solve for e(i) and d(i+1).
+              eir = real( e( i ),KIND=sp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=sp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+              if( d( i+1 )<=zero ) then
+                 info = i+1
+                 go to 20
+              end if
+              ! solve for e(i+1) and d(i+2).
+              eir = real( e( i+1 ),KIND=sp)
+              eii = aimag( e( i+1 ) )
+              f = eir / d( i+1 )
+              g = eii / d( i+1 )
+              e( i+1 ) = cmplx( f, g,KIND=sp)
+              d( i+2 ) = d( i+2 ) - f*eir - g*eii
+              if( d( i+2 )<=zero ) then
+                 info = i+2
+                 go to 20
+              end if
+              ! solve for e(i+2) and d(i+3).
+              eir = real( e( i+2 ),KIND=sp)
+              eii = aimag( e( i+2 ) )
+              f = eir / d( i+2 )
+              g = eii / d( i+2 )
+              e( i+2 ) = cmplx( f, g,KIND=sp)
+              d( i+3 ) = d( i+3 ) - f*eir - g*eii
+              if( d( i+3 )<=zero ) then
+                 info = i+3
+                 go to 20
+              end if
+              ! solve for e(i+3) and d(i+4).
+              eir = real( e( i+3 ),KIND=sp)
+              eii = aimag( e( i+3 ) )
+              f = eir / d( i+3 )
+              g = eii / d( i+3 )
+              e( i+3 ) = cmplx( f, g,KIND=sp)
+              d( i+4 ) = d( i+4 ) - f*eir - g*eii
+           end do loop_110
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           20 continue
+           return
+     end subroutine stdlib_cpttrf
+
+     !> CPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H*D*U or A = L*D*L**H computed by CPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_cptts2( iuplo, n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: iuplo, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_csscal( nrhs, 1. / d( 1 ), b, ldb )
+              return
+           end if
+           if( iuplo==1 ) then
+              ! solve a * x = b using the factorization a = u**h *d*u,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 5 continue
+                 ! solve u**h * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                 end do
+                 ! solve d * u * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*e( i )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 5
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**h * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                    end do
+                    ! solve d * u * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+                    end do
+                 end do
+              end if
+           else
+              ! solve a * x = b using the factorization a = l*d*l**h,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 65 continue
+                 ! solve l * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                 end do
+                 ! solve d * l**h * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*conjg( e( i ) )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 65
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                    end do
+                    ! solve d * l**h * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) -b( i+1, j )*conjg( e( i ) )
+                    end do
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cptts2
+
+     !> CROT:   applies a plane rotation, where the cos (C) is real and the
+     !> sin (S) is complex, and the vectors CX and CY are complex.
+
+     pure subroutine stdlib_crot( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(sp), intent(in) :: c
+           complex(sp), intent(in) :: s
+           ! Array Arguments 
+           complex(sp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(sp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              stemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - conjg( s )*cx( ix )
+              cx( ix ) = stemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              stemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - conjg( s )*cx( i )
+              cx( i ) = stemp
+           end do
+           return
+     end subroutine stdlib_crot
+
+     !> CSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_cspmv( uplo, n, alpha, ap, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*), x(*)
+           complex(sp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           complex(sp) :: temp1, temp2
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 6
+           else if( incy==0 ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when ap contains the upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    k = kk
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do k = kk, kk + j - 2
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  y  when ap contains the lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*ap( kk )
+                    k = kk + 1
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                    kk = kk + ( n-j+1 )
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*ap( kk )
+                    ix = jx
+                    iy = jy
+                    do k = kk + 1, kk + n - j
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + ( n-j+1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cspmv
+
+     !> CSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_cspr( uplo, n, alpha, x, incx, ap )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, n
+           complex(sp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           complex(sp) :: temp
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when upper triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       k = kk
+                       do i = 1, j - 1
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( j )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do k = kk, kk + j - 2
+                          ap( k ) = ap( k ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( jx )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    jx = jx + incx
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       ap( kk ) = ap( kk ) + temp*x( j )
+                       k = kk + 1
+                       do i = j + 1, n
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    kk = kk + n - j + 1
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ap( kk ) = ap( kk ) + temp*x( jx )
+                       ix = jx
+                       do k = kk + 1, kk + n - j
+                          ix = ix + incx
+                          ap( k ) = ap( k ) + x( ix )*temp
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    jx = jx + incx
+                    kk = kk + n - j + 1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cspr
+
+     !> CSPTRF: computes the factorization of a complex symmetric matrix A
+     !> stored in packed format using the Bunch-Kaufman diagonal pivoting
+     !> method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_csptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(sp) :: absakk, alpha, colmax, rowmax
+           complex(sp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_cswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / ap( kc+k-1 )
+                    call stdlib_cspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_cscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / ap( kc )
+                       call stdlib_cspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_cscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_csptrf
+
+     !> CSPTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by CSPTRF.
+
+     pure subroutine stdlib_csptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           complex(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = cone / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_cdotu( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+k-1 )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_cdotu( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_cdotu( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_ccopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_cdotu( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_cswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = cone / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - stdlib_cdotu( n-k, work, 1, ap( kc+1 ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+1 )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_cdotu( n-k, work, 1, ap( kc+1 ),1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_cdotu( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_ccopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_cspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_cdotu( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_cswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_csptri
+
+     !> CSPTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by CSPTRF.
+
+     pure subroutine stdlib_csptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_cgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_cgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,ap( kc+k ), 1, cone, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc+1 ),&
+                               1, cone, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_csptrs
+
+     !> CSRSCL: multiplies an n-element complex vector x by the real scalar
+     !> 1/a.  This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_csrscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(sp), intent(in) :: sa
+           ! Array Arguments 
+           complex(sp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(sp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_csscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_csrscl
+
+     !> CSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+     !> Although the eigenvectors are real, they are stored in a complex
+     !> array, which may be passed to CUNMTR or CUPMTR for back
+     !> transformation to the eigenvectors of a complex Hermitian matrix
+     !> which was reduced to tridiagonal form.
+
+     pure subroutine stdlib_cstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(in) :: d(*), e(*), w(*)
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(out) :: z(ldz,*)
+       ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: odm3 = 1.0e-3_sp
+           real(sp), parameter :: odm1 = 1.0e-1_sp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, jr, nblk, nrmchk
+           real(sp) :: ctr, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, stpcrt, tol, xj, &
+                     xjm
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_slamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_slarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_180: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              stpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_170: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_180
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 140
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_slarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_scopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_scopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_scopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_slagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 120
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_sscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_slagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 110
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ctr = zero
+                       do jr = 1, blksiz
+                          ctr = ctr + work( indrv1+jr )*real( z( b1-1+jr, i ),KIND=sp)
+                       end do
+                       do jr = 1, blksiz
+                          work( indrv1+jr ) = work( indrv1+jr ) -ctr*real( z( b1-1+jr, i ),&
+                                    KIND=sp)
+                       end do
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 110 continue
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<stpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 130
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 120 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 130 continue
+                 scl = one / stdlib_snrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_sscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 140 continue
+                 do i = 1, n
+                    z( i, j ) = czero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = cmplx( work( indrv1+i ), zero,KIND=sp)
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_170
+           end do loop_180
+           return
+     end subroutine stdlib_cstein
+
+     !> CSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+
+     pure subroutine stdlib_csteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(sp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = cone
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_slamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_claset( 'FULL', n, n, czero, cone, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_slanst( 'I', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_slae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_slaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_clasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_slae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_slapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_slartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_clasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_slae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_slaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_clasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_slae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_slapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_slartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_clasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_slascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_slascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_slascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_slascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot==nmaxit ) then
+              do i = 1, n - 1
+                 if( e( i )/=zero )info = info + 1
+              end do
+              return
+           end if
+           go to 10
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_slasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_csteqr
+
+     !> CSYCONV: convert A given by TRF into L and D and vice-versa.
+     !> Get Non-diag elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_csyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           complex(sp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+            ! a is upper
+            ! convert a (a is upper)
+              ! convert value
+              if ( convert ) then
+                 i=n
+                 e(1)=czero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=czero
+                       a(i-1,i)=czero
+                       i=i-1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i-1
+                 end do
+              ! convert permutations
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0) then
+                    ip=ipiv(i)
+                    if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                    if( i < n) then
+                  do j= i+1,n
+                      temp=a(ip,j)
+                      a(ip,j)=a(i-1,j)
+                      a(i-1,j)=temp
+                  end do
+                     endif
+                     i=i-1
+                endif
+                i=i-1
+             end do
+              else
+            ! revert a (a is upper)
+              ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+              ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+            ! a is lower
+              if ( convert ) then
+            ! convert a (a is lower)
+              ! convert value
+                 i=1
+                 e(n)=czero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=czero
+                       a(i+1,i)=czero
+                       i=i+1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i+1
+                 end do
+              ! convert permutations
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                    ip=ipiv(i)
+                    if (i > 1) then
+                    do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i,j)
+                      a(i,j)=temp
+                    end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                   if (i > 1) then
+                   do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i+1,j)
+                      a(i+1,j)=temp
+                   end do
+                   endif
+                   i=i+1
+                endif
+                i=i+1
+             end do
+              else
+            ! revert a (a is lower)
+              ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+              ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_csyconv
+
+     !> If parameter WAY = 'C':
+     !> CSYCONVF: converts the factorization output format used in
+     !> CSYTRF provided on entry in parameter A into the factorization
+     !> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in CSYTRF into
+     !> the format used in CSYTRF_RK (or CSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> CSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in CSYTRF_RK
+     !> (or CSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in CSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in CSYTRF_RK
+     !> (or CSYTRF_BK) into the format used in CSYTRF.
+     !> CSYCONVF can also convert in Hermitian matrix case, i.e. between
+     !> formats used in CHETRF and CHETRF_RK (or CHETRF_BK).
+
+     pure subroutine stdlib_csyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_cswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_cswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_cswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_cswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_csyconvf
+
+     !> If parameter WAY = 'C':
+     !> CSYCONVF_ROOK: converts the factorization output format used in
+     !> CSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for CSYTRF_ROOK and
+     !> CSYTRF_RK (or CSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> CSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in CSYTRF_RK
+     !> (or CSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in CSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for CSYTRF_ROOK and
+     !> CSYTRF_RK (or CSYTRF_BK) is the same and is not converted.
+     !> CSYCONVF_ROOK can also convert in Hermitian matrix case, i.e. between
+     !> formats used in CHETRF_ROOK and CHETRF_RK (or CHETRF_BK).
+
+     pure subroutine stdlib_csyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_cswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_cswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_cswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_cswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_cswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_cswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_csyconvf_rook
+
+     !> CSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_csyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(sp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,int,log,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'CSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_sp / s( j )
+           end do
+           tol = one / sqrt( 2.0_sp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                  work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + real( s( i )*work( i ),KIND=sp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_classq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = real( n-2,KIND=sp) * ( real( work( i ),KIND=sp) - t*si )
+                 c0 = -(t*si)*si + 2 * real( work( i ),KIND=sp) * si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + real( work( i ),KIND=sp) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_slamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_slamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_csyequb
+
+     !> CSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_csymv( uplo, n, alpha, a, lda, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           complex(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), x(*)
+           complex(sp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           complex(sp) :: temp1, temp2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( lda<max( 1, n ) ) then
+              info = 5
+           else if( incx==0 ) then
+              info = 7
+           else if( incy==0 ) then
+              info = 10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when a is stored in upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + temp1*a( j, j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do i = 1, j - 1
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*a( j, j ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           else
+              ! form  y  when a is stored in lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*a( j, j )
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*a( j, j )
+                    ix = jx
+                    iy = jy
+                    do i = j + 1, n
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_csymv
+
+     !> CSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_csyr( uplo, n, alpha, x, incx, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, lda, n
+           complex(sp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, kx
+           complex(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           else if( lda<max( 1, n ) ) then
+              info = 7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when a is stored in upper triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           else
+              ! form  a  when a is stored in lower triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = jx
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_csyr
+
+     !> CSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_csyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(sp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_cswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_cswap ( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_csyswapr
+
+     !> CSYTF2: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_csytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(sp) :: absakk, alpha, colmax, rowmax
+           complex(sp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_sisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_cswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / a( k, k )
+                    call stdlib_csyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_cscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_sisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_icamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_cswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / a( k, k )
+                       call stdlib_csyr( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_cscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_csytf2
+
+     !> CSYTF2_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_csytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, rowmax, stemp, sfmin
+           complex(sp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_cswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_cswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_cswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_cswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_cswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_csyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_cscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_csyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_cswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_cswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_cswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_cswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_cswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_csyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_cscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_csyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_csytf2_rk
+
+     !> CSYTF2_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_csytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, rowmax, stemp, sfmin
+           complex(sp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=sp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_icamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_icamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_icamax( imax-1, a( 1, imax ), 1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_cswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_cswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_cswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_csyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_cscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_csyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_icamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_icamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_icamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = cabs1( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_cswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_cswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_cswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_csyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_cscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_csyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_csytf2_rook
+
+     !> CSYTRF: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_csytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clasyf( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_csytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_csytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csytrf
+
+     !> CSYTRF_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_csytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_csytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_cswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_csytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_cswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csytrf_rk
+
+     !> CSYTRF_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_csytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_csytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_csytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csytrf_rook
+
+     !> CSYTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> CSYTRF.
+
+     pure subroutine stdlib_csytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_cdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_ccopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_cdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_cswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_cdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_ccopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_cdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_cswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_csytri
+
+     !> CSYTRI_ROOK: computes the inverse of a complex symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by CSYTRF_ROOK.
+
+     pure subroutine stdlib_csytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_ccopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_cdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_ccopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_csymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_cdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_cswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_cswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_cswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_cswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_ccopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_cdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_cdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_ccopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_csymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_cdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_cswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_cswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_cswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_cswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_csytri_rook
+
+     !> CSYTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by CSYTRF.
+
+     pure subroutine stdlib_csytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,a( 1, k+1 ), 1, cone, b(&
+                            k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_csytrs
+
+     !> CSYTRS2: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by CSYTRF and converted by CSYCONV.
+
+     pure subroutine stdlib_csytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_csyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_ctrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_cscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_ctrsm('L','U','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_ctrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_cscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_ctrsm('L','L','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_csyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_csytrs2
+
+     !> CSYTRS_3: solves a system of linear equations A * X = B with a complex
+     !> symmetric matrix A using the factorization computed
+     !> by CSYTRF_RK or CSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_csytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), e(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_ctrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_cscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_ctrsm( 'L', 'U', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_ctrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_cscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_ctrsm('L', 'L', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_csytrs_3
+
+     !> CSYTRS_AA: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by CSYTRF_AA.
+
+     pure subroutine stdlib_csytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_ctrsm( 'L', 'U', 'T', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_clacpy( 'F', 1, n, a( 1, 1 ), lda+1, work( n ), 1)
+              if( n>1 ) then
+                 call stdlib_clacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                 call stdlib_clacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_cgtsv( n, nrhs, work( 1 ), work( n ), work( 2*n ), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_ctrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (u**t \ (t \ (u \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_ctrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_clacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                 call stdlib_clacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                 call stdlib_clacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_cgtsv( n, nrhs, work( 1 ), work(n), work( 2*n ), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute (l**t \ b) -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_ctrsm( 'L', 'L', 'T', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_csytrs_aa
+
+     !> CSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a complex symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by CSYTRF_ROOK.
+
+     pure subroutine stdlib_csytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_cgeru( k-2, nrhs,-cone, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_cgeru( k-2, nrhs,-cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 )&
+                              , ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, &
+                           cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, cone, &
+                              b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 ), 1, cone,&
+                               b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_cscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs,-cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs,-cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_csytrs_rook
+
+     !> CTBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by CTBTRS or some other
+     !> means before entering this routine.  CTBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ctbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ctbmv( uplo, trans, diag, n, kd, ab, ldab, work, 1 )
+              call stdlib_caxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ctbsv( uplo, transt, diag, n, kd, ab, ldab, work,1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ctbsv( uplo, transn, diag, n, kd, ab, ldab, work,1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ctbrfs
+
+     !> CTBTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by-NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_ctbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==czero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==czero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_ctbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_ctbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> CTFSM: solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**H.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_ctfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           complex(sp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(0:*)
+           complex(sp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_sp,0e+0_sp)
+           if( alpha==czero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = czero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_ctrsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_ctrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_cgemm( 'N', 'N', m2, n, m1, -cone, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_ctrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m ), m, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_ctrsm( 'L', 'L', 'C', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_ctrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_cgemm( 'C', 'N', m1, n, m2, -cone, a( m1 ),m, b( m1, 0 ),&
+                                        ldb, alpha, b, ldb )
+                             call stdlib_ctrsm( 'L', 'L', 'C', diag, m1, n, cone,a( 0 ), m, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_cgemm( 'C', 'N', m2, n, m1, -cone, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m1, n, m2, -cone, a( 0 ), m,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, m1, n, cone,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_ctrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_ctrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_cgemm( 'C', 'N', m2, n, m1, -cone,a( m1*m1 ), m1, b, ldb,&
+                                        alpha,b( m1, 0 ), ldb )
+                             call stdlib_ctrsm( 'L', 'L', 'N', diag, m2, n, cone,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_ctrsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_ctrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_cgemm( 'N', 'N', m1, n, m2, -cone,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_ctrsm( 'L', 'U', 'N', diag, m1, n, cone,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_cgemm( 'N', 'N', m2, n, m1, -cone, a( 0 ), m2,b, ldb, alpha,&
+                                     b( m1, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, m2, n, cone,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_cgemm( 'C', 'N', m1, n, m2, -cone, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, m1, n, cone,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_cgemm( 'N', 'N', k, n, k, -cone, a( k+1 ),m+1, b, ldb, &
+                                    alpha, b( k, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, k, n, cone,a( 0 ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'C', 'N', k, n, k, -cone, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, k, n, cone,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_cgemm( 'C', 'N', k, n, k, -cone, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, k, n, cone,a( k ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', k, n, k, -cone, a( 0 ), m+1,b( k, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, k, n, cone,a( k+1 ), m+1, b, &
+                                    ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_cgemm( 'C', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, k, n, cone,a( 0 ), k, b( k, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_cgemm( 'N', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b( k, 0 )&
+                                    , ldb,alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, k, n, cone,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_cgemm( 'N', 'N', k, n, k, -cone, a( 0 ), k, b,ldb, alpha, b(&
+                                     k, 0 ), ldb )
+                          call stdlib_ctrsm( 'L', 'L', 'N', diag, k, n, cone,a( k*k ), k, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'L', 'L', 'C', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'C', 'N', k, n, k, -cone, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_ctrsm( 'L', 'U', 'N', diag, k, n, cone,a( k*( k+1 ) ), k, b,&
+                                     ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, n1, cone,a( 0 ), n, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n ), n, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n, alpha, b( 0, 0 ), ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, n1, cone,a( n2 ), n, b( 0, 0 &
+                                    ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1*n1 &
+                                    ), n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, n1, cone,a( 0 ), n1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, n2, cone,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, n2, cone,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, n1, cone,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, k, cone,a( 1 ), n+1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, k, cone,a( 0 ), n+1, b( 0, k &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, k ),ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, k, cone,a( k ), n+1, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, 0 ),ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, k, cone,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_cgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, k, cone,a( k ), k, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_cgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, k, cone,a( 0 ), k, b( 0, k ),&
+                                     ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ctrsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_cgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_ctrsm( 'R', 'L', 'C', diag, m, k, cone,a( k*k ), k, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ctrsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_cgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_ctrsm( 'R', 'U', 'C', diag, m, k, cone,a( ( k+1 )*k ), k, b(&
+                                     0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctfsm
+
+     !> CTFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_ctfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(out) :: ap(0:*)
+           complex(sp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Intrinsic Functions 
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = conjg( arf( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctfttp
+
+     !> CTFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_ctfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(sp), intent(out) :: a(0:lda-1,0:*)
+           complex(sp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    a( 0, 0 ) = arf( 0 )
+                 else
+                    a( 0, 0 ) = conjg( arf( 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctfttr
+
+     !> CTGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of complex matrices (S,P), where S and P are upper triangular.
+     !> Matrix pairs of this type are produced by the generalized Schur
+     !> factorization of a complex matrix pair (A,B):
+     !> A = Q*S*Z**H,  B = Q*P*Z**H
+     !> as computed by CGGHRD + CHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal elements of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the unitary factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_ctgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: p(ldp,*), s(lds,*)
+           complex(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, ilall, ilback, ilbbad, ilcomp, lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, im, iside, isrc, j, je, jr
+           real(sp) :: acoefa, acoeff, anorm, ascale, bcoefa, big, bignum, bnorm, bscale, dmin, &
+                     safmin, sbeta, scale, small, temp, ulp, xmax
+           complex(sp) :: bcoeff, ca, cb, d, salpha, sum, suma, sumb, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=sp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors
+           if( .not.ilall ) then
+              im = 0
+              do j = 1, n
+                 if( select( j ) )im = im + 1
+              end do
+           else
+              im = n
+           end if
+           ! check diagonal of b
+           ilbbad = .false.
+           do j = 1, n
+              if( aimag( p( j, j ) )/=zero )ilbbad = .true.
+           end do
+           if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_slabad( safmin, big )
+           ulp = stdlib_slamch( 'EPSILON' )*stdlib_slamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part of a and b to check for possible overflow in the triangular
+           ! solver.
+           anorm = abs1( s( 1, 1 ) )
+           bnorm = abs1( p( 1, 1 ) )
+           rwork( 1 ) = zero
+           rwork( n+1 ) = zero
+           do j = 2, n
+              rwork( j ) = zero
+              rwork( n+j ) = zero
+              do i = 1, j - 1
+                 rwork( j ) = rwork( j ) + abs1( s( i, j ) )
+                 rwork( n+j ) = rwork( n+j ) + abs1( p( i, j ) )
+              end do
+              anorm = max( anorm, rwork( j )+abs1( s( j, j ) ) )
+              bnorm = max( bnorm, rwork( n+j )+abs1( p( j, j ) ) )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              loop_140: do je = 1, n
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig + 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=sp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vl( jr, ieig ) = czero
+                       end do
+                       vl( ieig, ieig ) = cone
+                       cycle loop_140
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                         ! h
+                       ! y  ( a a - b b ) = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=sp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=sp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                    ! h
+                    ! triangular solve of  (a a - b b)  y = 0
+                                            ! h
+                    ! (rowwise in  (a a - b b) , or columnwise in a a - b b)
+                    loop_100: do j = je + 1, n
+                       ! compute
+                             ! j-1
+                       ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                             ! k=je
+                       ! (scale if necessary)
+                       temp = one / xmax
+                       if( acoefa*rwork( j )+bcoefa*rwork( n+j )>bignum*temp ) then
+                          do jr = je, j - 1
+                             work( jr ) = temp*work( jr )
+                          end do
+                          xmax = one
+                       end if
+                       suma = czero
+                       sumb = czero
+                       do jr = je, j - 1
+                          suma = suma + conjg( s( jr, j ) )*work( jr )
+                          sumb = sumb + conjg( p( jr, j ) )*work( jr )
+                       end do
+                       sum = acoeff*suma - conjg( bcoeff )*sumb
+                       ! form x(j) = - sum / conjg( a*s(j,j) - b*p(j,j) )
+                       ! with scaling and perturbation of the denominator
+                       d = conjg( acoeff*s( j, j )-bcoeff*p( j, j ) )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=sp)
+                       if( abs1( d )<one ) then
+                          if( abs1( sum )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( sum )
+                             do jr = je, j - 1
+                                work( jr ) = temp*work( jr )
+                             end do
+                             xmax = temp*xmax
+                             sum = temp*sum
+                          end if
+                       end if
+                       work( j ) = stdlib_cladiv( -sum, d )
+                       xmax = max( xmax, abs1( work( j ) ) )
+                    end do loop_100
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_cgemv( 'N', n, n+1-je, cone, vl( 1, je ), ldvl,work( je ), 1, &
+                                 czero, work( n+1 ), 1 )
+                       isrc = 2
+                       ibeg = 1
+                    else
+                       isrc = 1
+                       ibeg = je
+                    end if
+                    ! copy and scale eigenvector into column of vl
+                    xmax = zero
+                    do jr = ibeg, n
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = ibeg, n
+                          vl( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       ibeg = n + 1
+                    end if
+                    do jr = 1, ibeg - 1
+                       vl( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_140
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              loop_250: do je = n, 1, -1
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig - 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=sp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vr( jr, ieig ) = czero
+                       end do
+                       vr( ieig, ieig ) = cone
+                       cycle loop_250
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                    ! ( a a - b b ) x  = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=sp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=sp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                    ! triangular solve of  (a a - b b) x = 0  (columnwise)
+                    ! work(1:j-1) contains sums w,
+                    ! work(j+1:je) contains x
+                    do jr = 1, je - 1
+                       work( jr ) = acoeff*s( jr, je ) - bcoeff*p( jr, je )
+                    end do
+                    work( je ) = cone
+                    loop_210: do j = je - 1, 1, -1
+                       ! form x(j) := - w(j) / d
+                       ! with scaling and perturbation of the denominator
+                       d = acoeff*s( j, j ) - bcoeff*p( j, j )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=sp)
+                       if( abs1( d )<one ) then
+                          if( abs1( work( j ) )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( work( j ) )
+                             do jr = 1, je
+                                work( jr ) = temp*work( jr )
+                             end do
+                          end if
+                       end if
+                       work( j ) = stdlib_cladiv( -work( j ), d )
+                       if( j>1 ) then
+                          ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                          if( abs1( work( j ) )>one ) then
+                             temp = one / abs1( work( j ) )
+                             if( acoefa*rwork( j )+bcoefa*rwork( n+j )>=bignum*temp ) then
+                                do jr = 1, je
+                                   work( jr ) = temp*work( jr )
+                                end do
+                             end if
+                          end if
+                          ca = acoeff*work( j )
+                          cb = bcoeff*work( j )
+                          do jr = 1, j - 1
+                             work( jr ) = work( jr ) + ca*s( jr, j ) -cb*p( jr, j )
+                          end do
+                       end if
+                    end do loop_210
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_cgemv( 'N', n, je, cone, vr, ldvr, work, 1,czero, work( n+1 ), &
+                                 1 )
+                       isrc = 2
+                       iend = n
+                    else
+                       isrc = 1
+                       iend = je
+                    end if
+                    ! copy and scale eigenvector into column of vr
+                    xmax = zero
+                    do jr = 1, iend
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = 1, iend
+                          vr( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       iend = 0
+                    end if
+                    do jr = iend + 1, n
+                       vr( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_250
+           end if
+           return
+     end subroutine stdlib_ctgevc
+
+     !> CTGEX2: swaps adjacent diagonal 1 by 1 blocks (A11,B11) and (A22,B22)
+     !> in an upper triangular matrix pair (A, B) by an unitary equivalence
+     !> transformation.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_ctgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: twenty = 2.0e+1_sp
+           integer(ilp), parameter :: ldst = 2
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, m
+           real(sp) :: cq, cz, eps, sa, sb, scale, smlnum, sum, thresha, threshb
+           complex(sp) :: cdum, f, g, sq, sz
+           ! Local Arrays 
+           complex(sp) :: s(ldst,ldst), t(ldst,ldst), work(8)
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 )return
+           m = ldst
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block in (a, b)
+           call stdlib_clacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_clacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute the threshold for testing the acceptance of swapping.
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           scale = real( czero,KIND=sp)
+           sum = real( cone,KIND=sp)
+           call stdlib_clacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_clacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+           call stdlib_classq( m*m, work, 1, scale, sum )
+           sa = scale*sqrt( sum )
+           scale = real( czero,KIND=sp)
+           sum = real( cone,KIND=sp)
+           call stdlib_classq( m*m, work(m*m+1), 1, scale, sum )
+           sb = scale*sqrt( sum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*sa, smlnum )
+           threshb = max( twenty*eps*sb, smlnum )
+           ! compute unitary ql and rq that swap 1-by-1 and 1-by-1 blocks
+           ! using givens rotations and perform the swap tentatively.
+           f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+           g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+           sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+           sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+           call stdlib_clartg( g, f, cz, sz, cdum )
+           sz = -sz
+           call stdlib_crot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, cz, conjg( sz ) )
+           call stdlib_crot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, cz, conjg( sz ) )
+           if( sa>=sb ) then
+              call stdlib_clartg( s( 1, 1 ), s( 2, 1 ), cq, sq, cdum )
+           else
+              call stdlib_clartg( t( 1, 1 ), t( 2, 1 ), cq, sq, cdum )
+           end if
+           call stdlib_crot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, cq, sq )
+           call stdlib_crot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, cq, sq )
+           ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                ! and  |t21| <= o(eps f-norm((b)))
+           weak = abs( s( 2, 1 ) )<=thresha .and.abs( t( 2, 1 ) )<=threshb
+           if( .not.weak )go to 20
+           if( wands ) then
+              ! strong stability test:
+                 ! f-norm((a-ql**h*s*qr, b-ql**h*t*qr)) <= o(eps*f-norm((a, b)))
+              call stdlib_clacpy( 'FULL', m, m, s, ldst, work, m )
+              call stdlib_clacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+              call stdlib_crot( 2, work, 1, work( 3 ), 1, cz, -conjg( sz ) )
+              call stdlib_crot( 2, work( 5 ), 1, work( 7 ), 1, cz, -conjg( sz ) )
+              call stdlib_crot( 2, work, 2, work( 2 ), 2, cq, -sq )
+              call stdlib_crot( 2, work( 5 ), 2, work( 6 ), 2, cq, -sq )
+              do i = 1, 2
+                 work( i ) = work( i ) - a( j1+i-1, j1 )
+                 work( i+2 ) = work( i+2 ) - a( j1+i-1, j1+1 )
+                 work( i+4 ) = work( i+4 ) - b( j1+i-1, j1 )
+                 work( i+6 ) = work( i+6 ) - b( j1+i-1, j1+1 )
+              end do
+              scale = real( czero,KIND=sp)
+              sum = real( cone,KIND=sp)
+              call stdlib_classq( m*m, work, 1, scale, sum )
+              sa = scale*sqrt( sum )
+              scale = real( czero,KIND=sp)
+              sum = real( cone,KIND=sp)
+              call stdlib_classq( m*m, work(m*m+1), 1, scale, sum )
+              sb = scale*sqrt( sum )
+              strong = sa<=thresha .and. sb<=threshb
+              if( .not.strong )go to 20
+           end if
+           ! if the swap is accepted ("weakly" and "strongly"), apply the
+           ! equivalence transformations to the original matrix pair (a,b)
+           call stdlib_crot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, cz, conjg( sz ) )
+           call stdlib_crot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, cz, conjg( sz ) )
+           call stdlib_crot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda, cq, sq )
+           call stdlib_crot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb, cq, sq )
+           ! set  n1 by n2 (2,1) blocks to 0
+           a( j1+1, j1 ) = czero
+           b( j1+1, j1 ) = czero
+           ! accumulate transformations into q and z if requested.
+           if( wantz )call stdlib_crot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, cz, conjg( sz ) )
+                     
+           if( wantq )call stdlib_crot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, cq, conjg( sq ) )
+                     
+           ! exit with info = 0 if swap was successfully performed.
+           return
+           ! exit with info = 1 if swap was rejected.
+           20 continue
+           info = 1
+           return
+     end subroutine stdlib_ctgex2
+
+     !> CTGEXC: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A,B), using an unitary equivalence transformation
+     !> (A, B) := Q * (A, B) * Z**H, so that the diagonal block of (A, B) with
+     !> row index IFST is moved to row ILST.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_ctgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ifst, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(inout) :: ilst
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: here
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGEXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              here = ifst
+              10 continue
+              ! swap with next one below
+              call stdlib_ctgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here + 1
+              if( here<ilst )go to 10
+              here = here - 1
+           else
+              here = ifst - 1
+              20 continue
+              ! swap with next one above
+              call stdlib_ctgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here - 1
+              if( here>=ilst )go to 20
+              here = here + 1
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_ctgexc
+
+     !> CTPLQT2: computes a LQ a factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_ctplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_clarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              t(1,i)=conjg(t(1,i))
+              if( i<m ) then
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_cgemv( 'N', m-i, p, cone, b( i+1, 1 ), ldb,b( i, 1 ), ldb, cone, t( &
+                           m, 1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_cgerc( m-i, p, (alpha),  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n)**h * (alpha * c(i,i:n))
+              alpha = -(t( 1, i ))
+              do j = 1, i-1
+                 t( i, j ) = czero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              do j = 1, n-l+p
+                b(i,j)=conjg(b(i,j))
+              end do
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = (alpha*b( i, n-l+j ))
+              end do
+              call stdlib_ctrmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_cgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, czero, &
+                        t( i,mp ), ldt )
+              ! b1
+              call stdlib_cgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,cone, t( i, 1 ), &
+                        ldt )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              call stdlib_ctrmv( 'L', 'C', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              do j = 1, n-l+p
+                 b(i,j)=conjg(b(i,j))
+              end do
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = czero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=(t(j,i))
+                 t(j,i)=czero
+              end do
+           end do
+     end subroutine stdlib_ctplqt2
+
+     !> CTPQRT2: computes a QR factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_ctpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_clarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)**h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = conjg(a( i, i+j ))
+                 end do
+                 call stdlib_cgemv( 'C', p, n-i, cone, b( 1, i+1 ), ldb,b( 1, i ), 1, cone, t( 1, &
+                           n ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)**h
+                 alpha = -conjg(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*conjg(t( j, n ))
+                 end do
+                 call stdlib_cgerc( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)**h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = czero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_ctrmv( 'U', 'C', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_cgemv( 'C', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, czero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_cgemv( 'C', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,cone, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_ctrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = czero
+           end do
+     end subroutine stdlib_ctpqrt2
+
+     !> CTPRFB: applies a complex "triangular-pentagonal" block reflector H or its
+     !> conjugate transpose H**H to a complex matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_ctprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(sp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'U', 'C', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_cgemm( 'C', 'N', l, n, m-l, cone, v, ldv, b, ldb,cone, work, ldwork )
+                        
+              call stdlib_cgemm( 'C', 'N', k-l, n, m, cone, v( 1, kp ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_cgemm( 'N', 'N', l, n, k-l, -cone, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ),  ldb )
+              call stdlib_ctrmm( 'L', 'U', 'N', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_cgemm( 'N', 'N', m, l, n-l, cone, b, ldb,v, ldv, cone, work, ldwork )
+                        
+              call stdlib_cgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v( 1, kp ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_cgemm( 'N', 'C', m, l, k-l, -cone, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_ctrmm( 'R', 'U', 'C', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_cgemm( 'C', 'N', l, n, m-l, cone, v( mp, kp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_cgemm( 'C', 'N', k-l, n, m, cone, v, ldv,b, ldb, czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'L', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'N', m-l, n, k, -cone, v( mp, 1 ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_cgemm( 'N', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b,  ldb )
+                        
+              call stdlib_ctrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h**h  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_cgemm( 'N', 'N', m, l, n-l, cone, b( 1, np ), ldb,v( np, kp ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_cgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v, ldv, czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v( np, 1 ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_cgemm( 'N', 'C', m, l, k-l, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_ctrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_cgemm( 'N', 'N', l, n, m-l, cone, v, ldv,b, ldb,cone, work, ldwork )
+                        
+              call stdlib_cgemm( 'N', 'N', k-l, n, m, cone, v( kp, 1 ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'C', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_cgemm( 'C', 'N', l, n, k-l, -cone, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ), ldb )
+              call stdlib_ctrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_cgemm( 'N', 'C', m, l, n-l, cone, b, ldb, v, ldv,cone, work, ldwork )
+                        
+              call stdlib_cgemm( 'N', 'C', m, k-l, n, cone, b, ldb,v( kp, 1 ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_cgemm( 'N', 'N', m, l, k-l, -cone, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_ctrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'U', 'N', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_cgemm( 'N', 'N', l, n, m-l, cone, v( kp, mp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_cgemm( 'N', 'N', k-l, n, m, cone, v, ldv, b, ldb,czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'L ', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'C', 'N', m-l, n, k, -cone, v( 1, mp ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_cgemm( 'C', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b, ldb )
+                        
+              call stdlib_ctrmm( 'L', 'U', 'C', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', 'C', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_cgemm( 'N', 'C', m, l, n-l, cone, b( 1, np ), ldb,v( kp, np ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_cgemm( 'N', 'C', m, k-l, n, cone, b, ldb, v, ldv,czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_cgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v( 1, np ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_cgemm( 'N', 'N', m, l, k-l , -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ctprfb
+
+     !> CTPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by CTPTRS or some other
+     !> means before entering this routine.  CTPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ctprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ctpmv( uplo, trans, diag, n, ap, work, 1 )
+              call stdlib_caxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ctpsv( uplo, transt, diag, n, ap, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ctpsv( uplo, transn, diag, n, ap, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ctprfs
+
+     !> CTPTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_ctptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           complex(sp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==czero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==czero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = cone / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_ctpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_cscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = cone / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_ctpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_cscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_ctptri
+
+     !> CTPTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_ctptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==czero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==czero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve  a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_ctpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_ctptrs
+
+     !> CTPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_ctpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(0:*)
+           complex(sp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = conjg( ap( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctpttf
+
+     !> CTPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_ctpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(sp), intent(out) :: a(lda,*)
+           complex(sp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ctpttr
+
+     !> CTREVC: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by CHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+
+     pure subroutine stdlib_ctrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           complex(sp), parameter :: cmzero = (0.0e+0_sp,0.0e+0_sp)
+           complex(sp), parameter :: cmone = (1.0e+0_sp,0.0e+0_sp)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki
+           real(sp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_slabad( unfl, ovfl )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i+n ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_scasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! compute right eigenvectors.
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( 1 ) = cmone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k ) = -t( k, ki )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(1:ki-1,1:ki-1) - t(ki,ki))*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_clatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 ), scale, rwork,info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    call stdlib_ccopy( ki, work( 1 ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_icamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_csscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki>1 )call stdlib_cgemv( 'N', n, ki-1, cmone, vr, ldvr, work( 1 ),1, &
+                              cmplx( scale,KIND=sp), vr( 1, ki ), 1 )
+                    ii = stdlib_icamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_csscal( n, remax, vr( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( n ) = cmone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(ki+1:n,ki+1:n) - t(ki,ki))**h*x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_clatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 ), scale, rwork, info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    call stdlib_ccopy( n-ki+1, work( ki ), 1, vl( ki, is ), 1 )
+                    ii = stdlib_icamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_csscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki<n )call stdlib_cgemv( 'N', n, n-ki, cmone, vl( 1, ki+1 ), ldvl,work( &
+                              ki+1 ), 1, cmplx( scale,KIND=sp),vl( 1, ki ), 1 )
+                    ii = stdlib_icamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_csscal( n, remax, vl( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_ctrevc
+
+     !> CTREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by CHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_ctrevc3( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, lwork, rwork, lrwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, lrwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki, iv, maxwrk, nb
+           real(sp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(sp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CTREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           rwork(1) = n
+           lquery = ( lwork==-1 .or. lrwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -14
+           else if ( lrwork<max( 1, n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_claset( 'F', n, 1+2*nb, czero, czero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_slabad( unfl, ovfl )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_scasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=nb=1;
+              ! blocked     version starts with iv=nb, goes down to 1.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = nb
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex right eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k + iv*n ) = -t( k, ki )
+                 end do
+                 ! solve upper triangular system:
+                 ! [ t(1:ki-1,1:ki-1) - t(ki,ki) ]*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_clatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 + iv*n ), scale,rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vr and normalize.
+                    call stdlib_ccopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_icamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_csscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki>1 )call stdlib_cgemv( 'N', n, ki-1, cone, vr, ldvr,work( 1 + iv*n ), 1,&
+                               cmplx( scale,KIND=sp),vr( 1, ki ), 1 )
+                    ii = stdlib_icamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_csscal( n, remax, vr( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out below vector
+                    do k = ki + 1, n
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==1) .or. (ki==1) ) then
+                       call stdlib_cgemm( 'N', 'N', n, nb-iv+1, ki+nb-iv, cone,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,czero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          ii = stdlib_icamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_csscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_clacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = 1
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex left eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k + iv*n ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve conjugate-transposed triangular system:
+                 ! [ t(ki+1:n,ki+1:n) - t(ki,ki) ]**h * x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_clatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 + iv*n ), scale, rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vl and normalize.
+                    call stdlib_ccopy( n-ki+1, work( ki + iv*n ), 1, vl(ki,is), 1 )
+                    ii = stdlib_icamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_csscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki<n )call stdlib_cgemv( 'N', n, n-ki, cone, vl( 1, ki+1 ), ldvl,work( ki+&
+                              1 + iv*n ), 1, cmplx( scale,KIND=sp),vl( 1, ki ), 1 )
+                    ii = stdlib_icamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_csscal( n, remax, vl( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out above vector
+                    ! could go from ki-nv+1 to ki-1
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==nb) .or. (ki==n) ) then
+                       call stdlib_cgemm( 'N', 'N', n, iv, n-ki+iv, cone,vl( 1, ki-iv+1 ), ldvl,&
+                                 work( ki-iv+1 + (1)*n ), n,czero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          ii = stdlib_icamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_csscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_clacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_ctrevc3
+
+     !> CTREXC: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+     !> is moved to row ILST.
+     !> The Schur form T is reordered by a unitary similarity transformation
+     !> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+     !> postmultplying it with Z.
+
+     pure subroutine stdlib_ctrexc( compq, n, t, ldt, q, ldq, ifst, ilst, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(in) :: ifst, ilst, ldq, ldt, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: q(ldq,*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: k, m1, m2, m3
+           real(sp) :: cs
+           complex(sp) :: sn, t11, t22, temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 .or. ifst==ilst )return
+           if( ifst<ilst ) then
+              ! move the ifst-th diagonal element forward down the diagonal.
+              m1 = 0
+              m2 = -1
+              m3 = 1
+           else
+              ! move the ifst-th diagonal element backward up the diagonal.
+              m1 = -1
+              m2 = 0
+              m3 = -1
+           end if
+           do k = ifst + m1, ilst + m2, m3
+              ! interchange the k-th and (k+1)-th diagonal elements.
+              t11 = t( k, k )
+              t22 = t( k+1, k+1 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_clartg( t( k, k+1 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( k+2<=n )call stdlib_crot( n-k-1, t( k, k+2 ), ldt, t( k+1, k+2 ), ldt, cs,sn )
+                        
+              call stdlib_crot( k-1, t( 1, k ), 1, t( 1, k+1 ), 1, cs, conjg( sn ) )
+              t( k, k ) = t22
+              t( k+1, k+1 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_crot( n, q( 1, k ), 1, q( 1, k+1 ), 1, cs,conjg( sn ) )
+              end if
+           end do
+           return
+     end subroutine stdlib_ctrexc
+
+     !> CTRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by CTRTRS or some other
+     !> means before entering this routine.  CTRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ctrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ctrmv( uplo, trans, diag, n, a, lda, work, 1 )
+              call stdlib_caxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ctrsv( uplo, transt, diag, n, a, lda, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ctrsv( uplo, transn, diag, n, a, lda, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ctrrfs
+
+     !> CTRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a complex upper triangular
+     !> matrix T (or of any matrix Q*T*Q**H with Q unitary).
+
+     pure subroutine stdlib_ctrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm,&
+                m, work, ldwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(sp), intent(out) :: rwork(*), s(*), sep(*)
+           complex(sp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: somcon, wantbh, wants, wantsp
+           character :: normin
+           integer(ilp) :: i, ierr, ix, j, k, kase, ks
+           real(sp) :: bignum, eps, est, lnrm, rnrm, scale, smlnum, xnorm
+           complex(sp) :: cdum, prod
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           complex(sp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of eigenpairs for which condition numbers are
+           ! to be computed.
+           if( somcon ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -13
+           else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ks = 1
+           loop_50: do k = 1, n
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_50
+              end if
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 prod = stdlib_cdotc( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                 rnrm = stdlib_scnrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_scnrm2( n, vl( 1, ks ), 1 )
+                 s( ks ) = abs( prod ) / ( rnrm*lnrm )
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the k-th
+                 ! diagonal element to the (1,1) position.
+                 call stdlib_clacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 call stdlib_ctrexc( 'NO Q', n, work, ldwork, dummy, 1, k, 1, ierr )
+                 ! form  c = t22 - lambda*i in work(2:n,2:n).
+                 do i = 2, n
+                    work( i, i ) = work( i, i ) - work( 1, 1 )
+                 end do
+                 ! estimate a lower bound for the 1-norm of inv(c**h). the 1st
+                 ! and (n+1)th columns of work are used to store work vectors.
+                 sep( ks ) = zero
+                 est = zero
+                 kase = 0
+                 normin = 'N'
+                 30 continue
+                 call stdlib_clacn2( n-1, work( 1, n+1 ), work, est, kase, isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve c**h*x = scale*b
+                       call stdlib_clatrs( 'UPPER', 'CONJUGATE TRANSPOSE','NONUNIT', normin, n-1, &
+                                 work( 2, 2 ),ldwork, work, scale, rwork, ierr )
+                    else
+                       ! solve c*x = scale*b
+                       call stdlib_clatrs( 'UPPER', 'NO TRANSPOSE', 'NONUNIT',normin, n-1, work( &
+                                 2, 2 ), ldwork, work,scale, rwork, ierr )
+                    end if
+                    normin = 'Y'
+                    if( scale/=one ) then
+                       ! multiply by 1/scale if doing so will not cause
+                       ! overflow.
+                       ix = stdlib_icamax( n-1, work, 1 )
+                       xnorm = cabs1( work( ix, 1 ) )
+                       if( scale<xnorm*smlnum .or. scale==zero )go to 40
+                       call stdlib_csrscl( n, scale, work, 1 )
+                    end if
+                    go to 30
+                 end if
+                 sep( ks ) = one / max( est, smlnum )
+              end if
+              40 continue
+              ks = ks + 1
+           end do loop_50
+           return
+     end subroutine stdlib_ctrsna
+
+     !> CTRTI2: computes the inverse of a complex upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ctrti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           complex(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_ctrmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_cscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_ctrmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_cscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_ctrti2
+
+     !> CTRTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ctrtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CTRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_ctrti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_ctrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, cone, a, &
+                              lda, a( 1, j ), lda )
+                    call stdlib_ctrsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -cone, a( &
+                              j, j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_ctrti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_ctrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, &
+                                 cone, a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_ctrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 cone, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_ctrti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_ctrtri
+
+     !> CTRTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_ctrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           call stdlib_ctrsm( 'LEFT', uplo, trans, diag, n, nrhs, cone, a, lda, b,ldb )
+           return
+     end subroutine stdlib_ctrtrs
+
+     !> CTRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_ctrttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(0:lda-1,0:*)
+           complex(sp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    arf( 0 ) = a( 0, 0 )
+                 else
+                    arf( 0 ) = conjg( a( 0, 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = conjg( a( n2+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = conjg( a( j-n1, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda=n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = conjg( a( n2+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = conjg( a( k+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = conjg( a( j-k, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = conjg( a( k+1+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctrttf
+
+     !> CTRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_ctrttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ctrttp
+
+     !> CTZRZF: reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix A
+     !> to upper triangular form by means of unitary transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N unitary matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_ctzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'CGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_clatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_clarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_clarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_clatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ctzrzf
+
+     !> CUNBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned unitary matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See CUNCSD
+     !> for details.)
+     !> The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(sp), parameter :: realone = 1.0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(sp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,min,sin
+           intrinsic :: cmplx,conjg
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_cscal( p-i+1, cmplx( z1, 0.0_sp,KIND=sp), x11(i,i), 1 )
+                 else
+                    call stdlib_cscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_sp,KIND=sp),x11(i,i), &
+                              1 )
+                    call stdlib_caxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), x12(&
+                              i,i-1), 1, x11(i,i), 1 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_cscal( m-p-i+1, cmplx( z2, 0.0_sp,KIND=sp), x21(i,i), 1 )
+                 else
+                    call stdlib_cscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_sp,KIND=sp),x21(i,i),&
+                               1 )
+                    call stdlib_caxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), &
+                              x22(i,i-1), 1, x21(i,i), 1 )
+                 end if
+                 theta(i) = atan2( stdlib_scnrm2( m-p-i+1, x21(i,i), 1 ),stdlib_scnrm2( p-i+1, &
+                           x11(i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if ( p == i ) then
+                    call stdlib_clarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = cone
+                 if ( m-p > i ) then
+                    call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i,i), 1,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 if ( q > i ) then
+                    call stdlib_clarf( 'L', p-i+1, q-i, x11(i,i), 1,conjg(taup1(i)), x11(i,i+1), &
+                              ldx11, work )
+                    call stdlib_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1,conjg(taup2(i)), x21(i,i+1),&
+                               ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_clarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1,conjg(taup1(i)), x12(i,i),&
+                               ldx12, work )
+                    call stdlib_clarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1,conjg(taup2(i)), x22(i,&
+                              i), ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_cscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_sp,KIND=sp),x11(i,i+&
+                              1), ldx11 )
+                    call stdlib_caxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_sp,KIND=sp),x21(i,i+1)&
+                              , ldx21, x11(i,i+1), ldx11 )
+                 end if
+                 call stdlib_cscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_sp,KIND=sp),x12(i,i)&
+                           , ldx12 )
+                 call stdlib_caxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_sp,KIND=sp),x22(i,i),&
+                            ldx22, x12(i,i), ldx12 )
+                 if( i < q )phi(i) = atan2( stdlib_scnrm2( q-i, x11(i,i+1), ldx11 ),stdlib_scnrm2(&
+                            m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    call stdlib_clacgv( q-i, x11(i,i+1), ldx11 )
+                    if ( i == q-1 ) then
+                       call stdlib_clarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_clarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = cone
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_clacgv( m-q-i+1, x12(i,i), ldx12 )
+                    if ( m-q == i ) then
+                       call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_clarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_clarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_clarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_clarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q )call stdlib_clacgv( q-i, x11(i,i+1), ldx11 )
+                 call stdlib_clacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_cscal( m-q-i+1, cmplx( -z1*z4, 0.0_sp,KIND=sp), x12(i,i),ldx12 )
+                           
+                 call stdlib_clacgv( m-q-i+1, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_clarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_clarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+                 call stdlib_clacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_cscal( m-p-q-i+1, cmplx( z2*z4, 0.0_sp,KIND=sp),x22(q+i,p+i), ldx22 )
+                           
+                 call stdlib_clacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+                 call stdlib_clarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
+                           
+                 x22(q+i,p+i) = cone
+                 call stdlib_clarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
+                           q+i+1,p+i), ldx22, work )
+                 call stdlib_clacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_cscal( p-i+1, cmplx( z1, 0.0_sp,KIND=sp), x11(i,i),ldx11 )
+                 else
+                    call stdlib_cscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_sp,KIND=sp),x11(i,i), &
+                              ldx11 )
+                    call stdlib_caxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), x12(&
+                              i-1,i), ldx12, x11(i,i), ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_cscal( m-p-i+1, cmplx( z2, 0.0_sp,KIND=sp), x21(i,i),ldx21 )
+                              
+                 else
+                    call stdlib_cscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_sp,KIND=sp),x21(i,i),&
+                               ldx21 )
+                    call stdlib_caxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_sp,KIND=sp), &
+                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_scnrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_scnrm2( p-i+1,&
+                            x11(i,i), ldx11 ) )
+                 call stdlib_clacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_clacgv( m-p-i+1, x21(i,i), ldx21 )
+                 call stdlib_clarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = cone
+                 if ( i == m-p ) then
+                    call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 call stdlib_clarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
+                           work )
+                 call stdlib_clarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
+                           ldx12, work )
+                 call stdlib_clarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                           ldx21, work )
+                 call stdlib_clarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                           ldx22, work )
+                 call stdlib_clacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_clacgv( m-p-i+1, x21(i,i), ldx21 )
+                 if( i < q ) then
+                    call stdlib_cscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_sp,KIND=sp),x11(i+1,&
+                              i), 1 )
+                    call stdlib_caxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_sp,KIND=sp),x21(i+1,i)&
+                              , 1, x11(i+1,i), 1 )
+                 end if
+                 call stdlib_cscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_sp,KIND=sp),x12(i,i)&
+                           , 1 )
+                 call stdlib_caxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_sp,KIND=sp),x22(i,i),&
+                            1, x12(i,i), 1 )
+                 if( i < q )phi(i) = atan2( stdlib_scnrm2( q-i, x11(i+1,i), 1 ),stdlib_scnrm2( m-&
+                           q-i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    call stdlib_clarfgp( q-i, x11(i+1,i), x11(i+2,i), 1, tauq1(i) )
+                    x11(i+1,i) = cone
+                 end if
+                 call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_clarf( 'L', q-i, p-i, x11(i+1,i), 1,conjg(tauq1(i)), x11(i+1,i+1),&
+                               ldx11, work )
+                    call stdlib_clarf( 'L', q-i, m-p-i, x11(i+1,i), 1,conjg(tauq1(i)), x21(i+1,i+&
+                              1), ldx21, work )
+                 end if
+                 call stdlib_clarf( 'L', m-q-i+1, p-i, x12(i,i), 1, conjg(tauq2(i)),x12(i,i+1), &
+                           ldx12, work )
+                 if ( m-p > i ) then
+                    call stdlib_clarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1,conjg(tauq2(i)), x22(i,i+&
+                              1), ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_cscal( m-q-i+1, cmplx( -z1*z4, 0.0_sp,KIND=sp), x12(i,i), 1 )
+                 call stdlib_clarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_clarf( 'L', m-q-i+1, p-i, x12(i,i), 1,conjg(tauq2(i)), x12(i,i+1),&
+                               ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_clarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1,conjg(tauq2(&
+                           i)), x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_cscal( m-p-q-i+1, cmplx( z2*z4, 0.0_sp,KIND=sp),x22(p+i,q+i), 1 )
+                           
+                 call stdlib_clarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                           
+                 x22(p+i,q+i) = cone
+                 if ( m-p-q /= i ) then
+                    call stdlib_clarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,conjg(tauq2(p+i)),&
+                               x22(p+i,q+i+1), ldx22,work )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_cunbdb
+
+     !> CUNBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: alphasq = 0.01_sp
+           real(sp), parameter :: realone = 1.0_sp
+           real(sp), parameter :: realzero = 0.0_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_classq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_classq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_cgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_cgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_cgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_cgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_classq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_classq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is czero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == czero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = czero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_cgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_cgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_cgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_cgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_classq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_classq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to czero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = czero
+              end do
+              do i = 1, m2
+                 x2(i) = czero
+              end do
+           end if
+           return
+     end subroutine stdlib_cunbdb6
+
+     !> CUNG2L: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by CGEQLF.
+
+     pure subroutine stdlib_cung2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( m-n+j, j ) = cone
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = cone
+              call stdlib_clarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_cscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = cone - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to czero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_cung2l
+
+     !> CUNG2R: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by CGEQRF.
+
+     pure subroutine stdlib_cung2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = cone
+                 call stdlib_clarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_cscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = cone - tau( i )
+              ! set a(1:i-1,i) to czero
+              do l = 1, i - 1
+                 a( l, i ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_cung2r
+
+     !> CUNGL2: generates an m-by-n complex matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by CGELQF.
+
+     pure subroutine stdlib_cungl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = czero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = cone
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i)**h to a(i:m,i:n) from the right
+              if( i<n ) then
+                 call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    call stdlib_clarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,conjg( tau( i ) ), a( &
+                              i+1, i ), lda, work )
+                 end if
+                 call stdlib_cscal( n-i, -tau( i ), a( i, i+1 ), lda )
+                 call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+              end if
+              a( i, i ) = cone - conjg( tau( i ) )
+              ! set a(i,1:i-1,i) to czero
+              do l = 1, i - 1
+                 a( i, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_cungl2
+
+     !> CUNGLQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by CGELQF.
+
+     pure subroutine stdlib_cunglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CUNGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CUNGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CUNGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to czero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_cungl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_clarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**h to a(i+ib:m,i:n) from the right
+                    call stdlib_clarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'FORWARD','ROWWISE', m-i-&
+                    ib+1, n-i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h**h to columns i:n of current block
+                 call stdlib_cungl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to czero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cunglq
+
+     !> CUNGQL: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by CGEQLF.
+
+     pure subroutine stdlib_cungql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CUNGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CUNGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CUNGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to czero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_cung2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_clarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_clarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_cung2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to czero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cungql
+
+     !> CUNGQR: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by CGEQRF.
+
+     pure subroutine stdlib_cungqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CUNGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CUNGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CUNGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to czero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_cung2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_clarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_clarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_cung2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to czero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cungqr
+
+     !> CUNGR2: generates an m by n complex matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by CGERQF.
+
+     pure subroutine stdlib_cungr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = czero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = cone
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i) from the right
+              call stdlib_clacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone
+              call stdlib_clarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda,conjg( tau( i ) ), a, lda,&
+                         work )
+              call stdlib_cscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              call stdlib_clacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone - conjg( tau( i ) )
+              ! set a(m-k+i,n-k+i+1:n) to czero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_cungr2
+
+     !> CUNGRQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by CGERQF.
+
+     pure subroutine stdlib_cungrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CUNGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CUNGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CUNGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to czero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_cungr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_clarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_clarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'BACKWARD','ROWWISE', ii-&
+                    1, n-k+i+ib-1, ib, a( ii, 1 ),lda, work, ldwork, a, lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**h to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_cungr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to czero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cungrq
+
+     !> CUNGTSQR_ROW: generates an M-by-N complex matrix Q_out with
+     !> orthonormal columns from the output of CLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by CLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of CLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine CLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which CLATSQR generates the output blocks.
+
+     pure subroutine stdlib_cungtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: t(ldt,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           complex(sp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_claset('U', m, n, czero, cone, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_clarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_slarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_clarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_clarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_cungtsqr_row
+
+
+     pure subroutine stdlib_cunm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(in) :: q(ldq,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = cmplx( lwkopt,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_ctrmm.
+           if( n1==0 ) then
+              call stdlib_ctrmm( side, 'UPPER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           else if( n2==0 ) then
+              call stdlib_ctrmm( side, 'LOWER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_clacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_ctrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, cone, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,cone, q, ldq, &
+                              c( 1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_clacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_ctrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, cone, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,cone, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,cone, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_clacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**h.
+                    call stdlib_clacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_ctrmm( 'LEFT', 'UPPER', 'CONJUGATE', 'NON-UNIT',n2, len, cone, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**h.
+                    call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', n2, len, n1,cone, q, ldq, c( &
+                              1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q12**h.
+                    call stdlib_clacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_ctrmm( 'LEFT', 'LOWER', 'CONJUGATE', 'NON-UNIT',n1, len, cone, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**h.
+                    call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', n1, len, n2,cone, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,cone, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_clacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_clacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, cone,&
+                               q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,cone, c( i, 1 )&
+                              , ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_clacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, cone,&
+                               q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,cone, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_clacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**h.
+                    call stdlib_clacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE', 'NON-UNIT',len, n1, cone, q(&
+                               1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**h.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n1, n2,cone, c( i, 1 ), &
+                              ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q21**h.
+                    call stdlib_clacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_ctrmm( 'RIGHT', 'UPPER', 'CONJUGATE', 'NON-UNIT',len, n2, cone, q(&
+                               n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**h.
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n2, n1,cone, c( i, n2+1 )&
+                              , ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_clacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cunm22
+
+     !> CUNM2L: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by CGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(sp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = cone
+              call stdlib_clarf( side, mi, ni, a( 1, i ), 1, taui, c, ldc, work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_cunm2l
+
+     !> CUNM2R: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by CGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(sp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_clarf( side, mi, ni, a( i, i ), 1, taui, c( ic, jc ), ldc,work )
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_cunm2r
+
+     !> CUNML2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by CGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(sp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              if( i<nq )call stdlib_clacgv( nq-i, a( i, i+1 ), lda )
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_clarf( side, mi, ni, a( i, i ), lda, taui, c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+              if( i<nq )call stdlib_clacgv( nq-i, a( i, i+1 ), lda )
+           end do
+           return
+     end subroutine stdlib_cunml2
+
+     !> CUNMLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by CGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 .or. k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMLQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              return
+           end if
+           ! determine the block size
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CUNMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_cunml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_clarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_clarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmlq
+
+     !> CUNMQL: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by CGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! determine the block size
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CUNMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_cunm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_clarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_clarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmql
+
+     !> CUNMQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CUNMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_cunm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_clarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_clarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmqr
+
+     !> CUNMR2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by CGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(sp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              call stdlib_clacgv( nq-k+i-1, a( i, 1 ), lda )
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = cone
+              call stdlib_clarf( side, mi, ni, a( i, 1 ), lda, taui, c, ldc, work )
+              a( i, nq-k+i ) = aii
+              call stdlib_clacgv( nq-k+i-1, a( i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_cunmr2
+
+     !> CUNMR3: overwrites the general complex m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by CTZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), tau(*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           complex(sp) :: taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              call stdlib_clarz( side, mi, ni, l, a( i, ja ), lda, taui,c( ic, jc ), ldc, work )
+                        
+           end do
+           return
+     end subroutine stdlib_cunmr3
+
+     !> CUNMRQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by CGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_cunmr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_clarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_clarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmrq
+
+     !> CUNMRZ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by CTZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_cunmrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! determine the block size.
+           nb = min( nbmax, stdlib_ilaenv( 1, 'CUNMRQ', side // trans, m, n, k,-1 ) )
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_cunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_clarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_clarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmrz
+
+     !> CBBCSD: computes the CS decomposition of a unitary matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See CUNCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The unitary matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
+               lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
+           ! Array Arguments 
+           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), rwork(*)
+           real(sp), intent(inout) :: phi(*), theta(*)
+           complex(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+                     
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(sp), parameter :: hundred = 100.0_sp
+           real(sp), parameter :: meighth = -0.125_sp
+           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
+           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lrwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lrworkmin = 1
+              rwork(1) = lrworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lrworkopt = iv2tsn + q - 1
+              lrworkmin = lrworkopt
+              rwork(1) = lrworkopt
+              if( lrwork < lrworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'EPSILON' )
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_sp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_slartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              else
+                 call stdlib_slartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              end if
+              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
+                        
+              b11d(imin) = temp
+              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
+              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
+                        
+              b21d(imin) = temp
+              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_slartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
+                           1), r )
+              else if( mu <= nu ) then
+                 call stdlib_slartgs( b11e( imin ), b11d( imin + 1 ), mu,rwork(iu1cs+imin-1), &
+                           rwork(iu1sn+imin-1) )
+              else
+                 call stdlib_slartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_slartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
+                           1), r )
+              else if( nu < mu ) then
+                 call stdlib_slartgs( b21e( imin ), b21d( imin + 1 ), nu,rwork(iu2cs+imin-1), &
+                           rwork(iu2sn+imin-1) )
+              else
+                 call stdlib_slartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
+                           imin-1) )
+              end if
+              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
+              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
+              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
+              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_slartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_slartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_slartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( mu <= nu ) then
+                    call stdlib_slartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 else
+                    call stdlib_slartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 end if
+                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
+                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_slartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_slartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_slartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( nu < mu ) then
+                    call stdlib_slartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 else
+                    call stdlib_slartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 end if
+                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
+                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
+                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
+                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_slartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_slartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_slartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_slartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
+                               )
+                 else
+                    call stdlib_slartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_slartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_slartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_slartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
+                              r )
+                 else if( nu < mu ) then
+                    call stdlib_slartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
+                               )
+                 else
+                    call stdlib_slartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
+                              
+                 end if
+                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
+                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
+                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
+                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
+                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
+                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_slartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_slartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_slartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_slartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_slartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
+                        
+              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
+                        
+              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_clasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_clasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_clasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_clasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_clasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_clasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_clasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_clasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_cscal( q, cnegone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_cscal( q, cnegone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_cscal( p, cnegone, u1(1,imax), 1 )
+                    else
+                       call stdlib_cscal( p, cnegone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_cscal( m-p, cnegone, u2(1,imax), 1 )
+                    else
+                       call stdlib_cscal( m-p, cnegone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_cscal( m-q, cnegone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_cscal( m-q, cnegone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_cswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_cswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_cswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_cswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_cswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_cswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_cswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_cswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_cbbcsd
+
+     !> CBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**H
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**H*VT instead of
+     !> P**H, for given complex input matrices U and VT.  When U and VT are
+     !> the unitary matrices that reduce a general matrix A to bidiagonal
+     !> form: A = U*B*VT, as computed by CGEBRD, then
+     !> A = (U*Q) * S * (P**H*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**H*C
+     !> for a given complex input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_cbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: c(ldc,*), u(ldu,*), vt(ldvt,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: hndrth = 0.01_sp
+           real(sp), parameter :: hndrd = 100.0_sp
+           real(sp), parameter :: meigth = -0.125_sp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, j, ll, lll, m, maxit, nm1, nm12, nm13, oldll, &
+                     oldm
+           real(sp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_slasq1( n, d, e, rwork, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_slamch( 'EPSILON' )
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 rwork( i ) = cs
+                 rwork( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_clasr( 'R', 'V', 'F', nru, n, rwork( 1 ), rwork( n ),u, ldu )
+                        
+              if( ncc>0 )call stdlib_clasr( 'L', 'V', 'F', n, ncc, rwork( 1 ), rwork( n ),c, ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=sp) )
+              thresh = max( tol*sminoa, maxitr*n*n*unfl )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*n*n*unfl )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxit = maxitr*n*n
+           iter = 0
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>maxit )go to 200
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_slasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_csrot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt,cosr, &
+                        sinr )
+              if( nru>0 )call stdlib_csrot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+                        
+              if( ncc>0 )call stdlib_csrot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_slas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_slas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_slartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_slartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll+1 ) = cs
+                    rwork( i-ll+1+nm1 ) = sn
+                    rwork( i-ll+1+nm12 ) = oldcs
+                    rwork( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_clasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_clasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_clasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_slartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_slartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll ) = cs
+                    rwork( i-ll+nm1 ) = -sn
+                    rwork( i-ll+nm12 ) = oldcs
+                    rwork( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_clasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_clasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_clasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_slartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_slartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    rwork( i-ll+1 ) = cosr
+                    rwork( i-ll+1+nm1 ) = sinr
+                    rwork( i-ll+1+nm12 ) = cosl
+                    rwork( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_clasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_clasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_clasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_slartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_slartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    rwork( i-ll ) = cosr
+                    rwork( i-ll+nm1 ) = -sinr
+                    rwork( i-ll+nm12 ) = cosl
+                    rwork( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_clasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_clasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_clasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_csscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_cswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_cswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_cswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_cbdsqr
+
+     !> CGBCON: estimates the reciprocal of the condition number of a complex
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by CGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_cgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, rwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(sp) :: ainvnm, scale, smlnum
+           complex(sp) :: t, zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_caxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_clatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, rwork, info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_clatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kl+ku, &
+                           ab, ldab, work, scale, rwork,info )
+                 ! multiply by inv(l**h).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_cdotc( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_icamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_csrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_cgbcon
+
+     !> CGBTRF: computes an LU factorization of a complex m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_cgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           complex(sp) :: temp
+           ! Local Arrays 
+           complex(sp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'CGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_cgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! czero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = czero
+                 end do
+              end do
+              ! czero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = czero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to czero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = czero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to czero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = czero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_icamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=czero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_cswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_cswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_cswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_cscal( km, cone / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_cgeru( km, jm-jj, -cone, ab( kv+2, jj ), 1,ab( kv, &
+                                 jj+1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is czero, set info to the index of the pivot
+                       ! unless a czero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_ccopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_claswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_claswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, cone, &
+                                 ab( kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+1, j+jb )&
+                                     , ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -cone, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+kl+1-jb, j+jb ),&
+                                     ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, cone, &
+                                 ab( kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,work13, ldwork, cone, ab( 1+jb, j+kv ),ldab-1 )
+                                     
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -cone, &
+                                    work31, ldwork, work13,ldwork, cone, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_cswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_cswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_ccopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_cgbtrf
+
+     !> CGBTRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by CGBTRF.
+
+     pure subroutine stdlib_cgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-cone modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_cswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_cgeru( lm, nrhs, -cone, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, &
+                              1 ), ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else if( stdlib_lsame( trans, 'T' ) ) then
+              ! solve a**t * x = b.
+              do i = 1, nrhs
+                 ! solve u**t * x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_cgemv( 'TRANSPOSE', lm, nrhs, -cone, b( j+1, 1 ),ldb, ab( kd+1, j &
+                              ), 1, cone, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_cswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              do i = 1, nrhs
+                 ! solve u**h * x = b, overwriting b with x.
+                 call stdlib_ctbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kl+ku, ab, ldab,&
+                            b( 1, i ), 1 )
+              end do
+              ! solve l**h * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_clacgv( nrhs, b( j, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', lm, nrhs, -cone,b( j+1, 1 ), ldb, &
+                              ab( kd+1, j ), 1, cone,b( j, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_cswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_cgbtrs
+
+     !> CGEBD2: reduces a complex general m by n matrix A to upper or lower
+     !> real bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_cgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'CGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_clarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=sp)
+                 a( i, i ) = cone
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_clarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tauq( i ) ), &
+                           a( i, i+1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    alpha = a( i, i+1 )
+                    call stdlib_clarfg( n-i, alpha, a( i, min( i+2, n ) ),lda, taup( i ) )
+                    e( i ) = real( alpha,KIND=sp)
+                    a( i, i+1 ) = cone
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_clarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = czero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_clacgv( n-i+1, a( i, i ), lda )
+                 alpha = a( i, i )
+                 call stdlib_clarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=sp)
+                 a( i, i ) = cone
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_clarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 call stdlib_clacgv( n-i+1, a( i, i ), lda )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_clarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=sp)
+                    a( i+1, i ) = cone
+                    ! apply h(i)**h to a(i+1:m,i+1:n) from the left
+                    call stdlib_clarf( 'LEFT', m-i, n-i, a( i+1, i ), 1,conjg( tauq( i ) ), a( i+&
+                              1, i+1 ), lda,work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = czero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_cgebd2
+
+     !> CGECON: estimates the reciprocal of the condition number of a general
+     !> complex matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by CGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_cgecon( norm, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, scale, sl, smlnum, su
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_clatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           rwork, info )
+                 ! multiply by inv(u).
+                 call stdlib_clatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, rwork( n+1 ), info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_clatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, su, rwork( n+1 ),info )
+                 ! multiply by inv(l**h).
+                 call stdlib_clatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT', normin,n, a, lda, &
+                           work, sl, rwork, info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_icamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_csrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_cgecon
+
+     !> CGEHD2: reduces a complex general matrix A to upper Hessenberg form H
+     !> by a unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_cgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              alpha = a( i+1, i )
+              call stdlib_clarfg( ihi-i, alpha, a( min( i+2, n ), i ), 1, tau( i ) )
+              a( i+1, i ) = cone
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_clarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i)**h to a(i+1:ihi,i+1:n) from the left
+              call stdlib_clarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1,conjg( tau( i ) ), a( i+1, i+&
+                        1 ), lda, work )
+              a( i+1, i ) = alpha
+           end do
+           return
+     end subroutine stdlib_cgehd2
+
+     !> CGELQ2: computes an LQ factorization of a complex m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_cgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_clacgv( n-i+1, a( i, i ), lda )
+              alpha = a( i, i )
+              call stdlib_clarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 a( i, i ) = cone
+                 call stdlib_clarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+              end if
+              a( i, i ) = alpha
+              call stdlib_clacgv( n-i+1, a( i, i ), lda )
+           end do
+           return
+     end subroutine stdlib_cgelq2
+
+     !> CGELQF: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_cgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_cgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_clarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_clarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_cgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgelqf
+
+     !> CGELQT3: recursively computes a LQ factorization of a complex M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_cgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_clarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+              t(1,1)=conjg(t(1,1))
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_cgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) =  a(j1:m,1:n) q1^h [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', 'C', 'U', m2, m1, cone,a, lda, t( i1, 1 ), ldt )
+                        
+              call stdlib_cgemm( 'N', 'C', m2, m1, n-m1, cone, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        cone, t( i1, 1 ), ldt)
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', m2, m1, cone,t, ldt, t( i1, 1 ), ldt )
+                        
+              call stdlib_cgemm( 'N', 'N', m2, n-m1, m1, -cone, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        cone, a( i1, i1 ), lda )
+              call stdlib_ctrmm( 'R', 'U', 'N', 'U', m2, m1 , cone,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )= czero
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_cgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'U', 'C', 'U', m1, m2, cone,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_cgemm( 'N', 'C', m1, m2, n-m, cone, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, i1 ), ldt )
+              call stdlib_ctrmm( 'L', 'U', 'N', 'N', m1, m2, -cone, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', m1, m2, cone,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_cgelqt3
+
+     !> CGEMLQT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':   Q**H C            C Q**H
+     !> where Q is a complex unitary matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by CGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_cgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           complex(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_clarfb( 'L', 'C', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_clarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_clarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_clarfb( 'R', 'C', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_cgemlqt
+
+     !> CGEMQRT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':    Q**H C            C Q**H
+     !> where Q is a complex orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by CGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_cgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           complex(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_clarfb( 'L', 'C', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_clarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_clarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_clarfb( 'R', 'C', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_cgemqrt
+
+     !> CGEQL2: computes a QL factorization of a complex m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_cgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_clarfg( m-k+i, alpha, a( 1, n-k+i ), 1, tau( i ) )
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i-1) from the left
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_clarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1,conjg( tau( i ) ), a, &
+                        lda, work )
+              a( m-k+i, n-k+i ) = alpha
+           end do
+           return
+     end subroutine stdlib_cgeql2
+
+     !> CGEQLF: computes a QL factorization of a complex M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_cgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_cgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_clarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_clarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'BACKWARD','COLUMNWISE', m-&
+                    k+i+ib-1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_cgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgeqlf
+
+     !> CGEQR2: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_cgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_clarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_clarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_cgeqr2
+
+     !> CGEQR2P: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_cgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_clarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_clarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_cgeqr2p
+
+     !> CGEQRF: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_cgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_cgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_clarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_clarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_cgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgeqrf
+
+     !> CGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_cgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_cgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_clarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_clarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_cgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgeqrfp
+
+     !> CGEQRT2: computes a QR factorization of a complex M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_cgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_clarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = cone
+                 ! w(1:n-i) := a(i:m,i+1:n)**h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_cgemv( 'C',m-i+1, n-i, cone, a( i, i+1 ), lda,a( i, i ), 1, czero, t(&
+                            1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)**h
+                 alpha = -conjg(t( i, 1 ))
+                 call stdlib_cgerc( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), &
+                           lda )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = cone
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**h * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_cgemv( 'C', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, czero, t( 1,&
+                         i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_ctrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = czero
+           end do
+     end subroutine stdlib_cgeqrt2
+
+     !> CGEQRT3: recursively computes a QR factorization of a complex M-by-N matrix A,
+     !> using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_cgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_clarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1**h
+              call stdlib_cgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1**h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_ctrmm( 'L', 'L', 'C', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              call stdlib_cgemm( 'C', 'N', n1, n2, m-n1, cone, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt)
+              call stdlib_ctrmm( 'L', 'U', 'C', 'N', n1, n2, cone,t, ldt, t( 1, j1 ), ldt )
+                        
+              call stdlib_cgemm( 'N', 'N', m-n1, n2, n1, -cone, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        cone, a( j1, j1 ), lda )
+              call stdlib_ctrmm( 'L', 'L', 'N', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2**h
+              call stdlib_cgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1**h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = conjg(a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_ctrmm( 'R', 'L', 'N', 'U', n1, n2, cone,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_cgemm( 'C', 'N', n1, n2, m-n, cone, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt )
+              call stdlib_ctrmm( 'L', 'U', 'N', 'N', n1, n2, -cone, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_ctrmm( 'R', 'U', 'N', 'N', n1, n2, cone,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_cgeqrt3
+
+     !> CGERQ2: computes an RQ factorization of a complex m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_cgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_clacgv( n-k+i, a( m-k+i, 1 ), lda )
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_clarfg( n-k+i, alpha, a( m-k+i, 1 ), lda,tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_clarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = alpha
+              call stdlib_clacgv( n-k+i-1, a( m-k+i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_cgerq2
+
+     !> CGERQF: computes an RQ factorization of a complex M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_cgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'CGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_cgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_clarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_clarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_cgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgerqf
+
+     !> CGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by CGETC2.
+
+     pure subroutine stdlib_cgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, eps, smlnum
+           complex(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,real
+           ! Executable Statements 
+           ! set constant to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_claswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_icamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = cmplx( one / two, zero,KIND=sp) / abs( rhs( i ) )
+              call stdlib_cscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*real( temp,KIND=sp)
+           end if
+           do i = n, 1, -1
+              temp = cmplx( one, zero,KIND=sp) / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_claswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_cgesc2
+
+     !> CGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_cgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: sfmin
+           complex(sp) :: temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for cone row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==czero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for cone column case
+              ! compute machine safe minimum
+              sfmin = stdlib_slamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_icamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=czero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_cscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_cgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_claswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_ctrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_cgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_cgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_claswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_cgetrf2
+
+     !> CGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by CGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_cgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_ctrtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_ctrtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = czero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_cgemv( 'NO TRANSPOSE', n, n-j, -cone, a( 1, j+1 ),lda, work(&
+                            j+1 ), 1, cone, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = czero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           cone, a( 1, j+jb ), lda,work( j+jb ), ldwork, cone, a( 1, j ), lda )
+                 call stdlib_ctrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,cone, work( j &
+                           ), ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_cswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_cgetri
+
+     !> CGETRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by CGETRF.
+
+     pure subroutine stdlib_cgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_claswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,cone, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a**t * x = b  or a**h * x = b.
+              ! solve u**t *x = b or u**h *x = b, overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'UPPER', trans, 'NON-UNIT', n, nrhs, cone,a, lda, b, ldb &
+                        )
+              ! solve l**t *x = b, or l**h *x = b overwriting b with x.
+              call stdlib_ctrsm( 'LEFT', 'LOWER', trans, 'UNIT', n, nrhs, cone, a,lda, b, ldb )
+                        
+              ! apply row interchanges to the solution vectors.
+              call stdlib_claswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_cgetrs
+
+     !> CGGHRD: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the generalized
+     !> eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then CGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_cgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(sp) :: c
+           complex(sp) :: ctemp, s
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_claset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_claset( 'FULL', n, n, czero, cone, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = czero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 ctemp = a( jrow-1, jcol )
+                 call stdlib_clartg( ctemp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = czero
+                 call stdlib_crot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_crot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_crot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c,conjg( s ) )
+                           
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 ctemp = b( jrow, jrow )
+                 call stdlib_clartg( ctemp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = czero
+                 call stdlib_crot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_crot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_crot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_cgghrd
+
+     !> CGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix,
+     !> and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**H * (inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z' denotes the
+     !> conjugate transpose of matrix Z.
+
+     pure subroutine stdlib_cggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'CGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'CGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'CUNMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p)*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_cgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=sp)
+           ! update b := q**h*b.
+           call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, p, min( n, m ), a,lda, taua, b, &
+                     ldb, work, lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_cgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_cggqrf
+
+     !> CGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**H
+     !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+     !> conjugate transpose of the matrix Z.
+
+     pure subroutine stdlib_cggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'CGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'CGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'CUNMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p)*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_cgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=sp)
+           ! update b := b*q**h
+           call stdlib_cunmrq( 'RIGHT', 'CONJUGATE TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 &
+                     ), 1 ), lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_cgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_cggrqf
+
+     !> CGTTRS: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by CGTTRF.
+
+     pure subroutine stdlib_cgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else if( trans=='T' .or. trans=='T' ) then
+              itrans = 1
+           else
+              itrans = 2
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'CGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_cgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_cgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_cgttrs
+
+     !> CHB2ST_KERNELS: is an internal routine used by the CHETRD_HB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_chb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           complex(sp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = conjg( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = czero
+                   end do
+                   ctmp = conjg( a( ofdpos, st ) )
+                   call stdlib_clarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_clarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_clarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_clarfx( 'LEFT', ln, lm, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos-nb, j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )          =conjg( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = czero
+                       end do
+                       ctmp = conjg( a( dpos-nb, j1 ) )
+                       call stdlib_clarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_clarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = czero
+                   end do
+                   call stdlib_clarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_clarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_clarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_clarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = czero
+                       end do
+                       call stdlib_clarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_clarfx( 'LEFT', lm, ln-1, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos+nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_chb2st_kernels
+
+     !> CHEEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_cheequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(sp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,int,log,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'CHEEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_sp / s( j )
+           end do
+           tol = one / sqrt( 2.0_sp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + real( s( i )*work( i ),KIND=sp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_classq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = real( ( n-2 ) * ( work( i ) - t*si ),KIND=sp)
+                 c0 = real( -(t*si)*si + 2*work( i )*si - n*avg,KIND=sp)
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + real( ( u + work( i ) ) * d / n,KIND=sp)
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_slamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_slamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_cheequb
+
+     !> CHEGS2: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H *A*L.
+     !> B must have been previously factorized as U**H *U or L*L**H by ZPOTRF.
+
+     pure subroutine stdlib_chegs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(sp) :: akk, bkk
+           complex(sp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=sp)
+                    bkk = real( b( k, k ),KIND=sp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_csscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_clacgv( n-k, a( k, k+1 ), lda )
+                       call stdlib_clacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_caxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_cher2( uplo, n-k, -cone, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_caxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_clacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_ctrsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-k, b( k+1, k+&
+                                 1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_clacgv( n-k, a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=sp)
+                    bkk = real( b( k, k ),KIND=sp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_csscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_caxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_cher2( uplo, n-k, -cone, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1,&
+                                  k+1 ), lda )
+                       call stdlib_caxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_ctrsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=sp)
+                    bkk = real( b( k, k ),KIND=sp)
+                    call stdlib_ctrmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_caxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_cher2( uplo, k-1, cone, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                              
+                    call stdlib_caxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_csscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=sp)
+                    bkk = real( b( k, k ),KIND=sp)
+                    call stdlib_clacgv( k-1, a( k, 1 ), lda )
+                    call stdlib_ctrmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', k-1,b, ldb, a( k, &
+                              1 ), lda )
+                    ct = half*akk
+                    call stdlib_clacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_caxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_cher2( uplo, k-1, cone, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_caxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_clacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_csscal( k-1, bkk, a( k, 1 ), lda )
+                    call stdlib_clacgv( k-1, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_chegs2
+
+     !> CHEGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by CPOTRF.
+
+     pure subroutine stdlib_chegst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CHEGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_chegs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**h)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_chegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_ctrsm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, &
+                                    n-k-kb+1, cone,b( k, k ), ldb, a( k, k+kb ), lda )
+                          call stdlib_chemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_cher2k( uplo, 'CONJUGATE TRANSPOSE', n-k-kb+1,kb, -cone, a( &
+                                    k, k+kb ), lda,b( k, k+kb ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_chemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_ctrsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, cone,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**h)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_chegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_ctrsm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', n-k-&
+                                    kb+1, kb, cone,b( k, k ), ldb, a( k+kb, k ), lda )
+                          call stdlib_chemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_cher2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-cone, a( k+kb, &
+                                    k ), lda,b( k+kb, k ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_chemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_ctrsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, cone,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**h
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_ctrmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, cone, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_chemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_cher2k( uplo, 'NO TRANSPOSE', k-1, kb, cone,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_chemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_ctrmm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', k-1, &
+                                 kb, cone, b( k, k ), ldb,a( 1, k ), lda )
+                       call stdlib_chegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**h*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_ctrmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, cone,&
+                                  b, ldb, a( k, 1 ), lda )
+                       call stdlib_chemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_cher2k( uplo, 'CONJUGATE TRANSPOSE', k-1, kb,cone, a( k, 1 ), &
+                                 lda, b( k, 1 ), ldb,one, a, lda )
+                       call stdlib_chemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_ctrmm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, k-1,&
+                                  cone, b( k, k ), ldb,a( k, 1 ), lda )
+                       call stdlib_chegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_chegst
+
+     !> CHETD2: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_chetd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(sp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              a( n, n ) = real( a( n, n ),KIND=sp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = a( i, i+1 )
+                 call stdlib_clarfg( i, alpha, a( 1, i+1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=sp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = cone
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_chemv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, czero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_cdotc( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_caxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_cher2( uplo, i, -cone, a( 1, i+1 ), 1, tau, 1, a,lda )
+                 else
+                    a( i, i ) = real( a( i, i ),KIND=sp)
+                 end if
+                 a( i, i+1 ) = e( i )
+                 d( i+1 ) = real( a( i+1, i+1 ),KIND=sp)
+                 tau( i ) = taui
+              end do
+              d( 1 ) = real( a( 1, 1 ),KIND=sp)
+           else
+              ! reduce the lower triangle of a
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=sp)
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = a( i+1, i )
+                 call stdlib_clarfg( n-i, alpha, a( min( i+2, n ), i ), 1, taui )
+                 e( i ) = real( alpha,KIND=sp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = cone
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_chemv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, czero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_cdotc( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_caxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_cher2( uplo, n-i, -cone, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 )&
+                              , lda )
+                 else
+                    a( i+1, i+1 ) = real( a( i+1, i+1 ),KIND=sp)
+                 end if
+                 a( i+1, i ) = e( i )
+                 d( i ) = real( a( i, i ),KIND=sp)
+                 tau( i ) = taui
+              end do
+              d( n ) = real( a( n, n ),KIND=sp)
+           end if
+           return
+     end subroutine stdlib_chetd2
+
+     !> CHETRD: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_chetrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'CHETRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'CHETRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_clatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_cher2k( uplo, 'NO TRANSPOSE', i-1, nb, -cone,a( 1, i ), lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = real( a( j, j ),KIND=sp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_chetd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_clatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+nb:n,i+nb:n), using
+                 ! an update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_cher2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -cone,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = real( a( j, j ),KIND=sp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_chetd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chetrd
+
+     !> CHETRD_HB2ST: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_chetrd_hb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+           complex(sp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sicev, sizetau, ldv, lhmin, lwmin
+           real(sp) :: abstmp
+           complex(sp) :: tmp
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'CHETRD_HB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'CHETRD_HB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRD_HB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sicev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! complex because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=sp)
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=sp)
+               end do
+               ! make off-diagonal elements real and copy them to e
+               if( upper ) then
+                   do i = 1, n - 1
+                       tmp = ab( abofdpos, i+1 )
+                       abstmp = abs( tmp )
+                       ab( abofdpos, i+1 ) = abstmp
+                       e( i ) = abstmp
+                       if( abstmp/=rzero ) then
+                          tmp = tmp / abstmp
+                       else
+                          tmp = cone
+                       end if
+                       if( i<n-1 )ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
+                        ! if( wantz ) then
+                           ! call stdlib_cscal( n, conjg( tmp ), q( 1, i+1 ), 1 )
+                        ! end if
+                   end do
+               else
+                   do i = 1, n - 1
+                      tmp = ab( abofdpos, i )
+                      abstmp = abs( tmp )
+                      ab( abofdpos, i ) = abstmp
+                      e( i ) = abstmp
+                      if( abstmp/=rzero ) then
+                         tmp = tmp / abstmp
+                      else
+                         tmp = cone
+                      end if
+                      if( i<n-1 )ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
+                       ! if( wantq ) then
+                          ! call stdlib_cscal( n, tmp, q( 1, i+1 ), 1 )
+                       ! end if
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the hermitian band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=sp)/real(kd,KIND=sp) )
+           stepercol = ceiling( real(shift,KIND=sp)/real(grsiz,KIND=sp) )
+           thgrnb    = ceiling( real(n-1,KIND=sp)/real(thgrsiz,KIND=sp) )
+           call stdlib_clacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_claset( "A", kd,   n, czero, czero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_chb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                               !$OMP END TASK
+                                   !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_chb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                            indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           do i = 1, n
+               d( i ) = real( work( dpos+(i-1)*lda ),KIND=sp)
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+i*lda ),KIND=sp)
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+(i-1)*lda ),KIND=sp)
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_chetrd_hb2st
+
+     !> CHETRD_HE2HB: reduces a complex Hermitian matrix A to complex Hermitian
+     !> band-diagonal form AB by a unitary similarity transformation:
+     !> Q**H * A * Q = AB.
+
+     pure subroutine stdlib_chetrd_he2hb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: rone = 1.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'CHETRD_HE2HB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRD_HE2HB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_ccopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_ccopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to czero once such a
+           ! way every time t is generated the upper/lower portion will be always czero
+           call stdlib_claset( "A", ldt, kd, czero, czero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_cgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_ccopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_claset( 'LOWER', pk, pk, czero, cone,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_clarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,cone,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,czero, work( s2pos ), lds2 )
+                  call stdlib_chemm( 'RIGHT', uplo, pk, pn,cone,  a( i+kd, i+kd ), lda,work( &
+                            s2pos ), lds2,czero, work( wpos ), ldw )
+                  call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,cone,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,czero, work( s1pos ), lds1 )
+                  call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-chalf, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,cone,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_cher2k( uplo, 'CONJUGATE', pn, pk,-cone, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_ccopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_cgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_ccopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_claset( 'UPPER', pk, pk, czero, cone,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_clarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,cone, a( i+kd, i )&
+                            , lda,work( tpos ), ldt,czero, work( s2pos ), lds2 )
+                  call stdlib_chemm( 'LEFT', uplo, pn, pk,cone, a( i+kd, i+kd ), lda,work( s2pos )&
+                            , lds2,czero, work( wpos ), ldw )
+                  call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,cone, work( s2pos ), &
+                            lds2,work( wpos ), ldw,czero, work( s1pos ), lds1 )
+                  call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-chalf, a( i+kd, &
+                            i ), lda,work( s1pos ), lds1,cone, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_cher2k( uplo, 'NO TRANSPOSE', pn, pk,-cone, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_ccopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_ccopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_chetrd_he2hb
+
+     !> CHETRF: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_chetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CHETRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CHETRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clahef( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_chetf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clahef( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_chetf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chetrf
+
+     !> CHETRF_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_chetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CHETRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CHETRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clahef_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_chetf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_cswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clahef_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_chetf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_cswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chetrf_rk
+
+     !> CHETRF_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_chetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'CHETRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'CHETRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_clahef_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_chetf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_clahef_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_clahef_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_chetf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chetrf_rook
+
+     !> CHETRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF.
+
+     pure subroutine stdlib_chetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(sp) :: s
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( a( k, k ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( a( k, k ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_chetrs
+
+     !> CHETRS2: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF and converted by CSYCONV.
+
+     pure subroutine stdlib_chetrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(sp) :: s
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_csyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_ctrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=sp) / real( a( i, i ),KIND=sp)
+                   call stdlib_csscal( nrhs, s, b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / conjg( akm1k )
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / conjg( akm1k )
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_ctrsm('L','U','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_ctrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=sp) / real( a( i, i ),KIND=sp)
+                   call stdlib_csscal( nrhs, s, b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / conjg( akm1k )
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / conjg( akm1k )
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+             call stdlib_ctrsm('L','L','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_csyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_chetrs2
+
+     !> CHETRS_AA: solves a system of linear equations A*X = B with a complex
+     !> hermitian matrix A using the factorization A = U**H*T*U or
+     !> A = L*T*L**H computed by CHETRF_AA.
+
+     pure subroutine stdlib_chetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**h*t*u.
+              ! 1) forward substitution with u**h
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 k = 1
+                 do while ( k<=n )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k + 1
+                 end do
+                 ! compute u**h \ b -> b    [ (u**h \p**t * b) ]
+                 call stdlib_ctrsm( 'L', 'U', 'C', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**h \p**t * b) ]
+              call stdlib_clacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_clacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1)
+                  call stdlib_clacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1)
+                  call stdlib_clacgv( n-1, work( 1 ), 1 )
+              end if
+              call stdlib_cgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**h \p**t * b) ) ]
+                 call stdlib_ctrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b(2, 1), &
+                           ldb)
+                 ! pivot, p * b  -> b [ p * (u \ (t \ (u**h \p**t * b) )) ]
+                 k = n
+                 do while ( k>=1 )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k - 1
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**h.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 k = 1
+                 do while ( k<=n )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k + 1
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_ctrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1),lda, b(2, 1), &
+                           ldb )
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_clacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_clacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                  call stdlib_clacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1)
+                  call stdlib_clacgv( n-1, work( 2*n ), 1 )
+              end if
+              call stdlib_cgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with l**h
+              if( n>1 ) then
+                 ! compute (l**h \ b) -> b   [ l**h \ (t \ (l \p**t * b) ) ]
+                 call stdlib_ctrsm( 'L', 'L', 'C', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb )
+                 ! pivot, p * b -> b  [ p * (l**h \ (t \ (l \p**t * b) )) ]
+                 k = n
+                 do while ( k>=1 )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k - 1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_chetrs_aa
+
+     !> CHETRS_ROOK: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF_ROOK.
+
+     pure subroutine stdlib_chetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(sp) :: s
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( a( k, k ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1)
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_cgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( a( k, k ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_chetrs_rook
+
+     !> CHPTRD: reduces a complex Hermitian matrix A stored in packed form to
+     !> real symmetric tridiagonal form T by a unitary similarity
+     !> transformation: Q**H * A * Q = T.
+
+     pure subroutine stdlib_chptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           complex(sp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              ap( i1+n-1 ) = real( ap( i1+n-1 ),KIND=sp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = ap( i1+i-1 )
+                 call stdlib_clarfg( i, alpha, ap( i1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=sp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_chpmv( uplo, i, taui, ap, ap( i1 ), 1, czero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_cdotc( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_caxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_chpr2( uplo, i, -cone, ap( i1 ), 1, tau, 1, ap )
+                 end if
+                 ap( i1+i-1 ) = e( i )
+                 d( i+1 ) = real( ap( i1+i ),KIND=sp)
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = real( ap( 1 ),KIND=sp)
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              ap( 1 ) = real( ap( 1 ),KIND=sp)
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = ap( ii+1 )
+                 call stdlib_clarfg( n-i, alpha, ap( ii+2 ), 1, taui )
+                 e( i ) = real( alpha,KIND=sp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_chpmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,czero, tau( i ),&
+                               1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_cdotc( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_caxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_chpr2( uplo, n-i, -cone, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                 end if
+                 ap( ii+1 ) = e( i )
+                 d( i ) = real( ap( ii ),KIND=sp)
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = real( ap( ii ),KIND=sp)
+           end if
+           return
+     end subroutine stdlib_chptrd
+
+     !> CHPTRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A stored in packed format using the factorization
+     !> A = U*D*U**H or A = L*D*L**H computed by CHPTRF.
+
+     pure subroutine stdlib_chptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(sp) :: s
+           complex(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_cgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( ap( kc+k-1 ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_cswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_cgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_cgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc+k ),&
+                               1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_cgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=sp) / real( ap( kc ),KIND=sp)
+                 call stdlib_csscal( nrhs, s, b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_cswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_cgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / conjg( akm1k )
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc-( n-k ) ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_clacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_cswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_chptrs
+
+     !> CLA_GBRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a REAL vector.
+
+     real(sp) function stdlib_cla_gbrcond_c( trans, n, kl, ku, ab, ldab, afb,ldafb, ipiv, c, &
+               capply, info, work,rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, kl, ku, ldab, ldafb
+           integer(ilp) :: kd, ke
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(in) :: c(*)
+           real(sp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, anorm, tmp
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_cla_gbrcond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLA_GBRCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_cla_gbrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_cgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_cgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv, &
+                              work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_cgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv,  &
+                              work, n, info )
+                 else
+                    call stdlib_cgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_cla_gbrcond_c = one / ainvnm
+           return
+     end function stdlib_cla_gbrcond_c
+
+     !> CLA_GERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a REAL vector.
+
+     real(sp) function stdlib_cla_gercond_c( trans, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+               work, rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(in) :: c(*)
+           real(sp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, anorm, tmp
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_cla_gercond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLA_GERCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_cla_gercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if (notrans) then
+                    call stdlib_cgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_cgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_cgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_cgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_cla_gercond_c = one / ainvnm
+           return
+     end function stdlib_cla_gercond_c
+
+     !> CLA_HERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a REAL vector.
+
+     real(sp) function stdlib_cla_hercond_c( uplo, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+               work, rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(in) :: c(*)
+           real(sp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, anorm, tmp
+           logical(lk) :: up, upper
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_cla_hercond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLA_HERCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_cla_hercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_chetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_chetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_chetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_chetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_cla_hercond_c = one / ainvnm
+           return
+     end function stdlib_cla_hercond_c
+
+     !> CLA_HERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(sp) function stdlib_cla_herpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(sp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper, lsame
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if (upper) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i,j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i,j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_csytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ) , work( k+1 ) )
+                    end do
+                    work(k) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_cla_herpvgrw = rpvgrw
+     end function stdlib_cla_herpvgrw
+
+     !> CLA_PORCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a REAL vector
+
+     real(sp) function stdlib_cla_porcond_c( uplo, n, a, lda, af, ldaf, c, capply,info, work, &
+               rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(in) :: c(*)
+           real(sp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(sp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_cla_porcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLA_PORCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_cla_porcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_cpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_cpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_cpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_cpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_cla_porcond_c = one / ainvnm
+           return
+     end function stdlib_cla_porcond_c
+
+     !> CLA_SYRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a REAL vector.
+
+     real(sp) function stdlib_cla_syrcond_c( uplo, n, a, lda, af, ldaf, ipiv, c,capply, info, &
+               work, rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(in) :: c(*)
+           real(sp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(sp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_cla_syrcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLA_SYRCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_cla_syrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_csytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_csytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_csytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_csytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_cla_syrcond_c = one / ainvnm
+           return
+     end function stdlib_cla_syrcond_c
+
+     !> CLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(sp) function stdlib_cla_syrpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(sp), intent(out) :: work(*)
+           integer(ilp), intent(in) :: ipiv(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(sp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           complex(sp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_csytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_cla_syrpvgrw = rpvgrw
+     end function stdlib_cla_syrpvgrw
+
+     !> CLABRD: reduces the first NB rows and columns of a complex general
+     !> m by n matrix A to upper or lower real bidiagonal form by a unitary
+     !> transformation Q**H * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by CGEBRD
+
+     pure subroutine stdlib_clabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_clacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_cgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, cone, a( i, i ), 1 )
+                 call stdlib_clacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_cgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, x( i, 1 ),ldx, a( 1, i ), &
+                           1, cone, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_clarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=sp)
+                 if( i<n ) then
+                    a( i, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i+1, n-i, cone,a( i, i+1 ), lda, &
+                              a( i, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,a( i, 1 ), lda, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,x( i, 1 ), ldx, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, y(&
+                               1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_cscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_clacgv( i, a( i, 1 ), lda )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-i, i, -cone, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, cone, a( i, i+1 ), lda )
+                    call stdlib_clacgv( i, a( i, 1 ), lda )
+                    call stdlib_clacgv( i-1, x( i, 1 ), ldx )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, x(&
+                               i, 1 ), ldx, cone,a( i, i+1 ), lda )
+                    call stdlib_clacgv( i-1, x( i, 1 ), ldx )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    alpha = a( i, i+1 )
+                    call stdlib_clarfg( n-i, alpha, a( i, min( i+2, n ) ),lda, taup( i ) )
+                    e( i ) = real( alpha,KIND=sp)
+                    a( i, i+1 ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, n-i, cone, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-i, i, cone,y( i+1, 1 ), ldy, a( i,&
+                               i+1 ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i, -cone, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, cone, x( i+1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, czero, x( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_cscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_clacgv( n-i, a( i, i+1 ), lda )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_clacgv( n-i+1, a( i, i ), lda )
+                 call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, cone, a( i, i ), lda )
+                 call stdlib_clacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_clacgv( i-1, x( i, 1 ), ldx )
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i-1, n-i+1, -cone,a( 1, i ), lda, x( i,&
+                            1 ), ldx, cone, a( i, i ),lda )
+                 call stdlib_clacgv( i-1, x( i, 1 ), ldx )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 alpha = a( i, i )
+                 call stdlib_clarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=sp)
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, n-i+1, cone, a( i+1, i ),lda, a( i, i &
+                              ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-i+1, i-1, cone,y( i, 1 ), ldy, a( &
+                              i, i ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', i-1, n-i+1, cone, a( 1, i ),lda, a( i, i ),&
+                               lda, czero, x( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_cscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_clacgv( n-i+1, a( i, i ), lda )
+                    ! update a(i+1:m,i)
+                    call stdlib_clacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, y( i, 1 )&
+                              , ldy, cone, a( i+1, i ), 1 )
+                    call stdlib_clacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_cgemv( 'NO TRANSPOSE', m-i, i, -cone, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, cone, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_clarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=sp)
+                    a( i+1, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i, n-i, cone,a( i+1, i+1 ), lda, &
+                              a( i+1, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_cgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-i, i, cone,x( i+1, 1 ), ldx, a( i+&
+                              1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_cgemv( 'CONJUGATE TRANSPOSE', i, n-i, -cone,a( 1, i+1 ), lda, y( &
+                              1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_cscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 else
+                    call stdlib_clacgv( n-i+1, a( i, i ), lda )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_clabrd
+
+     !> CLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense or banded
+     !> Hermitian matrix that has been reduced to tridiagonal form.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**H ) Q**H(in) = Q(out) * D(out) * Q**H(out)
+     !> where Z = Q**Hu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine SLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine SLAED4 (as called by SLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_claed7( n, cutpnt, qsiz, tlvls, curlvl, curpbm, d, q,ldq, rho, indxq, &
+               qstore, qptr, prmptr, perm,givptr, givcol, givnum, work, rwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(sp), intent(inout) :: d(*), givnum(2,*), qstore(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: q(ldq,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq, iw, iz, k, n1, n2, &
+                     ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq<0 .or. icompq>1 ) then
+              ! info = -1
+           ! else if( n<0 ) then
+           if( n<0 ) then
+              info = -1
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_slaed2 and stdlib_slaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_slaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, rwork( iz ),rwork( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_claed8( k, n, qsiz, q, ldq, d, rho, cutpnt, rwork( iz ),rwork( idlmda ), &
+           work, qsiz, rwork( iw ),iwork( indxp ), iwork( indx ), indxq,perm( prmptr( curr ) ), &
+           givptr( curr+1 ),givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), info )
+                     
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_slaed9( k, 1, k, n, d, rwork( iq ), k, rho,rwork( idlmda ), rwork( iw ),&
+                        qstore( qptr( curr ) ), k, info )
+              call stdlib_clacrm( qsiz, k, work, qsiz, qstore( qptr( curr ) ), k, q,ldq, rwork( &
+                        iq ) )
+              qptr( curr+1 ) = qptr( curr ) + k**2
+              if( info/=0 ) then
+                 return
+              end if
+           ! prepare the indxq sorting premutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_slamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           return
+     end subroutine stdlib_claed7
+
+     !> CLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue W of a complex upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_claein( rightv, noinit, n, h, ldh, w, v, b, ldb, rwork,eps3, smlnum, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(sp), intent(in) :: eps3, smlnum
+           complex(sp), intent(in) :: w
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: b(ldb,*)
+           complex(sp), intent(in) :: h(ldh,*)
+           complex(sp), intent(inout) :: v(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: tenth = 1.0e-1_sp
+           
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, ierr, its, j
+           real(sp) :: growto, nrmsml, rootn, rtemp, scale, vnorm
+           complex(sp) :: cdum, ei, ej, temp, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=sp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - w*i (except that the subdiagonal elements are not
+           ! stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - w
+           end do
+           if( noinit ) then
+              ! initialize v.
+              do i = 1, n
+                 v( i ) = eps3
+              end do
+           else
+              ! scale supplied initial vector.
+              vnorm = stdlib_scnrm2( n, v, 1 )
+              call stdlib_csscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), v, 1 )
+           end if
+           if( rightv ) then
+              ! lu decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do i = 1, n - 1
+                 ei = h( i+1, i )
+                 if( cabs1( b( i, i ) )<cabs1( ei ) ) then
+                    ! interchange rows and eliminate.
+                    x = stdlib_cladiv( b( i, i ), ei )
+                    b( i, i ) = ei
+                    do j = i + 1, n
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( i, i )==czero )b( i, i ) = eps3
+                    x = stdlib_cladiv( ei, b( i, i ) )
+                    if( x/=czero ) then
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( n, n )==czero )b( n, n ) = eps3
+              trans = 'N'
+           else
+              ! ul decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do j = n, 2, -1
+                 ej = h( j, j-1 )
+                 if( cabs1( b( j, j ) )<cabs1( ej ) ) then
+                    ! interchange columns and eliminate.
+                    x = stdlib_cladiv( b( j, j ), ej )
+                    b( j, j ) = ej
+                    do i = 1, j - 1
+                       temp = b( i, j-1 )
+                       b( i, j-1 ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( j, j )==czero )b( j, j ) = eps3
+                    x = stdlib_cladiv( ej, b( j, j ) )
+                    if( x/=czero ) then
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( 1, 1 )==czero )b( 1, 1 ) = eps3
+              trans = 'C'
+           end if
+           normin = 'N'
+           do its = 1, n
+              ! solve u*x = scale*v for a right eigenvector
+                ! or u**h *x = scale*v for a left eigenvector,
+              ! overwriting x on v.
+              call stdlib_clatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb, v,scale, rwork, &
+                        ierr )
+              normin = 'Y'
+              ! test for sufficient growth in the norm of v.
+              vnorm = stdlib_scasum( n, v, 1 )
+              if( vnorm>=growto*scale )go to 120
+              ! choose new orthogonal starting vector and try again.
+              rtemp = eps3 / ( rootn+one )
+              v( 1 ) = eps3
+              do i = 2, n
+                 v( i ) = rtemp
+              end do
+              v( n-its+1 ) = v( n-its+1 ) - eps3*rootn
+           end do
+           ! failure to find eigenvector in n iterations.
+           info = 1
+           120 continue
+           ! normalize eigenvector.
+           i = stdlib_icamax( n, v, 1 )
+           call stdlib_csscal( n, one / cabs1( v( i ) ), v, 1 )
+           return
+     end subroutine stdlib_claein
+
+     !> CLAGS2: computes 2-by-2 unitary matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**H *A*Q = U**H *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**H*B*Q = V**H *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**H *A*Q = U**H *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**H *B*Q = V**H *( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> where
+     !> U = (   CSU    SNU ), V = (  CSV    SNV ),
+     !> ( -SNU**H  CSU )      ( -SNV**H CSV )
+     !> Q = (   CSQ    SNQ )
+     !> ( -SNQ**H  CSQ )
+     !> The rows of the transformed A and B are parallel. Moreover, if the
+     !> input 2-by-2 matrix A is not zero, then the transformed (1,1) entry
+     !> of A is not zero. If the input matrices A and B are both not zero,
+     !> then the transformed (2,2) element of B is not zero, except when the
+     !> first rows of input A and B are parallel and the second rows are
+     !> zero.
+
+     pure subroutine stdlib_clags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(sp), intent(in) :: a1, a3, b1, b3
+           real(sp), intent(out) :: csq, csu, csv
+           complex(sp), intent(in) :: a2, b2
+           complex(sp), intent(out) :: snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: a, aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, csl, csr, d, fb,&
+                      fc, s1, s2, snl, snr, ua11r, ua22r, vb11r, vb22r
+           complex(sp) :: b, c, d1, r, t, ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,real
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( t ) = abs( real( t,KIND=sp) ) + abs( aimag( t ) )
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              fb = abs( b )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(1,d1).
+              d1 = one
+              if( fb/=zero )d1 = b / fb
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_slasv2( a, fb, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + d1*snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + d1*snr*b3
+                 aua12 = abs( csl )*abs1( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs1( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**h *a and v**h *b
+                 if( ( abs( ua11r )+abs1( ua12 ) )==zero ) then
+                    call stdlib_clartg( -cmplx( vb11r,KIND=sp), conjg( vb12 ), csq, snq,r )
+                              
+                 else if( ( abs( vb11r )+abs1( vb12 ) )==zero ) then
+                    call stdlib_clartg( -cmplx( ua11r,KIND=sp), conjg( ua12 ), csq, snq,r )
+                              
+                 else if( aua12 / ( abs( ua11r )+abs1( ua12 ) )<=avb12 /( abs( vb11r )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_clartg( -cmplx( ua11r,KIND=sp), conjg( ua12 ), csq, snq,r )
+                              
+                 else
+                    call stdlib_clartg( -cmplx( vb11r,KIND=sp), conjg( vb12 ), csq, snq,r )
+                              
+                 end if
+                 csu = csl
+                 snu = -d1*snl
+                 csv = csr
+                 snv = -d1*snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -conjg( d1 )*snl*a1
+                 ua22 = -conjg( d1 )*snl*a2 + csl*a3
+                 vb21 = -conjg( d1 )*snr*b1
+                 vb22 = -conjg( d1 )*snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs1( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs1( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua21 )+abs1( ua22 ) )==zero ) then
+                    call stdlib_clartg( -conjg( vb21 ), conjg( vb22 ), csq, snq, r )
+                 else if( ( abs1( vb21 )+abs( vb22 ) )==zero ) then
+                    call stdlib_clartg( -conjg( ua21 ), conjg( ua22 ), csq, snq, r )
+                 else if( aua22 / ( abs1( ua21 )+abs1( ua22 ) )<=avb22 /( abs1( vb21 )+abs1( vb22 &
+                           ) ) ) then
+                    call stdlib_clartg( -conjg( ua21 ), conjg( ua22 ), csq, snq, r )
+                 else
+                    call stdlib_clartg( -conjg( vb21 ), conjg( vb22 ), csq, snq, r )
+                 end if
+                 csu = snl
+                 snu = d1*csl
+                 csv = snr
+                 snv = d1*csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              fc = abs( c )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(d1,1).
+              d1 = one
+              if( fc/=zero )d1 = c / fc
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_slasv2( a, fc, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -d1*snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -d1*snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs1( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs1( b2 )
+                 ! zero (2,1) elements of u**h *a and v**h *b.
+                 if( ( abs1( ua21 )+abs( ua22r ) )==zero ) then
+                    call stdlib_clartg( cmplx( vb22r,KIND=sp), vb21, csq, snq, r )
+                 else if( ( abs1( vb21 )+abs( vb22r ) )==zero ) then
+                    call stdlib_clartg( cmplx( ua22r,KIND=sp), ua21, csq, snq, r )
+                 else if( aua21 / ( abs1( ua21 )+abs( ua22r ) )<=avb21 /( abs1( vb21 )+abs( vb22r &
+                           ) ) ) then
+                    call stdlib_clartg( cmplx( ua22r,KIND=sp), ua21, csq, snq, r )
+                 else
+                    call stdlib_clartg( cmplx( vb22r,KIND=sp), vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -conjg( d1 )*snr
+                 csv = csl
+                 snv = -conjg( d1 )*snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11 = csr*a1 + conjg( d1 )*snr*a2
+                 ua12 = conjg( d1 )*snr*a3
+                 vb11 = csl*b1 + conjg( d1 )*snl*b2
+                 vb12 = conjg( d1 )*snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs1( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs1( b2 )
+                 ! zero (1,1) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua11 )+abs1( ua12 ) )==zero ) then
+                    call stdlib_clartg( vb12, vb11, csq, snq, r )
+                 else if( ( abs1( vb11 )+abs1( vb12 ) )==zero ) then
+                    call stdlib_clartg( ua12, ua11, csq, snq, r )
+                 else if( aua11 / ( abs1( ua11 )+abs1( ua12 ) )<=avb11 /( abs1( vb11 )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_clartg( ua12, ua11, csq, snq, r )
+                 else
+                    call stdlib_clartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = conjg( d1 )*csr
+                 csv = snl
+                 snv = conjg( d1 )*csl
+              end if
+           end if
+           return
+     end subroutine stdlib_clags2
+
+     !> CLAHQR: is an auxiliary routine called by CHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by CHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_clahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, info &
+               )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: w(*)
+        ! =========================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0_sp
+           real(sp), parameter :: rone = 1.0_sp
+           real(sp), parameter :: dat1 = 3.0_sp/4.0_sp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           
+           ! Local Scalars 
+           complex(sp) :: cdum, h11, h11s, h22, sc, sum, t, t1, temp, u, v2, x, y
+           real(sp) :: aa, ab, ba, bb, h10, h21, rtemp, s, safmax, safmin, smlnum, sx, t2, tst, &
+                     ulp
+           integer(ilp) :: i, i1, i2, its, itmax, j, jhi, jlo, k, l, m, nh, nz, kdefl
+           ! Local Arrays 
+           complex(sp) :: v(2)
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              w( ilo ) = h( ilo, ilo )
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = czero
+              h( j+3, j ) = czero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = czero
+           ! ==== ensure that subdiagonal entries are real ====
+           if( wantt ) then
+              jlo = 1
+              jhi = n
+           else
+              jlo = ilo
+              jhi = ihi
+           end if
+           do i = ilo + 1, ihi
+              if( aimag( h( i, i-1 ) )/=rzero ) then
+                 ! ==== the following redundant normalization
+                 ! .    avoids problems with both gradual and
+                 ! .    sudden underflow in abs(h(i,i-1)) ====
+                 sc = h( i, i-1 ) / cabs1( h( i, i-1 ) )
+                 sc = conjg( sc ) / abs( sc )
+                 h( i, i-1 ) = abs( h( i, i-1 ) )
+                 call stdlib_cscal( jhi-i+1, sc, h( i, i ), ldh )
+                 call stdlib_cscal( min( jhi, i+1 )-jlo+1, conjg( sc ), h( jlo, i ),1 )
+                 if( wantz )call stdlib_cscal( ihiz-iloz+1, conjg( sc ), z( iloz, i ), 1 )
+              end if
+           end do
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=sp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo, or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           30 continue
+           if( i<ilo )go to 150
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           l = ilo
+           loop_130: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( cabs1( h( k, k-1 ) )<=smlnum )go to 50
+                 tst = cabs1( h( k-1, k-1 ) ) + cabs1( h( k, k ) )
+                 if( tst==czero ) then
+                    if( k-2>=ilo )tst = tst + abs( real( h( k-1, k-2 ),KIND=sp) )
+                    if( k+1<=ihi )tst = tst + abs( real( h( k+1, k ),KIND=sp) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some examples.  ====
+                 if( abs( real( h( k, k-1 ),KIND=sp) )<=ulp*tst ) then
+                    ab = max( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    ba = min( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    aa = max( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 50
+                 end if
+              end do
+              50 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = czero
+              end if
+              ! exit from loop if a submatrix of order 1 has split off.
+              if( l>=i )go to 140
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( i, i-1 ),KIND=sp) )
+                 t = s + h( i, i )
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( l+1, l ),KIND=sp) )
+                 t = s + h( l, l )
+              else
+                 ! wilkinson's shift.
+                 t = h( i, i )
+                 u = sqrt( h( i-1, i ) )*sqrt( h( i, i-1 ) )
+                 s = cabs1( u )
+                 if( s/=rzero ) then
+                    x = half*( h( i-1, i-1 )-t )
+                    sx = cabs1( x )
+                    s = max( s, cabs1( x ) )
+                    y = s*sqrt( ( x / s )**2+( u / s )**2 )
+                    if( sx>rzero ) then
+                       if( real( x / sx,KIND=sp)*real( y,KIND=sp)+aimag( x / sx )*aimag( y )&
+                                 <rzero )y = -y
+                    end if
+                    t = t - u*stdlib_cladiv( u, ( x+y ) )
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 1, l + 1, -1
+                 ! determine the effect of starting the single-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.
+                 h11 = h( m, m )
+                 h22 = h( m+1, m+1 )
+                 h11s = h11 - t
+                 h21 = real( h( m+1, m ),KIND=sp)
+                 s = cabs1( h11s ) + abs( h21 )
+                 h11s = h11s / s
+                 h21 = h21 / s
+                 v( 1 ) = h11s
+                 v( 2 ) = h21
+                 h10 = real( h( m, m-1 ),KIND=sp)
+                 if( abs( h10 )*abs( h21 )<=ulp*( cabs1( h11s )*( cabs1( h11 )+cabs1( h22 ) ) ) )&
+                           go to 70
+              end do
+              h11 = h( l, l )
+              h22 = h( l+1, l+1 )
+              h11s = h11 - t
+              h21 = real( h( l+1, l ),KIND=sp)
+              s = cabs1( h11s ) + abs( h21 )
+              h11s = h11s / s
+              h21 = h21 / s
+              v( 1 ) = h11s
+              v( 2 ) = h21
+              70 continue
+              ! single-shift qr step
+              loop_120: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge cone step toward the bottom of the active
+                 ! submatrix.
+                 ! v(2) is always real before the call to stdlib_clarfg, and hence
+                 ! after the call t2 ( = t1*v(2) ) is also real.
+                 if( k>m )call stdlib_ccopy( 2, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_clarfg( 2, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = czero
+                 end if
+                 v2 = v( 2 )
+                 t2 = real( t1*v2,KIND=sp)
+                 ! apply g from the left to transform the rows of the matrix
+                 ! in columns k to i2.
+                 do j = k, i2
+                    sum = conjg( t1 )*h( k, j ) + t2*h( k+1, j )
+                    h( k, j ) = h( k, j ) - sum
+                    h( k+1, j ) = h( k+1, j ) - sum*v2
+                 end do
+                 ! apply g from the right to transform the columns of the
+                 ! matrix in rows i1 to min(k+2,i).
+                 do j = i1, min( k+2, i )
+                    sum = t1*h( j, k ) + t2*h( j, k+1 )
+                    h( j, k ) = h( j, k ) - sum
+                    h( j, k+1 ) = h( j, k+1 ) - sum*conjg( v2 )
+                 end do
+                 if( wantz ) then
+                    ! accumulate transformations in the matrix z
+                    do j = iloz, ihiz
+                       sum = t1*z( j, k ) + t2*z( j, k+1 )
+                       z( j, k ) = z( j, k ) - sum
+                       z( j, k+1 ) = z( j, k+1 ) - sum*conjg( v2 )
+                    end do
+                 end if
+                 if( k==m .and. m>l ) then
+                    ! if the qr step was started at row m > l because two
+                    ! consecutive small subdiagonals were found, then extra
+                    ! scaling must be performed to ensure that h(m,m-1) remains
+                    ! real.
+                    temp = cone - t1
+                    temp = temp / abs( temp )
+                    h( m+1, m ) = h( m+1, m )*conjg( temp )
+                    if( m+2<=i )h( m+2, m+1 ) = h( m+2, m+1 )*temp
+                    do j = m, i
+                       if( j/=m+1 ) then
+                          if( i2>j )call stdlib_cscal( i2-j, temp, h( j, j+1 ), ldh )
+                          call stdlib_cscal( j-i1, conjg( temp ), h( i1, j ), 1 )
+                          if( wantz ) then
+                             call stdlib_cscal( nz, conjg( temp ), z( iloz, j ), 1 )
+                          end if
+                       end if
+                    end do
+                 end if
+              end do loop_120
+              ! ensure that h(i,i-1) is real.
+              temp = h( i, i-1 )
+              if( aimag( temp )/=rzero ) then
+                 rtemp = abs( temp )
+                 h( i, i-1 ) = rtemp
+                 temp = temp / rtemp
+                 if( i2>i )call stdlib_cscal( i2-i, conjg( temp ), h( i, i+1 ), ldh )
+                 call stdlib_cscal( i-i1, temp, h( i1, i ), 1 )
+                 if( wantz ) then
+                    call stdlib_cscal( nz, temp, z( iloz, i ), 1 )
+                 end if
+              end if
+           end do loop_130
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           140 continue
+           ! h(i,i-1) is negligible: cone eigenvalue has converged.
+           w( i ) = h( i, i )
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 30
+           150 continue
+           return
+     end subroutine stdlib_clahqr
+
+     !> CLAHR2: reduces the first NB columns of A complex general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an unitary similarity transformation
+     !> Q**H * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*v**H, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by CGEHRD.
+
+     pure subroutine stdlib_clahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(sp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**h
+                 call stdlib_clacgv( i-1, a( k+i-1, 1 ), lda )
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-k, i-1, -cone, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, cone, a( k+1, i ), 1 )
+                 call stdlib_clacgv( i-1, a( k+i-1, 1 ), lda )
+                 ! apply i - v * t**h * v**h to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**h * b1
+                 call stdlib_ccopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_ctrmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, &
+                           t( 1, nb ), 1 )
+                 ! w := w + v2**h * b2
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ),lda, a( &
+                           k+i, i ), 1, cone, t( 1, nb ), 1 )
+                 ! w := t**h * w
+                 call stdlib_ctrmv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, &
+                           nb ), 1 )
+                 ! b2 := b2 - v2*w
+                 call stdlib_cgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -cone,a( k+i, 1 ),lda, t( 1, nb &
+                           ), 1, cone, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_ctrmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_caxpy( i-1, -cone, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_clarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = cone
+              ! compute  y(k+1:n,i)
+              call stdlib_cgemv( 'NO TRANSPOSE', n-k, n-k-i+1,cone, a( k+1, i+1 ),lda, a( k+i, i )&
+                        , 1, czero, y( k+1, i ), 1 )
+              call stdlib_cgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ), lda,a( k+&
+                        i, i ), 1, czero, t( 1, i ), 1 )
+              call stdlib_cgemv( 'NO TRANSPOSE', n-k, i-1, -cone,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        cone, y( k+1, i ), 1 )
+              call stdlib_cscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_cscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_ctrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_clacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_ctrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,cone, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, cone,a( 1,&
+                      2+nb ), lda, a( k+1+nb, 1 ), lda, cone, y,ldy )
+           call stdlib_ctrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,cone, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_clahr2
+
+     !> CLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_clals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(sp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jcol, jrow, m, n, nlp1
+           real(sp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_csrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_ccopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_ccopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_ccopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_csscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_100: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_slamc3( poles( i, 2 ), &
+                                    dsigj )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_slamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    rwork( 1 ) = negone
+                    temp = stdlib_snrm2( k, rwork, 1 )
+                    ! since b and bx are complex, the following call to stdlib_sgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_sgemv( 't', k, nrhs, one, bx, ldbx, work, 1, zero,
+          ! $                     b( j, 1 ), ldb )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( bx( jrow, jcol ),KIND=sp)
+                       end do
+                    end do
+                    call stdlib_sgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( bx( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_sgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       b( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=sp)
+                    end do
+                    call stdlib_clascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_100
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_clacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_ccopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 loop_180: do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_slamc3( dsigj, -poles( i+1,2 ) )-difr( i,&
+                                     1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_slamc3( dsigj, -poles( i,2 ) )-difl( i ) &
+                                    ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    ! since b and bx are complex, the following call to stdlib_sgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_sgemv( 't', k, nrhs, one, b, ldb, work, 1, zero,
+          ! $                     bx( j, 1 ), ldbx )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( b( jrow, jcol ),KIND=sp)
+                       end do
+                    end do
+                    call stdlib_sgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_sgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       bx( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=sp)
+                                 
+                    end do
+                 end do loop_180
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_ccopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_csrot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_clacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb,bx( k+1, 1 )&
+                        , ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_ccopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_ccopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_ccopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_csrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_clals0
+
+     !> CLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, CLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by CLALSA.
+
+     pure subroutine stdlib_clalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, rwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, jcol, jimag, jreal, jrow, lf, ll, lvl, lvl2, &
+                     nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Intrinsic Functions 
+           intrinsic :: aimag,cmplx,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_slasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 170.
+           if( icompq==1 ) then
+              go to 170
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_slasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           loop_130: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_sgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_sgemm( 't', 'n', nl, nrhs, nl, one, u( nlf, 1 ), ldu,
+           ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1 ), nl )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1+nl*nrhs ),nl )
+              jreal = 0
+              jimag = nl*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_sgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_sgemm( 't', 'n', nr, nrhs, nr, one, u( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1 ), nr )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1+nr*nrhs ),nr )
+              jreal = 0
+              jimag = nr*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                 end do
+              end do
+           end do loop_130
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_ccopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_clals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), rwork,info )
+              end do
+           end do
+           go to 330
+           ! icompq = 1: applying back the right singular vector factors.
+           170 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_190: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_clals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), rwork,info )
+              end do
+           end do loop_190
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_slasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           loop_320: do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_sgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_sgemm( 't', 'n', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,
+          ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero, rwork( 1 ),nlp1 )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero,rwork( 1+nlp1*nrhs ), nlp1 )
+              jreal = 0
+              jimag = nlp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_sgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_sgemm( 't', 'n', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero, rwork( 1 ),nrp1 )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero,rwork( 1+nrp1*nrhs ), nrp1 )
+              jreal = 0
+              jimag = nrp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                 end do
+              end do
+           end do loop_320
+           330 continue
+           return
+     end subroutine stdlib_clalsa
+
+     !> CLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_clalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, rwork, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           irwb, irwib, irwrb, irwu, irwvt, irwwrk, iwk, j, jcol, jimag, jreal, jrow, k, nlvl, &
+           nm1, nrwork, nsize, nsub, perm, poles, s, sizei, smlszp, sqre, st, st1, u, vt, &
+                     z
+           real(sp) :: cs, eps, orgnrm, r, rcnd, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,int,log,real,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLALSD', -info )
+              return
+           end if
+           eps = stdlib_slamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_claset( 'A', 1, nrhs, czero, czero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_clascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_csrot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    rwork( i*2-1 ) = cs
+                    rwork( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = rwork( j*2-1 )
+                       sn = rwork( j*2 )
+                       call stdlib_csrot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_slanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_claset( 'A', n, nrhs, czero, czero, b, ldb )
+              return
+           end if
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              irwu = 1
+              irwvt = irwu + n*n
+              irwwrk = irwvt + n*n
+              irwrb = irwwrk
+              irwib = irwrb + n*nrhs
+              irwb = irwib + n*nrhs
+              call stdlib_slaset( 'A', n, n, zero, one, rwork( irwu ), n )
+              call stdlib_slaset( 'A', n, n, zero, one, rwork( irwvt ), n )
+              call stdlib_slasdq( 'U', 0, n, n, n, 0, d, e, rwork( irwvt ), n,rwork( irwu ), n, &
+                        rwork( irwwrk ), 1,rwork( irwwrk ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              ! in the real version, b is passed to stdlib_slasdq and multiplied
+              ! internally by q**h. here b is complex and that product is
+              ! computed below in two steps (real and imaginary parts).
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ), rwork( jimag ),KIND=sp)
+                 end do
+              end do
+              tol = rcnd*abs( d( stdlib_isamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_claset( 'A', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_clascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              ! since b is complex, the following call to stdlib_sgemm is performed
+              ! in two steps (real and imaginary parts). that is for v * b
+              ! (in the real version of the code v**h is stored in work).
+              ! call stdlib_sgemm( 't', 'n', n, nrhs, n, one, work, n, b, ldb, zero,
+          ! $               work( nwork ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ), rwork( jimag ),KIND=sp)
+                 end do
+              end do
+              ! unscale.
+              call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_slasrt( 'D', n, d, info )
+              call stdlib_clascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=sp) / real( smlsiz+1,KIND=sp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           nrwork = givnum + 2*nlvl*n
+           bx = 1
+           irwrb = nrwork
+           irwib = irwrb + smlsiz*nrhs
+           irwb = irwib + smlsiz*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_240: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_ccopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_ccopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_slasdq.
+                    call stdlib_slaset( 'A', nsize, nsize, zero, one,rwork( vt+st1 ), n )
+                    call stdlib_slaset( 'A', nsize, nsize, zero, one,rwork( u+st1 ), n )
+                    call stdlib_slasdq( 'U', 0, nsize, nsize, nsize, 0, d( st ),e( st ), rwork( &
+                    vt+st1 ), n, rwork( u+st1 ),n, rwork( nrwork ), 1, rwork( nrwork ),info )
+                              
+                    if( info/=0 ) then
+                       return
+                    end if
+                    ! in the real version, b is passed to stdlib_slasdq and multiplied
+                    ! internally by q**h. here b is complex and that product is
+                    ! computed below in two steps (real and imaginary parts).
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = real( b( jrow, jcol ),KIND=sp)
+                       end do
+                    end do
+                    call stdlib_sgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwrb ), nsize )
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_sgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwib ), nsize )
+                    jreal = irwrb - 1
+                    jimag = irwib - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          jreal = jreal + 1
+                          jimag = jimag + 1
+                          b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                       end do
+                    end do
+                    call stdlib_clacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_slasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), rwork( u+&
+                    st1 ), n, rwork( vt+st1 ),iwork( k+st1 ), rwork( difl+st1 ),rwork( difr+st1 ),&
+                     rwork( z+st1 ),rwork( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), &
+                     n, iwork( perm+st1 ),rwork( givnum+st1 ), rwork( c+st1 ),rwork( s+st1 ), &
+                               rwork( nrwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_clalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), &
+                     rwork( difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), &
+                     iwork( givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ),&
+                                rwork( s+st1 ),rwork( nrwork ), iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_240
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_isamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_claset( 'A', 1, nrhs, czero, czero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_clascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           loop_320: do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_ccopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 ! since b and bx are complex, the following call to stdlib_sgemm
+                 ! is performed in two steps (real and imaginary parts).
+                 ! call stdlib_sgemm( 't', 'n', nsize, nrhs, nsize, one,
+          ! $                  rwork( vt+st1 ), n, rwork( bxst ), n, zero,
+          ! $                  b( st, 1 ), ldb )
+                 j = bxst - n - 1
+                 jreal = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jreal = jreal + 1
+                       rwork( jreal ) = real( work( j+jrow ),KIND=sp)
+                    end do
+                 end do
+                 call stdlib_sgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwrb ), nsize )
+                 j = bxst - n - 1
+                 jimag = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jimag = jimag + 1
+                       rwork( jimag ) = aimag( work( j+jrow ) )
+                    end do
+                 end do
+                 call stdlib_sgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwib ), nsize )
+                 jreal = irwrb - 1
+                 jimag = irwib - 1
+                 do jcol = 1, nrhs
+                    do jrow = st, st + nsize - 1
+                       jreal = jreal + 1
+                       jimag = jimag + 1
+                       b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=sp)
+                    end do
+                 end do
+              else
+                 call stdlib_clalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), rwork( &
+                  difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                  givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ), rwork( s+&
+                            st1 ),rwork( nrwork ), iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do loop_320
+           ! unscale and sort the singular values.
+           call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_slasrt( 'D', n, d, info )
+           call stdlib_clascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_clalsd
+
+     !> CLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(sp)             function stdlib_clangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(sp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_classq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clangb = value
+           return
+     end function stdlib_clangb
+
+     !> CLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex matrix A.
+
+     real(sp)             function stdlib_clange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_classq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clange = value
+           return
+     end function stdlib_clange
+
+     !> CLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex tridiagonal matrix A.
+
+     pure real(sp)             function stdlib_clangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_sisnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_sisnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_sisnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_classq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_classq( n-1, dl, 1, scale, sum )
+                 call stdlib_classq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_clangt = anorm
+           return
+     end function stdlib_clangt
+
+     !> CLANHB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n hermitian band matrix A,  with k super-diagonals.
+
+     real(sp)             function stdlib_clanhb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( ab( k+1, j ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( ab( 1, j ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    do i = 2, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( ab( k+1, j ),KIND=sp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ab( 1, j ),KIND=sp) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_classq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_classq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              do j = 1, n
+                 if( real( ab( l, j ),KIND=sp)/=zero ) then
+                    absa = abs( real( ab( l, j ),KIND=sp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clanhb = value
+           return
+     end function stdlib_clanhb
+
+     !> CLANHE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A.
+
+     real(sp)             function stdlib_clanhe( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j - 1
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( a( j, j ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( a( j, j ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    do i = j + 1, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( a( j, j ),KIND=sp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( a( j, j ),KIND=sp) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_classq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_classq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              do i = 1, n
+                 if( real( a( i, i ),KIND=sp)/=zero ) then
+                    absa = abs( real( a( i, i ),KIND=sp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clanhe = value
+           return
+     end function stdlib_clanhe
+
+     !> CLANHF:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian matrix A in RFP format.
+
+     real(sp) function stdlib_clanhf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(0:*)
+           complex(sp), intent(in) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(sp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_clanhf = zero
+              return
+           else if( n==1 ) then
+              stdlib_clanhf = abs(real(a(0),KIND=sp))
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='c' or 'c' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'C' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is n by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do i = 1, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = k + j - 1
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = k + j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       do j = 0, k - 2
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j + 1
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do j = k, n - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       do j = k, n - 1
+                          do i = 0, j - k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j - k
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j - k + 1
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j - k + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k)
+                       temp = abs( real( a( j+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       temp = abs( real( a( j+1+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do i = 2, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j + 1
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = k + j
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j+1; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = k + j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       i = n
+                       ! -> u(k-1,k-1)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n+1
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k) at a(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do j = k + 1, n
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       do j = k + 1, n - 1
+                          do i = 0, j - k - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                          i = j - k - 1
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          i = j - k
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=sp) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          do i = j - k + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = n
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! u(k,k) at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=sp) )
+                       if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+             ! find normi(a) ( = norm1(a), since a is hermitian).
+              if( ifm==1 ) then
+                 ! a is 'n'
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( real( a( i+j*lda ),KIND=sp) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( real( a( 0+j*lda ),KIND=sp) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=sp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( real( a( 0+j*lda ),KIND=sp) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-1-k
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=sp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( real( a( 0 ),KIND=sp) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=sp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=sp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_classq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_classq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k - 1
+                       ! -> u(k,k) at a(k-1,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=sp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       aa = real( a( l ),KIND=sp)
+                       ! u(n-1,n-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_classq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 1, k - 2
+                          call stdlib_classq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       aa = real( a( 0 ),KIND=sp)
+                       ! l(0,0) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(k,k) at a(0,1)
+                       do i = 1, k - 1
+                          aa = real( a( l ),KIND=sp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 2
+                          call stdlib_classq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_classq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_classq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda - lda
+                       ! -> u(k-1,k-1) at a(0,k-1)
+                       aa = real( a( l ),KIND=sp)
+                       ! u(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k)
+                       do j = k, n - 1
+                          aa = real( a( l ),KIND=sp)
+                          ! -> u(j-k,j-k)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_classq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_classq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_classq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(0,0) at a(0,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=sp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! l(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k-1 + (k-1)*lda or l(k-1,k-1) at a(k-1,k-1)
+                       aa = real( a( l ),KIND=sp)
+                       ! l(k-1,k-1) at a(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_classq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                       ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_classq( k+j, a( 0+j*lda ), 1, scale, s )
+                       ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k
+                       ! -> u(k,k) at a(k,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=sp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_classq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_classq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=sp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 1
+                          call stdlib_classq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                       ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_classq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_classq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                       ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda
+                       ! -> u(k,k) at a(0,k)
+                       aa = real( a( l ),KIND=sp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k+1)
+                       do j = k + 1, n - 1
+                          aa = real( a( l ),KIND=sp)
+                          ! -> u(j-k-1,j-k-1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l=k-1+n*lda
+                       ! -> u(k-1,k-1) at a(k-1,n)
+                       aa = real( a( l ),KIND=sp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_classq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                       ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_classq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_classq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                       ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       aa = real( a( l ),KIND=sp)
+                       ! l(k,k) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(0,0) at a(0,1)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=sp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=sp)
+                          ! l(k+i+1,k+i+1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k - 1 + k*lda or l(k-1,k-1) at a(k-1,k)
+                       aa = real( a( l ),KIND=sp)
+                       ! l(k-1,k-1) at a(k-1,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_clanhf = value
+           return
+     end function stdlib_clanhf
+
+     !> CLANHP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A,  supplied in packed form.
+
+     real(sp)             function stdlib_clanhp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 0
+                 do j = 1, n
+                    do i = k + 1, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                    sum = abs( real( ap( k ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    sum = abs( real( ap( k ),KIND=sp) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    do i = k + 1, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( real( ap( k ),KIND=sp) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ap( k ),KIND=sp) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_classq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_classq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=sp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=sp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clanhp = value
+           return
+     end function stdlib_clanhp
+
+     !> CLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(sp)             function stdlib_clanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_classq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clanhs = value
+           return
+     end function stdlib_clanhs
+
+     !> CLANHT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian tridiagonal matrix A.
+
+     pure real(sp)             function stdlib_clanht( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           complex(sp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum = abs( d( i ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_classq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_slassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_clanht = anorm
+           return
+     end function stdlib_clanht
+
+     !> CLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(sp)             function stdlib_clansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_classq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_classq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_classq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clansb = value
+           return
+     end function stdlib_clansb
+
+     !> CLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A,  supplied in packed form.
+
+     real(sp)             function stdlib_clansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_classq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_classq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=sp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=sp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( aimag( ap( k ) )/=zero ) then
+                    absa = abs( aimag( ap( k ) ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clansp = value
+           return
+     end function stdlib_clansp
+
+     !> CLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A.
+
+     real(sp)             function stdlib_clansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_classq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_classq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_classq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clansy = value
+           return
+     end function stdlib_clansy
+
+     !> CLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(sp)             function stdlib_clantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_classq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_classq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_classq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_classq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clantb = value
+           return
+     end function stdlib_clantb
+
+     !> CLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(sp)             function stdlib_clantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_classq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_classq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_classq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_classq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clantp = value
+           return
+     end function stdlib_clantp
+
+     !> CLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(sp)             function stdlib_clantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_classq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_classq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_classq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_classq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_clantr = value
+           return
+     end function stdlib_clantr
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_clapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(sp), intent(out) :: ssmin
+           ! Array Arguments 
+           complex(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: ssmax
+           complex(sp) :: a11, a12, a22, c, tau
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_clarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = cone
+           c = -conjg( tau )*stdlib_cdotc( n, x, incx, y, incy )
+           call stdlib_caxpy( n, c, x, incx, y, incy )
+           call stdlib_clarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_slas2( abs( a11 ), abs( a12 ), abs( a22 ), ssmin, ssmax )
+           return
+     end subroutine stdlib_clapll
+
+     !> CLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_claqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: vn1(*), vn2(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(sp) :: temp, temp2, tol3z
+           complex(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_slamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_isamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_cswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_clarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_clarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**h to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = cone
+                 call stdlib_clarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,conjg( tau( i ) ), a(&
+                            offpi, i+1 ), lda,work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_scnrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_claqp2
+
+     !> CLAQPS: computes a step of QR factorization with column pivoting
+     !> of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_claqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: vn1(*), vn2(*)
+           complex(sp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*)
+           complex(sp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(sp) :: temp, temp2, tol3z
+           complex(sp) :: akk
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,nint,real,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_slamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_isamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_cswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_cswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**h.
+              if( k>1 ) then
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+                 call stdlib_cgemv( 'NO TRANSPOSE', m-rk+1, k-1, -cone, a( rk, 1 ),lda, f( k, 1 ),&
+                            ldf, cone, a( rk, k ), 1 )
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_clarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_clarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = cone
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**h*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), &
+                           lda, a( rk, k ), 1, czero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = czero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**h
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_cgemv( 'CONJUGATE TRANSPOSE', m-rk+1, k-1, -tau( k ),a( rk, 1 ), lda,&
+                            a( rk, k ), 1, czero,auxv( 1 ), 1 )
+                 call stdlib_cgemv( 'NO TRANSPOSE', n, k-1, cone, f( 1, 1 ), ldf,auxv( 1 ), 1, &
+                           cone, f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**h.
+              if( k<n ) then
+                 call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', 1, n-k,k, -cone, a( rk,&
+                            1 ), lda, f( k+1, 1 ), ldf,cone, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=sp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**h.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-rk, n-kb,kb, -cone, a( &
+                        rk+1, 1 ), lda, f( kb+1, 1 ), ldf,cone, a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           60 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_scnrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_snrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_dlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 60
+           end if
+           return
+     end subroutine stdlib_claqps
+
+     !> CLAQR5: called by CLAQR0 performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_claqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts, s,h, ldh, iloz, &
+               ihiz, z, ldz, v, ldv, u, ldu, nv,wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), s(*), z(ldz,*)
+           complex(sp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0_sp
+           real(sp), parameter :: rone = 1.0_sp
+           
+           
+           ! Local Scalars 
+           complex(sp) :: alpha, beta, cdum, refsum
+           real(sp) :: h11, h12, h21, h22, safmax, safmin, scl, smlnum, tst1, tst2, ulp
+           integer(ilp) :: i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, krcol,&
+                      m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,conjg,max,min,mod,real
+           ! Local Arrays 
+           complex(sp) :: vt(3)
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by cone.  ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = czero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_claset( 'ALL', kdu, kdu, czero, cone, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    cone column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_claqr1( 2, h( k+1, k+1 ), ldh, s( 2*m22-1 ),s( 2*m22 ), v( 1, &
+                                 m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_clarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_clarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = czero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = conjg( v( 1, m22 ) )*( h( k+1, j )+conjg( v( 2, m22 ) )*h( k+2, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop) then
+                       if( h( k+1, k )/=czero ) then
+                          tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                          if( tst1==rzero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                          end if
+                          if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) ) then
+                             h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( &
+                                       k+1, k ) = czero
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_claqr1( 3, h( ktop, ktop ), ldh, s( 2*m-1 ),s( 2*m ), v( 1, m )&
+                                  )
+                       alpha = v( 1, m )
+                       call stdlib_clarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually czero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*conjg( v( 2, m ) )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) -refsum*conjg( v( 3, m ) )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge cone step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_clarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=czero .or. h( k+3, k+1 )/=czero .or. h( k+3, k+2 )==czero &
+                                 ) then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = czero
+                          h( k+3, k ) = czero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new cone. ====
+                          call stdlib_claqr1( 3, h( k+1, k+1 ), ldh, s( 2*m-1 ),s( 2*m ), vt )
+                                    
+                          alpha = vt( 1 )
+                          call stdlib_clarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = conjg( vt( 1 ) )*( h( k+1, k )+conjg( vt( 2 ) )*h( k+2, k ) )
+                                    
+                          if( cabs1( h( k+2, k )-refsum*vt( 2 ) )+cabs1( refsum*vt( 3 ) )>ulp*( &
+                          cabs1( h( k, k ) )+cabs1( h( k+1,k+1 ) )+cabs1( h( k+2, k+2 ) ) ) ) &
+                                    then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old cone with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new cone. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                       h( j, k+3 ) = h( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum =  conjg( v( 1, m ) )*( h( k+1, k+1 )+conjg( v( 2, m ) )*h( k+2, k+1 )+&
+                              conjg( v( 3, m ) )*h( k+3, k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=czero ) then
+                       tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                       if( tst1==rzero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                       end if
+                       if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( k+1,&
+                                     k ) = czero
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = conjg( v( 1, m ) )*( h( k+1, j )+conjg( v( 2, m ) )*h( k+2, j )+&
+                                 conjg( v( 3, m ) )*h( k+3, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m ) )
+                          u( j, kms+3 ) = u( j, kms+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                          z( j, k+3 ) = z( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_cgemm( 'C', 'N', nu, jlen, nu, cone, u( k1, k1 ),ldu, h( incol+k1,&
+                               jcol ), ldh, czero, wh,ldwh )
+                    call stdlib_clacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_cgemm( 'N', 'N', jlen, nu, nu, cone,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, czero, wv, ldwv )
+                    call stdlib_clacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_cgemm( 'N', 'N', jlen, nu, nu, cone,z( jrow, incol+k1 ), ldz, &
+                                 u( k1, k1 ),ldu, czero, wv, ldwv )
+                       call stdlib_clacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_claqr5
+
+     !> CLAQZ1: chases a 1x1 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_claqz1( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           
+           ! local variables
+           real(sp) :: c
+           complex(sp) :: s, temp
+           if( k+1 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              call stdlib_clartg( b( ihi, ihi ), b( ihi, ihi-1 ), c, s, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = czero
+              call stdlib_crot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c, s )
+                        
+              call stdlib_crot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_crot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c, s )
+                           
+              end if
+           else
+              ! normal operation, move bulge down
+              ! apply transformation from the right
+              call stdlib_clartg( b( k+1, k+1 ), b( k+1, k ), c, s, temp )
+              b( k+1, k+1 ) = temp
+              b( k+1, k ) = czero
+              call stdlib_crot( k+2-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c, s )
+                        
+              call stdlib_crot( k-istartm+1, b( istartm, k+1 ), 1, b( istartm, k ),1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_crot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c, s )
+                           
+              end if
+              ! apply transformation from the left
+              call stdlib_clartg( a( k+1, k ), a( k+2, k ), c, s, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = czero
+              call stdlib_crot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda, c,s )
+              call stdlib_crot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb, c,s )
+              if ( ilq ) then
+                 call stdlib_crot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c, conjg(&
+                            s ) )
+              end if
+           end if
+     end subroutine stdlib_claqz1
+
+     !> CLAQZ3: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_claqz3( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, alpha,&
+                beta, a, lda, b, ldb,q, ldq, z, ldz, qc, ldqc, zc, ldzc, work,lwork, info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_desired, ldqc, ldzc
+           complex(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), qc( &
+                     ldqc, * ), zc( ldzc, * ), work( * ),alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: info
+           
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(sp) :: safmin, safmax, c, scale
+           complex(sp) :: temp, temp2, temp3, s
+           info = 0
+           if ( nblock_desired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_desired
+              return
+           else if ( lwork < n*nblock_desired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAQZ3', -info )
+              return
+           end if
+           ! executable statements
+           ! get machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_slabad( safmin, safmax )
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ns = nshifts
+           npos = max( nblock_desired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_claset( 'FULL', ns+1, ns+1, czero, cone, qc, ldqc )
+           call stdlib_claset( 'FULL', ns, ns, czero, cone, zc, ldzc )
+           do i = 1, ns
+              ! introduce the shift
+              scale = sqrt( abs( alpha( i ) ) ) * sqrt( abs( beta( i ) ) )
+              if( scale >= safmin .and. scale <= safmax ) then
+                 alpha( i ) = alpha( i )/scale
+                 beta( i ) = beta( i )/scale
+              end if
+              temp2 = beta( i )*a( ilo, ilo )-alpha( i )*b( ilo, ilo )
+              temp3 = beta( i )*a( ilo+1, ilo )
+              if ( abs( temp2 ) > safmax .or.abs( temp3 ) > safmax ) then
+                 temp2 = cone
+                 temp3 = czero
+              end if
+              call stdlib_clartg( temp2, temp3, c, s, temp )
+              call stdlib_crot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c,s )
+              call stdlib_crot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c,s )
+              call stdlib_crot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c, conjg( s ) )
+              ! chase the shift down
+              do j = 1, ns-i
+                 call stdlib_claqz1( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ilo, ilo+&
+                        ns ), lda, czero, work, sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ilo, ilo+&
+                        ns ), ldb, czero, work, sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+             call stdlib_cgemm( 'N', 'N', n, sheight, sheight, cone, q( 1, ilo ),ldq, qc, ldqc, &
+                       czero, work, n )
+              call stdlib_clacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ilo ), lda, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ilo ), ldb, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_cgemm( 'N', 'N', n, swidth, swidth, cone, z( 1, ilo ),ldz, zc, ldzc, &
+                        czero, work, n )
+              call stdlib_clacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_claset( 'FULL', ns+np, ns+np, czero, cone, qc, ldqc )
+              call stdlib_claset( 'FULL', ns+np, ns+np, czero, cone, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -1
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_claqz1( .true., .true., k+i+j, istartb, istopb, ihi,a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, czero, work,sheight )
+                 call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, czero, work,sheight )
+                 call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_cgemm( 'N', 'N', n, nblock, nblock, cone, q( 1, k+1 ),ldq, qc, ldqc, &
+                           czero, work, n )
+                 call stdlib_clacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, k ), lda, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, k ), ldb, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_cgemm( 'N', 'N', n, nblock, nblock, cone, z( 1, k ),ldz, zc, ldzc, &
+                           czero, work, n )
+                 call stdlib_clacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_claset( 'FULL', ns, ns, czero, cone, qc, ldqc )
+           call stdlib_claset( 'FULL', ns+1, ns+1, czero, cone, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i, ihi-1
+                 call stdlib_claqz1( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, czero, work, sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_cgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, czero, work, sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_cgemm( 'N', 'N', n, ns, ns, cone, q( 1, ihi-ns+1 ), ldq,qc, ldqc, czero,&
+                         work, n )
+              call stdlib_clacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ihi-ns ), &
+                        lda, zc, ldzc, czero, work,sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_cgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ihi-ns ), &
+                        ldb, zc, ldzc, czero, work,sheight )
+              call stdlib_clacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+              call stdlib_cgemm( 'N', 'N', n, ns+1, ns+1, cone, z( 1, ihi-ns ), ldz,zc, ldzc, &
+                        czero, work, n )
+              call stdlib_clacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_claqz3
+
+     !> CLARGV: generates a vector of complex plane rotations with real
+     !> cosines, determined by elements of the complex vectors x and y.
+     !> For i = 1,2,...,n
+     !> (        c(i)   s(i) ) ( x(i) ) = ( r(i) )
+     !> ( -conjg(s(i))  c(i) ) ( y(i) ) = (   0  )
+     !> where c(i)**2 + ABS(s(i))**2 = 1
+     !> The following conventions are used (these are the same as in CLARTG,
+     !> but differ from the BLAS1 routine CROTG):
+     !> If y(i)=0, then c(i)=1 and s(i)=0.
+     !> If x(i)=0, then c(i)=0 and s(i) is chosen so that r(i) is real.
+
+     pure subroutine stdlib_clargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*)
+           complex(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i, ic, ix, iy, j
+           real(sp) :: cs, d, di, dr, eps, f2, f2s, g2, g2s, safmin, safmn2, safmx2, scale
+           complex(sp) :: f, ff, fs, g, gs, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,int,log,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: abs1, abssq
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Statement Function Definitions 
+           abs1( ff ) = max( abs( real( ff,KIND=sp) ), abs( aimag( ff ) ) )
+           abssq( ff ) = real( ff,KIND=sp)**2 + aimag( ff )**2
+           ! Executable Statements 
+           ! if( first ) then
+              ! first = .false.
+              safmin = stdlib_slamch( 'S' )
+              eps = stdlib_slamch( 'E' )
+              safmn2 = stdlib_slamch( 'B' )**int( log( safmin / eps ) /log( stdlib_slamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+           ! end if
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_60: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              ! use identical algorithm as in stdlib_clartg
+              scale = max( abs1( f ), abs1( g ) )
+              fs = f
+              gs = g
+              count = 0
+              if( scale>=safmx2 ) then
+              10 continue
+                 count = count + 1
+                 fs = fs*safmn2
+                 gs = gs*safmn2
+                 scale = scale*safmn2
+                 if( scale>=safmx2 .and. count < 20 )go to 10
+              else if( scale<=safmn2 ) then
+                 if( g==czero ) then
+                    cs = one
+                    sn = czero
+                    r = f
+                    go to 50
+                 end if
+                 20 continue
+                 count = count - 1
+                 fs = fs*safmx2
+                 gs = gs*safmx2
+                 scale = scale*safmx2
+                 if( scale<=safmn2 )go to 20
+              end if
+              f2 = abssq( fs )
+              g2 = abssq( gs )
+              if( f2<=max( g2, one )*safmin ) then
+                 ! this is a rare case: f is very small.
+                 if( f==czero ) then
+                    cs = zero
+                    r = stdlib_slapy2( real( g,KIND=sp), aimag( g ) )
+                    ! do complex/real division explicitly with two real
+                    ! divisions
+                    d = stdlib_slapy2( real( gs,KIND=sp), aimag( gs ) )
+                    sn = cmplx( real( gs,KIND=sp) / d, -aimag( gs ) / d,KIND=sp)
+                    go to 50
+                 end if
+                 f2s = stdlib_slapy2( real( fs,KIND=sp), aimag( fs ) )
+                 ! g2 and g2s are accurate
+                 ! g2 is at least safmin, and g2s is at least safmn2
+                 g2s = sqrt( g2 )
+                 ! error in cs from underflow in f2s is at most
+                 ! unfl / safmn2 .lt. sqrt(unfl*eps) .lt. eps
+                 ! if max(g2,one)=g2, then f2 .lt. g2*safmin,
+                 ! and so cs .lt. sqrt(safmin)
+                 ! if max(g2,one)=one, then f2 .lt. safmin
+                 ! and so cs .lt. sqrt(safmin)/safmn2 = sqrt(eps)
+                 ! therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s
+                 cs = f2s / g2s
+                 ! make sure abs(ff) = 1
+                 ! do complex/real division explicitly with 2 real divisions
+                 if( abs1( f )>one ) then
+                    d = stdlib_slapy2( real( f,KIND=sp), aimag( f ) )
+                    ff = cmplx( real( f,KIND=sp) / d, aimag( f ) / d,KIND=sp)
+                 else
+                    dr = safmx2*real( f,KIND=sp)
+                    di = safmx2*aimag( f )
+                    d = stdlib_slapy2( dr, di )
+                    ff = cmplx( dr / d, di / d,KIND=sp)
+                 end if
+                 sn = ff*cmplx( real( gs,KIND=sp) / g2s, -aimag( gs ) / g2s,KIND=sp)
+                 r = cs*f + sn*g
+              else
+                 ! this is the most common case.
+                 ! neither f2 nor f2/g2 are less than safmin
+                 ! f2s cannot overflow, and it is accurate
+                 f2s = sqrt( one+g2 / f2 )
+                 ! do the f2s(real)*fs(complex) multiply with two real
+                 ! multiplies
+                 r = cmplx( f2s*real( fs,KIND=sp), f2s*aimag( fs ),KIND=sp)
+                 cs = one / f2s
+                 d = f2 + g2
+                 ! do complex/real division explicitly with two real divisions
+                 sn = cmplx( real( r,KIND=sp) / d, aimag( r ) / d,KIND=sp)
+                 sn = sn*conjg( gs )
+                 if( count/=0 ) then
+                    if( count>0 ) then
+                       do j = 1, count
+                          r = r*safmx2
+                       end do
+                    else
+                       do j = 1, -count
+                          r = r*safmn2
+                       end do
+                    end if
+                 end if
+              end if
+              50 continue
+              c( ic ) = cs
+              y( iy ) = sn
+              x( ix ) = r
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_60
+           return
+     end subroutine stdlib_clargv
+
+     !> CLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by SLARRE.
+
+     pure subroutine stdlib_clarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: minrgp, pivmin, vl, vu
+           real(sp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(sp), intent(in) :: gers(*)
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           integer(ilp) :: indin1, indin2
+           real(sp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           intrinsic :: cmplx
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indin1 = 3*n + 1
+           indin2 = 4*n + 1
+           indwrk = 5*n + 1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_claset( 'FULL', n, zusedw, czero, czero,z(1,zusedl), ldz )
+           eps = stdlib_slamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = cmplx( one, zero,KIND=sp)
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_scopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       do k = 1, in - 1
+                          d( ibegin+k-1 ) = real( z( ibegin+k-1,j ),KIND=sp)
+                          l( ibegin+k-1 ) = real( z( ibegin+k-1,j+1 ),KIND=sp)
+                       end do
+                       d( iend ) = real( z( iend, j ),KIND=sp)
+                       sigma = real( z( iend, j+1 ),KIND=sp)
+                       ! set the corresponding entries in z to zero
+                       call stdlib_claset( 'FULL', in, 2, czero, czero,z( ibegin, j), ldz )
+                                 
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_slarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_slarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_slarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_slarrf needs lwork = 2*n
+                          call stdlib_slarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,work( indin1 ), work( indin2 ),work( indwrk ), iinfo )
+                                    
+                          ! in the complex case, stdlib_slarrf cannot write
+                          ! the new rrr directly into z and needs an intermediate
+                          ! workspace
+                          do k = 1, in-1
+                             z( ibegin+k-1, newftt ) =cmplx( work( indin1+k-1 ), zero,KIND=sp)
+                                       
+                             z( ibegin+k-1, newftt+1 ) =cmplx( work( indin2+k-1 ), zero,KIND=sp)
+                                       
+                          end do
+                          z( iend, newftt ) =cmplx( work( indin1+in-1 ), zero,KIND=sp)
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_slarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = cmplx( ssigma, zero,KIND=sp)
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=sp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_slarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_clar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_clar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_csscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_clarrv
+
+     !> CLATDF: computes the contribution to the reciprocal Dif-estimate
+     !> by solving for x in Z * x = b, where b is chosen such that the norm
+     !> of x is as large as possible. It is assumed that LU decomposition
+     !> of Z has been computed by CGETC2. On entry RHS = f holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by CGETC2 has the form
+     !> Z = P * L * U * Q, where P and Q are permutation matrices. L is lower
+     !> triangular with unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_clatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(sp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(sp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(sp) :: rtemp, scale, sminu, splus
+           complex(sp) :: bm, bp, pmone, temp
+           ! Local Arrays 
+           real(sp) :: rwork(maxdim)
+           complex(sp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_claswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -cone
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + cone
+                 bm = rhs( j ) - cone
+                 splus = one
+                 ! lockahead for l- part rhs(1:n-1) = +-1
+                 ! splus and smin computed more efficiently than in bsolve[1].
+                 splus = splus + real( stdlib_cdotc( n-j, z( j+1, j ), 1, z( j+1,j ), 1 ),KIND=sp)
+                           
+                 sminu = real( stdlib_cdotc( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 ),KIND=sp)
+                 splus = splus*real( rhs( j ),KIND=sp)
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens we
+                    ! choose -1, thereafter +1. this is a simple way to get
+                    ! good estimates of matrices like byers well-known example
+                    ! (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = cone
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_caxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u- part, lockahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_ccopy( n-1, rhs, 1, work, 1 )
+              work( n ) = rhs( n ) + cone
+              rhs( n ) = rhs( n ) - cone
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = cone / z( i, i )
+                 work( i ) = work( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    work( i ) = work( i ) - work( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( work( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_ccopy( n, work, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_claswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_classq( n, rhs, 1, rdscal, rdsum )
+              return
+           end if
+           ! entry ijob = 2
+           ! compute approximate nullvector xm of z
+           call stdlib_cgecon( 'I', n, z, ldz, one, rtemp, work, rwork, info )
+           call stdlib_ccopy( n, work( n+1 ), 1, xm, 1 )
+           ! compute rhs
+           call stdlib_claswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+           temp = cone / sqrt( stdlib_cdotc( n, xm, 1, xm, 1 ) )
+           call stdlib_cscal( n, temp, xm, 1 )
+           call stdlib_ccopy( n, xm, 1, xp, 1 )
+           call stdlib_caxpy( n, cone, rhs, 1, xp, 1 )
+           call stdlib_caxpy( n, -cone, xm, 1, rhs, 1 )
+           call stdlib_cgesc2( n, z, ldz, rhs, ipiv, jpiv, scale )
+           call stdlib_cgesc2( n, z, ldz, xp, ipiv, jpiv, scale )
+           if( stdlib_scasum( n, xp, 1 )>stdlib_scasum( n, rhs, 1 ) )call stdlib_ccopy( n, xp, 1, &
+                     rhs, 1 )
+           ! compute the sum of squares
+           call stdlib_classq( n, rhs, 1, rdscal, rdsum )
+           return
+     end subroutine stdlib_clatdf
+
+     !> CLAUNHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine CLAUNHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_claunhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAUNHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CLAUNHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_claunhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_claunhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_claunhr_col_getrfnp
+
+     !> CPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite band matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> CPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_clatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scalel, rwork,info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_clatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_clatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_clatbs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scaleu, rwork,info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_icamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_csrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_cpbcon
+
+     !> CPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_cpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_chbmv( uplo, n, kd, -cone, ab, ldab, x( 1, j ), 1, cone,work, 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ab( kd+1, k ),KIND=sp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ab( 1, k ),KIND=sp) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_cpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cpbrfs
+
+     !> CPBTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_cpbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           complex(sp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'CPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_cpbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_cpotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_ctrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i2, cone,ab( kd+1, i ), ldab-1,ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_cherk( 'UPPER', 'CONJUGATE TRANSPOSE', i2, ib,-one, ab( kd+&
+                                    1-ib, i+ib ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_ctrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i3, cone,ab( kd+1, i ), ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_cgemm( 'CONJUGATE TRANSPOSE','NO TRANSPOSE', i2, &
+                          i3, ib, -cone,ab( kd+1-ib, i+ib ), ldab-1, work,ldwork, cone, ab( 1+ib, &
+                                    i+kd ),ldab-1 )
+                          ! update a33
+                          call stdlib_cherk( 'UPPER', 'CONJUGATE TRANSPOSE', i3, ib,-one, work, &
+                                    ldwork, one,ab( kd+1, i+kd ), ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_cpotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_ctrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i2,ib, cone, ab( 1, i ), ldab-1,ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_cherk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_ctrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i3,ib, cone, ab( 1, i ), ldab-1, work,ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_cgemm( 'NO TRANSPOSE','CONJUGATE TRANSPOSE', i3, &
+                          i2, ib,-cone, work, ldwork, ab( 1+ib, i ),ldab-1, cone, ab( 1+kd-ib, i+&
+                                    ib ),ldab-1 )
+                          ! update a33
+                          call stdlib_cherk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_cpbtrf
+
+     !> CPFTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H*U or A = L*L**H computed by CPFTRF.
+
+     pure subroutine stdlib_cpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(0:*)
+           complex(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_ctfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_ctfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+           else
+              call stdlib_ctfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_ctfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_cpftrs
+
+     !> CPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite matrix using the
+     !> Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cpocon( uplo, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_clatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_clatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_clatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_clatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_icamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_csrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_cpocon
+
+     !> CPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_cporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_chemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=sp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=sp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_cpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cporfs
+
+     !> CPOTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_cpotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_cpotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**h *u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_cherk( 'UPPER', 'CONJUGATE TRANSPOSE', jb, j-1,-one, a( 1, j ), &
+                              lda, one, a( j, j ), lda )
+                    call stdlib_cpotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', jb,n-j-jb+1, j-1,&
+                                  -cone, a( 1, j ), lda,a( 1, j+jb ), lda, cone, a( j, j+jb ),lda )
+                       call stdlib_ctrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', jb, &
+                                 n-j-jb+1, cone, a( j, j ),lda, a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**h.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_cherk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_cpotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j-jb+1, jb, j-1,&
+                                  -cone, a( j+jb, 1 ),lda, a( j, 1 ), lda, cone, a( j+jb, j ),lda )
+                       call stdlib_ctrsm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', n-j-&
+                                 jb+1, jb, cone, a( j, j ),lda, a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_cpotrf
+
+     !> CPOTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by CPOTRF.
+
+     pure subroutine stdlib_cpotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ctrtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**h or inv(l)**h * inv(l).
+           call stdlib_clauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_cpotri
+
+     !> CPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite packed matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> CPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cppcon( uplo, n, ap, anorm, rcond, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_clatps( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_clatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_clatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_clatps( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_icamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_csrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_cppcon
+
+     !> CPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_cpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_chpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=sp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=sp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cpptrs( uplo, n, 1, afp, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_cpptrs( uplo, n, 1, afp, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cpptrs( uplo, n, 1, afp, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cpprfs
+
+     !> CPPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_cppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_cpptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cpptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_cppsv
+
+     !> CPPSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(inout) :: s(*)
+           complex(sp), intent(inout) :: afp(*), ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_cppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_claqhp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h * u or a = l * l**h.
+              call stdlib_ccopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_cpptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cppcon( uplo, n, afp, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cpptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_cpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, rwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cppsvx
+
+     !> CPPTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by CPPTRF.
+
+     pure subroutine stdlib_cpptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ctptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**h.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_chpr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = real( ap( jj ),KIND=sp)
+                 call stdlib_csscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**h * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = real( stdlib_cdotc( n-j+1, ap( jj ), 1, ap( jj ), 1 ),KIND=sp)
+                 if( j<n )call stdlib_ctpmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j, ap( &
+                           jjn ), ap( jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_cpptri
+
+     !> CPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using SPTTRF and then calling CBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band positive definite Hermitian matrix
+     !> can also be found if CHETRD, CHPTRD, or CHBTRD has been used to
+     !> reduce this matrix to tridiagonal form.  (The reduction to
+     !> tridiagonal form, however, may preclude the possibility of obtaining
+     !> high relative accuracy in the small eigenvalues of the original
+     !> matrix, if these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_cpteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: z(ldz,*)
+        ! ====================================================================
+           
+           ! Local Arrays 
+           complex(sp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = cone
+              return
+           end if
+           if( icompz==2 )call stdlib_claset( 'FULL', n, n, czero, cone, z, ldz )
+           ! call stdlib_spttrf to factor the matrix.
+           call stdlib_spttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_cbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_cbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_cpteqr
+
+     !> CPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H*D*U or A = L*D*L**H computed by CPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_cpttrs( uplo, n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           complex(sp), intent(inout) :: b(ldb,*)
+           complex(sp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: iuplo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           upper = ( uplo=='U' .or. uplo=='U' )
+           if( .not.upper .and. .not.( uplo=='L' .or. uplo=='L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'CPTTRS', uplo, n, nrhs, -1, -1 ) )
+           end if
+           ! decode uplo
+           if( upper ) then
+              iuplo = 1
+           else
+              iuplo = 0
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_cptts2( iuplo, n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_cptts2( iuplo, n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_cpttrs
+
+     !> CSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric packed matrix A using the
+     !> factorization A = U*D*U**T or A = L*D*L**T computed by CSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cspcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_csptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_cspcon
+
+     !> CSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_csprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_cspmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_csptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_csptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_csptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_csprfs
+
+     !> CSPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_cspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_csptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_csptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_cspsv
+
+     !> CSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(inout) :: afp(*)
+           complex(sp), intent(in) :: ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_ccopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_csptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clansp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cspcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_csptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_csprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cspsvx
+
+     !> CSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.CSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+     !> 2. LAPACK routines can be used to reduce a complex Hermitean matrix to
+     !> real symmetric tridiagonal form.
+     !> (Any complex Hermitean tridiagonal matrix has real values on its diagonal
+     !> and potentially complex numbers on its off-diagonals. By applying a
+     !> similarity transform with an appropriate diagonal matrix
+     !> diag(1,e^{i \phy_1}, ... , e^{i \phy_{n-1}}), the complex Hermitean
+     !> matrix can be transformed into a real symmetric matrix and complex
+     !> arithmetic can be entirely avoided.)
+     !> While the eigenvectors of the real symmetric tridiagonal matrix are real,
+     !> the eigenvectors of original complex Hermitean matrix have complex entries
+     !> in general.
+     !> Since LAPACK drivers overwrite the matrix data with the eigenvectors,
+     !> CSTEMR accepts complex workspace to facilitate interoperability
+     !> with CUNMTR or CUPMTR.
+
+     pure subroutine stdlib_cstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*)
+           complex(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: minrgp = 3.0e-3_sp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(sp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_sstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_slarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_clarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_slarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_slarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_slae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_slaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+              ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_slarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_slanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_sscal( n, scale, d, 1 )
+                 call stdlib_sscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_slarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_slarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_scopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_slarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_slarre computes the eigenvalues to less than full precision.
+                 ! stdlib_clarrv will refine the eigenvalue approximations, and we only
+                 ! need less accurate initial bisection in stdlib_slarre.
+                 ! note: these settings do only affect the subset case and stdlib_slarre
+                 rtol1 = max( sqrt(eps)*5.0e-2_sp, four * eps )
+                 rtol2 = max( sqrt(eps)*5.0e-3_sp, four * eps )
+              endif
+              call stdlib_slarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_slarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_clarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_slarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_clarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_slarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_slarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_sscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_slasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_cstemr
+
+     !> CSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by CSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_csycon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_csytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_csycon
+
+     !> CSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by CSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_csycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_csytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_csycon_rook
+
+     !> CSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_csyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_csymv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_csytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_csytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_csytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_csyrfs
+
+     !> CSYSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_csysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_csytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=sp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_csytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_csytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_csytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csysv
+
+     !> CSYSV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> CSYTRF_RK is called to compute the factorization of a complex
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine CSYTRS_3.
+
+     pure subroutine stdlib_csysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_csytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=sp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_csytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_csytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csysv_rk
+
+     !> CSYSV_ROOK: computes the solution to a complex system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> CSYTRF_ROOK is called to compute the factorization of a complex
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling CSYTRS_ROOK.
+
+     pure subroutine stdlib_csysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_csytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=sp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_csytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_csytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csysv_rook
+
+     !> CSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_csysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: af(ldaf,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'CSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_clacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_csytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clansy( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_csycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_csytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_csyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csysvx
+
+     !> CTBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ctbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( n, 1 ),KIND=sp)
+           ! compute the 1-norm of the triangular matrix a or a**h.
+           anorm = stdlib_clantb( norm, uplo, diag, n, kd, ab, ldab, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the 1-norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_clatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_clatbs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, kd, ab, ldab,&
+                               work, scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_icamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_csrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ctbcon
+
+     !> CTFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ctftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_ctrtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'L', 'N', diag, n2, n1, -cone, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_ctrtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'U', 'C', diag, n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_ctrtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'L', 'C', diag, n1, n2, -cone, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_ctrtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'U', 'N', diag, n1, n2, cone, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_ctrtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'U', 'N', diag, n1, n2, -cone, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_ctrtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'L', 'C', diag, n1, n2, cone, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_ctrtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'U', 'C', diag, n2, n1, -cone,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_ctrtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'L', 'N', diag, n2, n1, cone,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_ctrtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'L', 'N', diag, k, k, -cone, a( 1 ),n+1, a( k+1 ), n+&
+                              1 )
+                    call stdlib_ctrtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'U', 'C', diag, k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 &
+                              )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_ctrtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'L', 'C', diag, k, k, -cone, a( k+1 ),n+1, a( 0 ), n+&
+                              1 )
+                    call stdlib_ctrtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'U', 'N', diag, k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_ctrtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'U', 'N', diag, k, k, -cone, a( k ), k,a( k*( k+1 ) ),&
+                               k )
+                    call stdlib_ctrtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'L', 'C', diag, k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_ctrtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'R', 'U', 'C', diag, k, k, -cone,a( k*( k+1 ) ), k, a( 0 ),&
+                               k )
+                    call stdlib_ctrtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ctrmm( 'L', 'L', 'N', diag, k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ctftri
+
+     !> CTGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two complex upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine CGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**H *A*Q = D1*( 0 R ),    V**H *B*Q = D2*( 0 R ),
+     !> where U, V and Q are unitary matrices.
+     !> R is a nonsingular upper triangular matrix, and D1
+     !> and D2 are ``diagonal'' matrices, which are of the following
+     !> structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the unitary transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_ctgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(sp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(sp), intent(out) :: alpha(*), beta(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(sp), parameter :: hugenum = huge(zero)
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(sp) :: a1, a3, b1, b3, csq, csu, csv, error, gamma, rwk, ssmin
+           complex(sp) :: a2, b2, snq, snu, snv
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,real,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_claset( 'FULL', m, m, czero, cone, u, ldu )
+           if( initv )call stdlib_claset( 'FULL', p, p, czero, cone, v, ldv )
+           if( initq )call stdlib_claset( 'FULL', n, n, czero, cone, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = czero
+                    a3 = zero
+                    if( k+i<=m )a1 = real( a( k+i, n-l+i ),KIND=sp)
+                    if( k+j<=m )a3 = real( a( k+j, n-l+j ),KIND=sp)
+                    b1 = real( b( i, n-l+i ),KIND=sp)
+                    b3 = real( b( j, n-l+j ),KIND=sp)
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_clags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**h *a
+                    if( k+j<=m )call stdlib_crot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, conjg( snu ) )
+                    ! update i-th and j-th rows of matrix b: v**h *b
+                    call stdlib_crot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, conjg( snv ) &
+                              )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_crot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_crot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = czero
+                       b( i, n-l+j ) = czero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = czero
+                       b( j, n-l+i ) = czero
+                    end if
+                    ! ensure that the diagonal elements of a and b are real.
+                    if( k+i<=m )a( k+i, n-l+i ) = real( a( k+i, n-l+i ),KIND=sp)
+                    if( k+j<=m )a( k+j, n-l+j ) = real( a( k+j, n-l+j ),KIND=sp)
+                    b( i, n-l+i ) = real( b( i, n-l+i ),KIND=sp)
+                    b( j, n-l+j ) = real( b( j, n-l+j ),KIND=sp)
+                    ! update unitary matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_crot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_crot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_crot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_ccopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_ccopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_clapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = real( a( k+i, n-l+i ),KIND=sp)
+              b1 = real( b( i, n-l+i ),KIND=sp)
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 if( gamma<zero ) then
+                    call stdlib_csscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_csscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_slartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_csscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_csscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_ccopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_ccopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_ctgsja
+
+     !> CTGSY2: solves the generalized Sylvester equation
+     !> A * R - L * B = scale *  C               (1)
+     !> D * R - L * E = scale * F
+     !> using Level 1 and 2 BLAS, where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively. A, B, D and E are upper triangular
+     !> (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Zx = scale * b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Ik is the identity matrix of size k and X**H is the transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R  + D**H * L   = scale * C           (3)
+     !> R  * B**H + L  * E**H  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> = sigma_min(Z) using reverse communication with CLACON.
+     !> CTGSY2 also (IJOB >= 1) contributes to the computation in CTGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of two matrix pairs in
+     !> CTGSYL.
+
+     pure subroutine stdlib_ctgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(inout) :: rdscal, rdsum
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(sp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, k
+           real(sp) :: scaloc
+           complex(sp) :: alpha
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           complex(sp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSY2', -info )
+              return
+           end if
+           if( notran ) then
+              ! solve (i, j) - system
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = m, m - 1, ..., 1; j = 1, 2, ..., n
+              scale = one
+              scaloc = one
+              loop_30: do j = 1, n
+                 loop_20: do i = m, 1, -1
+                    ! build 2 by 2 system
+                    z( 1, 1 ) = a( i, i )
+                    z( 2, 1 ) = d( i, i )
+                    z( 1, 2 ) = -b( j, j )
+                    z( 2, 2 ) = -e( j, j )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z * x = rhs
+                    call stdlib_cgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    if( ijob==0 ) then
+                       call stdlib_cgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                       
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                    else
+                       call stdlib_clatdf( ijob, ldz, z, ldz, rhs, rdsum, rdscal,ipiv, jpiv )
+                                 
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( i>1 ) then
+                       alpha = -rhs( 1 )
+                       call stdlib_caxpy( i-1, alpha, a( 1, i ), 1, c( 1, j ), 1 )
+                       call stdlib_caxpy( i-1, alpha, d( 1, i ), 1, f( 1, j ), 1 )
+                    end if
+                    if( j<n ) then
+                       call stdlib_caxpy( n-j, rhs( 2 ), b( j, j+1 ), ldb,c( i, j+1 ), ldc )
+                                 
+                       call stdlib_caxpy( n-j, rhs( 2 ), e( j, j+1 ), lde,f( i, j+1 ), ldf )
+                                 
+                    end if
+                 end do loop_20
+              end do loop_30
+           else
+              ! solve transposed (i, j) - system:
+                 ! a(i, i)**h * r(i, j) + d(i, i)**h * l(j, j) = c(i, j)
+                 ! r(i, i) * b(j, j) + l(i, j) * e(j, j)   = -f(i, j)
+              ! for i = 1, 2, ..., m, j = n, n - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_80: do i = 1, m
+                 loop_70: do j = n, 1, -1
+                    ! build 2 by 2 system z**h
+                    z( 1, 1 ) = conjg( a( i, i ) )
+                    z( 2, 1 ) = -conjg( b( j, j ) )
+                    z( 1, 2 ) = conjg( d( i, i ) )
+                    z( 2, 2 ) = -conjg( e( j, j ) )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z**h * x = rhs
+                    call stdlib_cgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    call stdlib_cgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                    if( scaloc/=one ) then
+                       do k = 1, n
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                    
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    do k = 1, j - 1
+                       f( i, k ) = f( i, k ) + rhs( 1 )*conjg( b( k, j ) ) +rhs( 2 )*conjg( e( k, &
+                                 j ) )
+                    end do
+                    do k = i + 1, m
+                       c( k, j ) = c( k, j ) - conjg( a( i, k ) )*rhs( 1 ) -conjg( d( i, k ) )&
+                                 *rhs( 2 )
+                    end do
+                 end do loop_70
+              end do loop_80
+           end if
+           return
+     end subroutine stdlib_ctgsy2
+
+     !> CTGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C            (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with complex entries. A, B, D and E are upper
+     !> triangular (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1
+     !> is an output scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve Zx = scale*b, where Z
+     !> is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]        (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Here Ix is the identity matrix of size x and X**H is the conjugate
+     !> transpose of X. Kron(X, Y) is the Kronecker product between the
+     !> matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H *y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R + D**H * L = scale * C           (3)
+     !> R * B**H + L * E**H = scale * -F
+     !> This case (TRANS = 'C') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using CLACON.
+     !> If IJOB >= 1, CTGSYL computes a Frobenius norm-based estimate of
+     !> Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z.
+     !> This is a level-3 BLAS algorithm.
+
+     pure subroutine stdlib_ctgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(sp), intent(inout) :: c(ldc,*), f(ldf,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_ccopy by calls to stdlib_claset.
+        ! sven hammarling, 1/5/02.
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, pq, q
+           real(sp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,real,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine  optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'CTGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'CTGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_claset( 'F', m, n, czero, czero, c, ldc )
+                 call stdlib_claset( 'F', m, n, czero, czero, f, ldf )
+              else if( ijob>=1 .and. notran ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              ! use unblocked level 2 solver
+              loop_30: do iround = 1, isolve
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 pq = m*n
+                 call stdlib_ctgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_clacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_clacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_claset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_claset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_clacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_clacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j) - subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+                 pq = 0
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       call stdlib_ctgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + mb*nb
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                       
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_cscal( is-1, cmplx( scaloc, zero,KIND=sp),c( 1, k ), 1 )
+                                       
+                             call stdlib_cscal( is-1, cmplx( scaloc, zero,KIND=sp),f( 1, k ), 1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_cscal( m-ie, cmplx( scaloc, zero,KIND=sp),c( ie+1, k ), &
+                                       1 )
+                             call stdlib_cscal( m-ie, cmplx( scaloc, zero,KIND=sp),f( ie+1, k ), &
+                                       1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                       
+                             call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i,j) and l(i,j) into remaining equation.
+                       if( i>1 ) then
+                          call stdlib_cgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=sp), a(&
+                           1, is ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=sp),c( 1, js ), &
+                                     ldc )
+                          call stdlib_cgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=sp), d(&
+                           1, is ), ldd,c( is, js ), ldc, cmplx( one, zero,KIND=sp),f( 1, js ), &
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_cgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=sp), f( &
+                          is, js ), ldf,b( js, je+1 ), ldb, cmplx( one, zero,KIND=sp),c( is, je+1 &
+                                    ), ldc )
+                          call stdlib_cgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=sp), f( &
+                          is, js ), ldf,e( js, je+1 ), lde, cmplx( one, zero,KIND=sp),f( is, je+1 &
+                                    ), ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_clacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_clacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_claset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_claset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_clacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_clacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                  ! a(i, i)**h * r(i, j) + d(i, i)**h * l(i, j) = c(i, j)
+                  ! r(i, j) * b(j, j)  + l(i, j) * e(j, j) = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_ctgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                    
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_cscal( is-1, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                    
+                          call stdlib_cscal( is-1, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_cscal( m-ie, cmplx( scaloc, zero,KIND=sp),c( ie+1, k ), 1 )
+                                    
+                          call stdlib_cscal( m-ie, cmplx( scaloc, zero,KIND=sp),f( ie+1, k ), 1 )
+                                    
+                       end do
+                       do k = je + 1, n
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), c( 1, k ),1 )
+                                    
+                          call stdlib_cscal( m, cmplx( scaloc, zero,KIND=sp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i,j) and l(i,j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_cgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=sp), c( is,&
+                        js ), ldc,b( 1, js ), ldb, cmplx( one, zero,KIND=sp),f( is, 1 ), ldf )
+                                  
+                       call stdlib_cgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=sp), f( is,&
+                        js ), ldf,e( 1, js ), lde, cmplx( one, zero,KIND=sp),f( is, 1 ), ldf )
+                                  
+                    end if
+                    if( i<p ) then
+                       call stdlib_cgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=sp), a( &
+                       is, ie+1 ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=sp),c( ie+1, js ), &
+                                 ldc )
+                       call stdlib_cgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=sp), d( &
+                       is, ie+1 ), ldd,f( is, js ), ldf, cmplx( one, zero,KIND=sp),c( ie+1, js ), &
+                                 ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ctgsyl
+
+     !> CTPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ctpcon( norm, uplo, diag, n, ap, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=sp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_clantp( norm, uplo, diag, n, ap, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_clatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_clatps( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, ap, work, &
+                              scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_icamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_csrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ctpcon
+
+     !> CTPLQT: computes a blocked LQ factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_ctplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_ctplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_ctprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_ctplqt
+
+     !> CTPMLQT: applies a complex unitary matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_ctpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           complex(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_ctprfb( 'L', 'C', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_ctprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'R', 'C', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_ctpmlqt
+
+     !> CTPMQRT: applies a complex orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_ctpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           complex(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'L', 'C', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_ctprfb( 'R', 'C', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_ctpmqrt
+
+     !> CTPQRT: computes a blocked QR factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_ctpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_ctpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**h to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_ctprfb( 'L', 'C', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_ctpqrt
+
+     !> CTRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ctrcon( norm, uplo, diag, n, a, lda, rcond, work,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=sp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_clantr( norm, uplo, diag, n, n, a, lda, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_clatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_clatrs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, a, lda, work,&
+                               scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_icamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_csrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ctrcon
+
+     !> CTRSYL: solves the complex Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**H, and A and B are both upper triangular. A is
+     !> M-by-M and B is N-by-N; the right hand side C and the solution X are
+     !> M-by-N; and scale is an output scale factor, set <= 1 to avoid
+     !> overflow in X.
+
+     subroutine stdlib_ctrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: j, k, l
+           real(sp) :: bignum, da11, db, eps, scaloc, sgn, smin, smlnum
+           complex(sp) :: a11, suml, sumr, vec, x11
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=sp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_clange( 'M', m, m, a, lda, dum ),eps*stdlib_clange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                        l-1
+                ! r(k,l) = sum [a(k,i)*x(i,l)] +isgn*sum [x(k,j)*b(j,l)].
+                        ! i=k+1                      j=1
+              loop_30: do l = 1, n
+                 do k = m, 1, -1
+                    suml = stdlib_cdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_cdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=sp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=sp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_cladiv( vec*cmplx( scaloc,KIND=sp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_csscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_30
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**h *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1                           l-1
+                ! r(k,l) = sum [a**h(i,k)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                           j=1
+              loop_60: do l = 1, n
+                 do k = 1, m
+                    suml = stdlib_cdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_cdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = conjg( a( k, k ) ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=sp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=sp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_cladiv( vec*cmplx( scaloc,KIND=sp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_csscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_60
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**h*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-right corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! k-1
+                 ! r(k,l) = sum [a**h(i,k)*x(i,l)] +
+                          ! i=1
+                                 ! n
+                           ! isgn*sum [x(k,j)*b**h(l,j)].
+                                ! j=l+1
+              loop_90: do l = n, 1, -1
+                 do k = 1, m
+                    suml = stdlib_cdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_cdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = conjg( a( k, k )+sgn*b( l, l ) )
+                    da11 = abs( real( a11,KIND=sp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=sp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_cladiv( vec*cmplx( scaloc,KIND=sp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_csscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_90
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                 ! a(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                          n
+                ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b**h(l,j)]
+                        ! i=k+1                      j=l+1
+              loop_120: do l = n, 1, -1
+                 do k = m, 1, -1
+                    suml = stdlib_cdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_cdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*conjg( b( l, l ) )
+                    da11 = abs( real( a11,KIND=sp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=sp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_cladiv( vec*cmplx( scaloc,KIND=sp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_csscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_120
+           end if
+           return
+     end subroutine stdlib_ctrsyl
+
+     !> CUNBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_cunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_scnrm2(m1,x1,incx1) /= czero.or. stdlib_scnrm2(m2,x2,incx2) /= czero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              x1(i) = cone
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              call stdlib_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_scnrm2(m1,x1,incx1) /= czero.or. stdlib_scnrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              x2(i) = cone
+              call stdlib_cunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_scnrm2(m1,x1,incx1) /= czero.or. stdlib_scnrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_cunbdb5
+
+     !> CUNCSD: computes the CS decomposition of an M-by-M partitioned
+     !> unitary matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_cuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, rwork, lrwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lrwork, lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: theta(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           complex(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt, p1, q1
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           integer(ilp) :: lrworkmin, lrworkopt
+           logical(lk) :: lrquery
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           lrquery = lrwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_cuncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_cuncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ! real workspace
+              iphi = 2
+              ib11d = iphi + max( 1, q - 1 )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
+                        theta, theta, rwork, -1, childinfo )
+              lbbcsdworkopt = int( rwork(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lrworkopt = ibbcsd + lbbcsdworkopt - 1
+              lrworkmin = ibbcsd + lbbcsdworkmin - 1
+              rwork(1) = lrworkopt
+              ! complex workspace
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_cungqr( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_cunglq( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1, childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -22
+              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -24
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lrwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNCSD', -info )
+              return
+           else if( lquery .or. lrquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_cunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_clacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_cunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_clacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 if( m-p > q ) then
+                    call stdlib_clacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
+                              ldv2t )
+                 end if
+                 if( m > q ) then
+                    call stdlib_cunglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                              lorglqwork, info )
+                 end if
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_clacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_cunglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_clacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_cunglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_clacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_cungqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 p1 = min( p+1, m )
+                 q1 = min( q+1, m )
+                 call stdlib_clacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 if ( m > p+q ) then
+                    call stdlib_clacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
+                              
+                 end if
+                 call stdlib_cungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_cbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
+           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
+           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
+                     lbbcsdwork, info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_clapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_clapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_clapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_cuncsd
+     end subroutine stdlib_cuncsd
+
+     !> CUNGHR: generates a complex unitary matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> CGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_cunghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'CUNGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors cone
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = czero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = czero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_cungqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunghr
+
+     !> CUNGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> CHETRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_cungtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if ( upper ) then
+                nb = stdlib_ilaenv( 1, 'CUNGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                nb = stdlib_ilaenv( 1, 'CUNGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_chetrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = czero
+              end do
+              a( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_cungql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_chetrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = czero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = cone
+              do i = 2, n
+                 a( i, 1 ) = czero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_cungqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cungtr
+
+     !> CUNHR_COL: takes an M-by-N complex matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as CGEQRT).
+
+     pure subroutine stdlib_cunhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the unitary
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_claunhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_ctrsm( 'R', 'U', 'N', 'N', m-n, n, cone, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_ccopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==cone ) then
+                    call stdlib_cscal( j-jbtemp1, -cone, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_ctrsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_ctrsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = czero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_ctrsm( 'R', 'L', 'C', 'U', jnb, jnb, cone,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_cunhr_col
+
+     !> CUNMHR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by CGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_cunmhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_cunmqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmhr
+
+     !> CUNMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by CHETRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_cunmtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'CUNMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'CUNMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_chetrd with uplo = 'u'
+              call stdlib_cunmql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_chetrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_cunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmtr
+
+     !> CUPGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> CHPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_cupgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: ap(*), tau(*)
+           complex(sp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_chptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = czero
+              end do
+              q( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_cung2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_chptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = cone
+              do i = 2, n
+                 q( i, 1 ) = czero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = czero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_cung2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_cupgtr
+
+     !> CUPMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by CHPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_cupmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ap(*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           complex(sp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_chptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 call stdlib_clarf( side, mi, ni, ap( ii-i+1 ), 1, taui, c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_chptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              loop_20: do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 call stdlib_clarf( side, mi, ni, ap( ii ), 1, taui, c( ic, jc ),ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_cupmtr
+
+     !> CGBBRD: reduces a complex general m-by-n band matrix A to real upper
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**H, or computes
+     !> Q**H*C for a given matrix C.
+
+     pure subroutine stdlib_cgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*), rwork(*)
+           complex(sp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           complex(sp), intent(out) :: pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mu,&
+                      mu0, nr, nrt
+           real(sp) :: abst, rc
+           complex(sp) :: ra, rb, rs, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**h to the unit matrix, if needed
+           if( wantq )call stdlib_claset( 'FULL', m, m, czero, cone, q, ldq )
+           if( wantpt )call stdlib_claset( 'FULL', n, n, czero, cone, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the complex sines of the plane rotations are stored in work,
+              ! and the real cosines in rwork.
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_clargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              rwork( j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_clartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,rwork( j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_clartg( ab( ku+ml-1, i ), ab( ku+ml, i ),rwork( i+ml-1 ), &
+                                    work( i+ml-1 ), ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_crot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,rwork( i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_crot( m, q( 1, j-1 ), 1, q( 1, j ), 1,rwork( j ), conjg( &
+                                    work( j ) ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_crot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,rwork( j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = rwork( j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_clargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               rwork( j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_clartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,rwork( j1+kun ), work( j1+kun ), kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_clartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),rwork( &
+                                    i+mu-1 ), work( i+mu-1 ), ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_crot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,rwork( i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**h
+                       do j = j1, j2, kb1
+                          call stdlib_crot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, rwork(&
+                                     j+kun ),conjg( work( j+kun ) ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = rwork( j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to complex lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, overwriting superdiagonal
+              ! elements on subdiagonal elements
+              do i = 1, min( m-1, n )
+                 call stdlib_clartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 ab( 1, i ) = ra
+                 if( i<n ) then
+                    ab( 2, i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_crot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc,conjg( rs ) )
+                           
+                 if( wantc )call stdlib_crot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+           else
+              ! a has been reduced to complex upper bidiagonal form or is
+              ! diagonal
+              if( ku>0 .and. m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_clartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    ab( ku+1, i ) = ra
+                    if( i>1 ) then
+                       rb = -conjg( rs )*ab( ku, i )
+                       ab( ku, i ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_crot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, &
+                              conjg( rs ) )
+                 end do
+              end if
+           end if
+           ! make diagonal and superdiagonal elements real, storing them in d
+           ! and e
+           t = ab( ku+1, 1 )
+           loop_120: do i = 1, minmn
+              abst = abs( t )
+              d( i ) = abst
+              if( abst/=zero ) then
+                 t = t / abst
+              else
+                 t = cone
+              end if
+              if( wantq )call stdlib_cscal( m, t, q( 1, i ), 1 )
+              if( wantc )call stdlib_cscal( ncc, conjg( t ), c( i, 1 ), ldc )
+              if( i<minmn ) then
+                 if( ku==0 .and. kl==0 ) then
+                    e( i ) = zero
+                    t = ab( 1, i+1 )
+                 else
+                    if( ku==0 ) then
+                       t = ab( 2, i )*conjg( t )
+                    else
+                       t = ab( ku, i+1 )*conjg( t )
+                    end if
+                    abst = abs( t )
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( wantpt )call stdlib_cscal( n, t, pt( i+1, 1 ), ldpt )
+                    t = ab( ku+1, i+1 )*conjg( t )
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_cgbbrd
+
+     !> CGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_cgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_cgbmv( trans, n, n, kl, ku, -cone, ab, ldab, x( 1, j ), 1,cone, work, 1 &
+                        )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = cabs1( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       rwork( i ) = rwork( i ) + cabs1( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + cabs1( ab( kk+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv, work, n,info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_cgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cgbtrs( transn, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cgbrfs
+
+     !> CGBSV: computes the solution to a complex system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_cgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_cgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_cgbsv
+
+     !> CGBSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(inout) :: c(*), r(*)
+           complex(sp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+        ! moved setting of info = n+1 so info does not subsequently get
+        ! overwritten.  sven, 17 mar 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(sp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_cgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_claqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_ccopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_cgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_clantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_clangb( norm, n, kl, ku, ab, ldab, rwork )
+           rpvgrw = stdlib_clantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_clangb( 'M', n, kl, ku, ab, ldab, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, rwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_cgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_cgbsvx
+
+     !> CGEBRD: reduces a general complex M-by-N matrix A to upper or lower
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_cgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'CGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=sp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'CGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'CGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'CGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+ib-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_clabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+ib:m,i+ib:n), using
+              ! an update of the form  a := a - v*y**h - x*u**h
+              call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-i-nb+1,n-i-nb+1, nb, -&
+              cone, a( i+nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, cone,a( i+nb, i+nb ), lda )
+                        
+              call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -cone, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,cone, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_cgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_cgebrd
+
+     !> CGEHRD: reduces a complex general matrix A to upper Hessenberg form H by
+     !> an unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_cgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           complex(sp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'CGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to czero
+           do i = 1, ilo - 1
+              tau( i ) = czero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = czero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'CGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'CGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin+tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**h
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_clahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**h. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = cone
+                 call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',ihi, ihi-i-ib+1,ib, -&
+                           cone, work, ldwork, a( i+ib, i ), lda, cone,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_ctrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', i, ib-1,cone, &
+                           a( i+1, i ), lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_caxpy( i, -cone, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_clarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, &
+                 n-i-ib+1, ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, &
+                           ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_cgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cgehrd
+
+     !> CGELQT: computes a blocked LQ factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_cgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. (mb>min(m,n) .and. min(m,n)>0 ))then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_cgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_clarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_cgelqt
+
+     !> CGELS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, or its conjugate-transpose, using a QR
+     !> or LQ factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**H * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**H * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_cgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(sp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(sp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn+max( mn, nrhs ) ) .and..not.lquery ) then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'CUNMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'CUNMQR', 'LC', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'CGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'CUNMLQ', 'LC', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'CUNMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn + max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_claset( 'FULL', max( m, n ), nrhs, czero, czero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_claset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_clange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_cgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+                 call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, n, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_ctrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**h) * b(1:n,1:nrhs)
+                 call stdlib_ctrtrs( 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT',n, nrhs, a, lda, b,&
+                            ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_cunmqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_cgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_ctrtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**h * b(1:m,1:nrhs)
+                 call stdlib_cunmlq( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, m, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**h * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_cunmlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**h) * b(1:m,1:nrhs)
+                 call stdlib_ctrtrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',m, nrhs, a, lda, &
+                           b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=sp)
+           return
+     end subroutine stdlib_cgels
+
+     !> CGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_cgeqp3( m, n, a, lda, jpvt, tau, work, lwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = n + 1
+                 nb = stdlib_ilaenv( inb, 'CGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = ( n + 1 )*nb
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=sp)
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_cswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_cgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_cgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_cunm2r( 'left', 'conjugate transpose', m, n-na,
+      ! cc  $                   na, a, lda, tau, a( 1, na+1 ), lda, work,
+      ! cc  $                   info )
+                 call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, n-na, na, a,lda, tau, a( 1,&
+                            na+1 ), lda, work, lwork,info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'CGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'CGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = lwork / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'CGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 rwork( j ) = stdlib_scnrm2( sm, a( nfxd+1, j ), 1 )
+                 rwork( n+j ) = rwork( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_claqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , rwork( j ),rwork( n+j ), work( 1 ), work( jb+1 ),n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_claqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         rwork( j ), rwork( n+j ), work( 1 ) )
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cgeqp3
+
+     !> CGEQRT: computes a blocked QR factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_cgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_cgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_cgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**h to a(i:m,i+ib:n) from the left
+                 call stdlib_clarfb( 'L', 'C', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_cgeqrt
+
+     !> CGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_cgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_cgemv( trans, n, n, -cone, a, lda, x( 1, j ), 1, cone, work,1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cgetrs( trans, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_cgetrs( transt, n, 1, af, ldaf, ipiv, work, n,info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cgetrs( transn, n, 1, af, ldaf, ipiv, work, n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cgerfs
+
+     !> CGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_cgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'CGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_cgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_cgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_claswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_claswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_ctrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_cgetrf
+
+     !> CGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_cggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           complex(sp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'CGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'CGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'CUNMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'CUNMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = czero
+              end do
+              do i = 1, p
+                 y(i) = czero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**h*a = ( r11 ) m,    q**h*b*z**h = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_cggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = real( work( m+np+1 ),KIND=sp)
+           ! update left-hand-side vector d = q**h*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, m, a, lda, work,d, max( 1, n )&
+                     , work( m+np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_ctrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_ccopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = czero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_cgemv( 'NO TRANSPOSE', m, n-m, -cone, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1,&
+                      cone, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_ctrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_ccopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**h *y
+           call stdlib_cunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), &
+                     ldb, work( m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_cggglm
+
+     !> CGGHD3: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then CGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of CGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_cgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(sp) :: c
+           complex(sp) :: c1, c2, ctemp, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'CGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_claset( 'ALL', n, n, czero, cone, q, ldq )
+           if( initz )call stdlib_claset( 'ALL', n, n, czero, cone, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_claset( 'LOWER', n-1, n-1, czero, czero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = cone
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'CGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'CGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'CGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'CGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small unitary factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_claset( 'ALL', nblst, nblst, czero, cone, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_claset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                              
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_clartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = cmplx( c,KIND=sp)
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       ctemp = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = ctemp*temp - s*work( jj )
+                          work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = ctemp*temp - s*work( jj )
+                             work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = ctemp*temp - conjg( s )*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + ctemp*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_clartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = czero
+                          call stdlib_crot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = cmplx( c,KIND=sp)
+                          b( jj+1, j ) = -conjg( s )
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       ctemp = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + conjg( s2 )*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + conjg( s1 )*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = ctemp*temp1 + conjg( s )*temp
+                          a( k, j+i ) = -s*temp1 + ctemp*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          c = real( a( j+1+i, j ),KIND=sp)
+                          call stdlib_crot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, c,-&
+                                    conjg( b( j+1+i, j ) ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated unitary
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_cgemv( 'CONJUGATE', nblst, len, cone, work,nblst, a( jrow, j+1 &
+                                 ), 1, czero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_ctrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_cgemv( 'CONJUGATE', len, nblst-len, cone,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, cone,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated unitary
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_ctrmv( 'UPPER', 'CONJUGATE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_ctrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_cgemv( 'CONJUGATE', nnb, len, cone,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,cone, work( pw ), 1 )
+                          call stdlib_cgemv( 'CONJUGATE', len, nnb, cone,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, cone,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated unitary matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', nblst,cola, nblst, cone, work, &
+                           nblst,a( j, jcol+nnb ), lda, czero, work( pw ),nblst )
+                 call stdlib_clacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_cunm22( 'LEFT', 'CONJUGATE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_cgemm( 'CONJUGATE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, cone, &
+                       work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, czero, work( pw ),2*nnb )
+                                 
+                       call stdlib_clacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated unitary matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, q( &
+                              topq, j ), ldq,work, nblst, czero, work( pw ), nh )
+                    call stdlib_clacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_cunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, q( topq, j ), ldq,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_clacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small unitary factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_claset( 'ALL', nblst, nblst, czero, cone, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_claset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          ctemp = a( i, j )
+                          a( i, j ) = czero
+                          s = b( i, j )
+                          b( i, j ) = czero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = ctemp*temp -conjg( s )*work( jj )
+                             work( jj ) = s*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             ctemp = a( i, j )
+                             a( i, j ) = czero
+                             s = b( i, j )
+                             b( i, j ) = czero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = ctemp*temp -conjg( s )*work( jj )
+                                work( jj ) = s*temp + ctemp*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_claset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated unitary matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, a( &
+                              1, j ), lda,work, nblst, czero, work( pw ), top )
+                    call stdlib_clacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_cunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, a( 1, j ), lda,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_clacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, b( &
+                              1, j ), ldb,work, nblst, czero, work( pw ), top )
+                    call stdlib_clacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_cunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, b( 1, j ), ldb,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_clacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated unitary matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, z( &
+                              topq, j ), ldz,work, nblst, czero, work( pw ), nh )
+                    call stdlib_clacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_cunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, z( topq, j ), ldz,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_clacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_cgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cgghd3
+
+     !> CGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_cgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           complex(sp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'CGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'CUNMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'CUNMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**h = (  0  t12 ) p   z**h*a*q**h = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_cggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = real( work( p+mn+1 ),KIND=sp)
+           ! update c = z**h *c = ( c1 ) n-p
+                             ! ( c2 ) m+p-n
+           call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, 1, mn, a, lda,work( p+1 ), c, &
+                     max( 1, m ), work( p+mn+1 ),lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_ctrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+           call stdlib_ccopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_cgemv( 'NO TRANSPOSE', n-p, p, -cone, a( 1, n-p+1 ), lda,d, 1, cone, c, &
+                        1 )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_ctrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_ccopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_cgemv( 'NO TRANSPOSE', nr, n-m, -cone, a( n-p+1, m+1 ),lda, d(&
+                         nr+1 ), 1, cone, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_ctrmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_caxpy( nr, -cone, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**h*x
+           call stdlib_cunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, p, b, ldb,work( 1 ), x, n, &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_cgglse
+
+     !> CGTCON: estimates the reciprocal of the condition number of a complex
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> CGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_cgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==cmplx( zero,KIND=sp) )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_cgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**h)*inv(u**h).
+                 call stdlib_cgttrs( 'CONJUGATE TRANSPOSE', n, 1, dl, d, du, du2,ipiv, work, n, &
+                           info )
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_cgtcon
+
+     !> CGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_cgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_clagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work, n )
+                        
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              du( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( dl( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( du( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( dl( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              dl( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( du( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( dl( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( du( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cgttrs( trans, n, 1, dlf, df, duf, du2, ipiv, work, n,info )
+                 call stdlib_caxpy( n, cmplx( one,KIND=sp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_cgttrs( transt, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_cgttrs( transn, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_cgtrfs
+
+     !> CGTSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_cgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           complex(sp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_ccopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_ccopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_ccopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_cgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_clangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_cgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cgtsvx
+
+     !> CHBGST: reduces a complex Hermitian-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**H*S by CPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**H*A*X, where
+     !> X = S**(-1)*Q and Q is a unitary matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_chbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+           complex(sp), intent(in) :: bb(ldbb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(sp) :: bii
+           complex(sp) :: ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min,real
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_claset( 'FULL', n, n, czero, cone, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_cpbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**h*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**h*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the real cosines and complex sines of the rotations are stored in
+           ! the arrays rwork and work, those of the 1st set in elements
+           ! 2:m-kb-1, and those of the 2nd set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=sp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=sp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*conjg( ab( k-i+ka1, &
+                       i ) ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1, i ) +real( ab( ka1, i ),&
+                                 KIND=sp)*bb( j-i+kb1, i )*conjg( bb( k-i+kb1, i ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1,&
+                                  i )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_csscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_cgerc( n-m, kbt, -cone, x( m+1, i ), 1,bb( kb1-kbt, i )&
+                              , 1, x( m+1, i-kbt ),ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_clartg( ab( k+1, i-k+ka ), ra1,rwork( i-k+ka-m ), work( i-k+ka-&
+                                 m ), ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( 1, i-k+ka &
+                                 )
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_clargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,rwork(&
+                            j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2-m ),work( j2-m ), ka1 )
+                    call stdlib_clacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_crot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), &
+                                 conjg( work( j-m ) ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, rwork( j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_clargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,rwork( j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2 ),work( j2 ), ka1 )
+                    call stdlib_clacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_crot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), conjg( &
+                                 work( j ) ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=sp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=sp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*conjg( ab( i-k+1,k ) ) - &
+                       conjg( bb( i-k+1, k ) )*ab( i-j+1, j ) + real( ab( 1, i ),KIND=sp)*bb( i-j+&
+                                 1, j )*conjg( bb( i-k+1,k ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -conjg( bb( i-k+1, k ) )*ab( i-j+1, j )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_csscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_cgeru( n-m, kbt, -cone, x( m+1, i ), 1,bb( kbt+1, i-&
+                              kbt ), ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_clartg( ab( ka1-k, i ), ra1, rwork( i-k+ka-m ),work( i-k+ka-m )&
+                                 , ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( ka1, i-k )
+                                 
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_clargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           rwork( j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2-m ), work( j2-m ), ka1 )
+                    call stdlib_clacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_crot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), work(&
+                                  j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,rwork( j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_clargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,rwork( j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2 ), work( j2 ), ka1 )
+                    call stdlib_clacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_crot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=sp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=sp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*conjg( ab( i-k+ka1, &
+                       k ) ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1, j ) +real( ab( ka1, i ),&
+                                 KIND=sp)*bb( i-j+kb1, j )*conjg( bb( i-k+kb1, k ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1,&
+                                  j )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_csscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_cgeru( nx, kbt, -cone, x( 1, i ), 1,bb( kb, i+1 ), &
+                              ldbb-1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_clartg( ab( k+1, i ), ra1, rwork( i+k-ka ),work( i+k-ka ), ra )
+                                 
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( 1, i+k )
+                                 
+                       ab( 1, i+k ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_clargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( j1 ), work( j1 ),ka1 )
+                    call stdlib_clacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_crot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_clargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, rwork( m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_clacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_crot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=sp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=sp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*conjg( ab( k-i+1,i ) ) - &
+                       conjg( bb( k-i+1, i ) )*ab( j-i+1, i ) + real( ab( 1, i ),KIND=sp)*bb( j-i+&
+                                 1, i )*conjg( bb( k-i+1,i ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -conjg( bb( k-i+1, i ) )*ab( j-i+1, i )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_csscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_cgerc( nx, kbt, -cone, x( 1, i ), 1, bb( 2, i ),1, x( &
+                              1, i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_clartg( ab( ka1-k, i+k-ka ), ra1,rwork( i+k-ka ), work( i+k-ka &
+                                 ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( ka1, i+k-&
+                                 ka )
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_clargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               j1 ),work( j1 ), ka1 )
+                    call stdlib_clacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_crot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), conjg( work(&
+                                  j ) ) )
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_clargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, rwork( m-kb+&
+                              j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_clartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_clar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_clacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_crot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), conjg( &
+                                 work( m-kb+j ) ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_clartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_chbgst
+
+     !> CHBTRD: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_chbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           complex(sp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(sp) :: abst
+           complex(sp) :: t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,real
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_claset( 'FULL', n, n, czero, cone, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the real cosines and complex sines of the plane rotations are
+           ! stored in the arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( kd1, 1 ) = real( ab( kd1, 1 ),KIND=sp)
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_clargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_clartv or stdlib_crot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_clartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_crot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_clartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_crot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_clar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          call stdlib_clacgv( nr, work( j1 ), kd1 )
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_clartv or stdlib_crot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_clartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_crot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_crot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_crot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          conjg( work( j ) ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_crot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), conjg( &
+                                          work( j ) ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( kd, i+1 )
+                    abst = abs( t )
+                    ab( kd, i+1 ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( kd, i+2 ) = ab( kd, i+2 )*t
+                    if( wantq ) then
+                       call stdlib_cscal( n, conjg( t ), q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( kd1, i ),KIND=sp)
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( 1, 1 ) = real( ab( 1, 1 ),KIND=sp)
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_clargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_clartv or stdlib_crot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_clartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_crot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_clartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_crot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_clar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_clartv or stdlib_crot is used
+                       if( nr>0 ) then
+                          call stdlib_clacgv( nr, work( j1 ), kd1 )
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_clartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_crot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_crot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_crot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_crot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( 2, i )
+                    abst = abs( t )
+                    ab( 2, i ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( 2, i+1 ) = ab( 2, i+1 )*t
+                    if( wantq ) then
+                       call stdlib_cscal( n, t, q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( 1, i ),KIND=sp)
+              end do
+           end if
+           return
+     end subroutine stdlib_chbtrd
+
+     !> CHECON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_checon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_chetrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_checon
+
+     !> CHECON_ROOK: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_checon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHECON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_chetrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_checon_rook
+
+     !> CHEEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.
+
+     subroutine stdlib_cheev( jobz, uplo, n, a, lda, w, work, lwork, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=sp)
+              work( 1 ) = 1
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_clascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_chetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_chetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_cungtr to generate the unitary matrix, then call stdlib_csteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_cungtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              indwrk = inde + n
+              call stdlib_csteqr( jobz, n, w, rwork( inde ), a, lda,rwork( indwrk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cheev
+
+     !> CHEEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> CHEEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to CHETRD.  Then, whenever possible, CHEEVR calls CSTEMR to compute
+     !> the eigenspectrum using Relatively Robust Representations.  CSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see CSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : CHEEVR calls CSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> CHEEVR calls SSTEBZ and CSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of CSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_cheevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,rwork, lrwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lrwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indibl, indifl, indisp, indiwo, indrd, indrdd, &
+           indre, indree, indrwk, indtau, indwk, indwkn, iscale, itmp1, j, jj, liwmin, llwork, &
+                     llrwork, llwrkn, lrwmin, lwkopt, lwmin, nb, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'CHEEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( lrwork==-1 ) .or.( liwork==-1 ) )
+           lrwmin = max( 1, 24*n )
+           liwmin = max( 1, 10*n )
+           lwmin = max( 1, 2*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'CUNMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -20
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -22
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 2
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=sp)
+              else
+                 if( vl<real( a( 1, 1 ),KIND=sp) .and. vu>=real( a( 1, 1 ),KIND=sp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=sp)
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_clansy( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_csscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_csscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_ssterf or stdlib_cstemr fail.
+           ! work(indtau:indtau+n-1) stores the complex scalar factors of the
+           ! elementary reflectors used in stdlib_chetrd.
+           indtau = 1
+           ! indwk is the starting offset of the remaining complex workspace,
+           ! and llwork is the remaining complex workspace size.
+           indwk = indtau + n
+           llwork = lwork - indwk + 1
+           ! rwork(indrd:indrd+n-1) stores the real tridiagonal's diagonal
+           ! entries.
+           indrd = 1
+           ! rwork(indre:indre+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_chetrd.
+           indre = indrd + n
+           ! rwork(indrdd:indrdd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_cstemr (the stdlib_ssterf path copies the diagonal to w).
+           indrdd = indre + n
+           ! rwork(indree:indree+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_ssterf and stdlib_cstemr.
+           indree = indrdd + n
+           ! indrwk is the starting offset of the left-over real workspace, and
+           ! llrwork is the remaining workspace size.
+           indrwk = indree + n
+           llrwork = lrwork - indrwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_sstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_sstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_sstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_chetrd to reduce hermitian matrix to tridiagonal form.
+           call stdlib_chetrd( uplo, n, a, lda, rwork( indrd ), rwork( indre ),work( indtau ), &
+                     work( indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_ssterf or stdlib_cstemr and stdlib_cunmtr.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig.or.test ) .and. ( ieeeok==1 ) ) then
+              if( .not.wantz ) then
+                 call stdlib_scopy( n, rwork( indrd ), 1, w, 1 )
+                 call stdlib_scopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_ssterf( n, w, rwork( indree ), info )
+              else
+                 call stdlib_scopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_scopy( n, rwork( indrd ), 1, rwork( indrdd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_cstemr( jobz, 'A', n, rwork( indrdd ),rwork( indree ), vl, vu, il, &
+                 iu, m, w,z, ldz, n, isuppz, tryrac,rwork( indrwk ), llrwork,iwork, liwork, info )
+                           
+                 ! apply unitary matrix used in reduction to tridiagonal
+                 ! form to eigenvectors returned by stdlib_cstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = indwk
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_cunmtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_cstein.
+           ! also call stdlib_sstebz and stdlib_cstein if stdlib_cstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indrd ), rwork( &
+           indre ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwo )&
+                     , info )
+           if( wantz ) then
+              call stdlib_cstein( n, rwork( indrd ), rwork( indre ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_cstein.
+              indwkn = indwk
+              llwrkn = lwork - indwkn + 1
+              call stdlib_cunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_cheevr
+
+     !> CHEEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_cheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, llwork, lwkmin, lwkopt, nb, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 2*n
+                 nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'CUNMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( 1, ( nb + 1 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=sp)
+              else if( valeig ) then
+                 if( vl<real( a( 1, 1 ),KIND=sp) .and. vu>=real( a( 1, 1 ),KIND=sp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=sp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_clanhe( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_csscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_csscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_chetrd to reduce hermitian matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_chetrd( uplo, n, a, lda, rwork( indd ), rwork( inde ),work( indtau ), work(&
+                      indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_ssterf or stdlib_cungtr and stdlib_csteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_scopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_ssterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_clacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_cungtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_csteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_cstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_cstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_cstein.
+              call stdlib_cunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwrk ), llwork, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cheevx
+
+     !> CHEGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian and B is also
+     !> positive definite.
+
+     subroutine stdlib_chegv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork== -1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n-1 ) .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_cpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_cheev( jobz, uplo, n, a, lda, w, work, lwork, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ctrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h*y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ctrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chegv
+
+     !> CHEGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_chegvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_cpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_cheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, rwork, iwork, ifail,info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ctrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h*y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ctrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chegvx
+
+     !> CHERFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_cherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_chemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=sp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=sp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_chetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_chetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_chetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_cherfs
+
+     !> CHESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_chesv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHESV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_chetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_chetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_chetrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chesv
+
+     !> CHESV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**H)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**H)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> CHETRF_RK is called to compute the factorization of a complex
+     !> Hermitian matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine CHETRS_3.
+
+     pure subroutine stdlib_chesv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_chetrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=sp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHESV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_chetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_chetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chesv_rk
+
+     !> CHESV_ROOK: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The bounded Bunch-Kaufman ("rook") diagonal pivoting method is used
+     !> to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> CHETRF_ROOK is called to compute the factorization of a complex
+     !> Hermition matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling CHETRS_ROOK (uses BLAS 2).
+
+     pure subroutine stdlib_chesv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'CHETRF_ROOK', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHESV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_chetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs ( use level blas 2)
+              call stdlib_chetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chesv_rook
+
+     !> CHESVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_chesvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(inout) :: af(ldaf,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'CHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHESVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_clacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_chetrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanhe( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_checon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_chetrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_cherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chesvx
+
+     !> CHGEQZ: computes the eigenvalues of a complex matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the single-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a complex matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by CGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices and S and P are upper triangular.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from CGGHRD that reduced
+     !> the matrix pair (A,B) to generalized Hessenberg form, then the output
+     !> matrices Q1*Q and Z1*Z are the unitary factors from the generalized
+     !> Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T)
+     !> (equivalently, of (A,B)) are computed as a pair of complex values
+     !> (alpha,beta).  If beta is nonzero, lambda = alpha / beta is an
+     !> eigenvalue of the generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> The values of alpha and beta for the i-th eigenvalue can be read
+     !> directly from the generalized Schur form:  alpha = S(i,i),
+     !> beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_chgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alpha, beta, q, ldq,&
+                z, ldz, work, lwork,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: alpha(*), beta(*), work(*)
+           complex(sp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(sp) :: absb, anorm, ascale, atol, bnorm, bscale, btol, c, safmin, temp, temp2, &
+                     tempr, ulp
+           complex(sp) :: abi22, ad11, ad12, ad21, ad22, ctemp, ctemp2, ctemp3, eshift, s, shift, &
+                     signbc, u12, x, abi12, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,min,real,sqrt
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=sp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ilschr = .true.
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              ilq = .true.
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              ilz = .true.
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -16
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           ! work( 1 ) = cmplx( 1,KIND=sp)
+           if( n<=0 ) then
+              work( 1 ) = cmplx( 1,KIND=sp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_claset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_claset( 'FULL', n, n, czero, cone, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_slamch( 'S' )
+           ulp = stdlib_slamch( 'E' )*stdlib_slamch( 'B' )
+           anorm = stdlib_clanhs( 'F', in, h( ilo, ilo ), ldh, rwork )
+           bnorm = stdlib_clanhs( 'F', in, t( ilo, ilo ), ldt, rwork )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_cscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_cscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_cscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_cscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 190
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever
+              ! row operations modify columns whatever:ilastm
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = czero
+           maxit = 30*( ihi-ilo+1 )
+           loop_170: do jiter = 1, maxit
+              ! check for too many iterations.
+              if( jiter>maxit )go to 180
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              ! special case: j=ilast
+              if( ilast==ilo ) then
+                 go to 60
+              else
+                 if( abs1( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs1( h( ilast, ilast ) ) + &
+                           abs1( h( ilast-1, ilast-1 )) ) ) ) then
+                    h( ilast, ilast-1 ) = czero
+                    go to 60
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = czero
+                 go to 50
+              end if
+              ! general case: j<ilast
+              loop_40: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs1( h( j, j-1 ) )<=max( safmin, ulp*(abs1( h( j, j ) ) + abs1( h( j-1, &
+                              j-1 ) )) ) ) then
+                       h( j, j-1 ) = czero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = czero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       if( abs1( h( j, j-1 ) )*( ascale*abs1( h( j+1,j ) ) )<=abs1( h( j, j ) )*( &
+                                 ascale*atol ) )ilazr2 = .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          ctemp = h( jch, jch )
+                          call stdlib_clartg( ctemp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = czero
+                          call stdlib_crot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_crot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_crot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs1( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 60
+                             else
+                                ifirst = jch + 1
+                                go to 70
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = czero
+                       end do
+                       go to 50
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          ctemp = t( jch, jch+1 )
+                          call stdlib_clartg( ctemp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = czero
+                          if( jch<ilastm-1 )call stdlib_crot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_crot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_crot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          ctemp = h( jch+1, jch )
+                          call stdlib_clartg( ctemp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = czero
+                          call stdlib_crot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_crot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_crot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 50
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 70
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_40
+              ! (drop-through is "impossible")
+              info = 2*n + 1
+              go to 210
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              50 continue
+              ctemp = h( ilast, ilast )
+              call stdlib_clartg( ctemp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = czero
+              call stdlib_crot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_crot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_crot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alpha and beta
+              60 continue
+              absb = abs( t( ilast, ilast ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( ilast, ilast ) / absb )
+                 t( ilast, ilast ) = absb
+                 if( ilschr ) then
+                    call stdlib_cscal( ilast-ifrstm, signbc, t( ifrstm, ilast ), 1 )
+                    call stdlib_cscal( ilast+1-ifrstm, signbc, h( ifrstm, ilast ),1 )
+                 else
+                    call stdlib_cscal( 1, signbc, h( ilast, ilast ), 1 )
+                 end if
+                 if( ilz )call stdlib_cscal( n, signbc, z( 1, ilast ), 1 )
+              else
+                 t( ilast, ilast ) = czero
+              end if
+              alpha( ilast ) = h( ilast, ilast )
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 190
+              ! reset counters
+              iiter = 0
+              eshift = czero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 160
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast.  we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              70 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute the shift.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10/=iiter ) then
+                 ! the wilkinson shift (aep p.512_sp), i.e., the eigenvalue of
+                 ! the bottom-right 2x2 block of a inv(b) which is nearest to
+                 ! the bottom-right element.
+                 ! we factor b as u*d, where u has unit diagonals, and
+                 ! compute (a*inv(d))*inv(u).
+                 u12 = ( bscale*t( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 abi22 = ad22 - u12*ad21
+                 abi12 = ad12 - u12*ad11
+                 shift = abi22
+                 ctemp = sqrt( abi12 )*sqrt( ad21 )
+                 temp = abs1( ctemp )
+                 if( ctemp/=zero ) then
+                    x = half*( ad11-shift )
+                    temp2 = abs1( x )
+                    temp = max( temp, abs1( x ) )
+                    y = temp*sqrt( ( x / temp )**2+( ctemp / temp )**2 )
+                    if( temp2>zero ) then
+                       if( real( x / temp2,KIND=sp)*real( y,KIND=sp)+aimag( x / temp2 )*aimag( y )&
+                                 <zero )y = -y
+                    end if
+                    shift = shift - ctemp*stdlib_cladiv( ctemp, ( x+y ) )
+                 end if
+              else
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( iiter / 20 )*20==iiter .and.bscale*abs1(t( ilast, ilast ))>safmin ) &
+                           then
+                    eshift = eshift + ( ascale*h( ilast,ilast ) )/( bscale*t( ilast, ilast ) )
+                              
+                 else
+                    eshift = eshift + ( ascale*h( ilast,ilast-1 ) )/( bscale*t( ilast-1, ilast-1 )&
+                               )
+                 end if
+                 shift = eshift
+              end if
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 ctemp = ascale*h( j, j ) - shift*( bscale*t( j, j ) )
+                 temp = abs1( ctemp )
+                 temp2 = ascale*abs1( h( j+1, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs1( h( j, j-1 ) )*temp2<=temp*atol )go to 90
+              end do
+              istart = ifirst
+              ctemp = ascale*h( ifirst, ifirst ) -shift*( bscale*t( ifirst, ifirst ) )
+              90 continue
+              ! do an implicit-shift qz sweep.
+              ! initial q
+              ctemp2 = ascale*h( istart+1, istart )
+              call stdlib_clartg( ctemp, ctemp2, c, s, ctemp3 )
+              ! sweep
+              loop_150: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    ctemp = h( j, j-1 )
+                    call stdlib_clartg( ctemp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = czero
+                 end if
+                 do jc = j, ilastm
+                    ctemp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -conjg( s )*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = ctemp
+                    ctemp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -conjg( s )*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = ctemp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       ctemp = c*q( jr, j ) + conjg( s )*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = ctemp
+                    end do
+                 end if
+                 ctemp = t( j+1, j+1 )
+                 call stdlib_clartg( ctemp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = czero
+                 do jr = ifrstm, min( j+2, ilast )
+                    ctemp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -conjg( s )*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = ctemp
+                 end do
+                 do jr = ifrstm, j
+                    ctemp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -conjg( s )*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = ctemp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       ctemp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -conjg( s )*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = ctemp
+                    end do
+                 end if
+              end do loop_150
+              160 continue
+           end do loop_170
+           ! drop-through = non-convergence
+           180 continue
+           info = ilast
+           go to 210
+           ! successful completion of all qz steps
+           190 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_cscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_cscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_cscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_cscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           210 continue
+           work( 1 ) = cmplx( n,KIND=sp)
+           return
+     end subroutine stdlib_chgeqz
+
+     !> CHPCON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian packed matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_chpcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(sp), intent(in) :: ap(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_clacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_chptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_chpcon
+
+     !> CHPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix in packed storage.
+
+     subroutine stdlib_chpev( jobz, uplo, n, ap, w, z, ldz, work, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=sp)
+              rwork( 1 ) = 1
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_csscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_chptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           call stdlib_chptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_cupgtr to generate the orthogonal matrix, then call stdlib_csteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_cupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              indrwk = inde + n
+              call stdlib_csteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_chpev
+
+     !> CHPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A in packed storage.
+     !> Eigenvalues/vectors can be selected by specifying either a range of
+     !> values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_chpevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, rwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( ap( 1 ),KIND=sp)
+              else
+                 if( vl<real( ap( 1 ),KIND=sp) .and. vu>=real( ap( 1 ),KIND=sp) ) then
+                    m = 1
+                    w( 1 ) = real( ap( 1 ),KIND=sp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if ( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           endif
+           anrm = stdlib_clanhp( 'M', uplo, n, ap, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_csscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_chptrd to reduce hermitian packed matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           call stdlib_chptrd( uplo, n, ap, rwork( indd ), rwork( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_cupgtr and stdlib_csteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_scopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_ssterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_cupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_csteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_cstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_cstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_cstein.
+              indwrk = indtau + n
+              call stdlib_cupmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_chpevx
+
+     !> CHPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_chpgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*), bp(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_cpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_chpev( jobz, uplo, n, ap, w, z, ldz, work, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ctpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h*y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ctpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_chpgv
+
+     !> CHPGVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.  Eigenvalues and eigenvectors can be selected by
+     !> specifying either a range of values or a range of indices for the
+     !> desired eigenvalues.
+
+     subroutine stdlib_chpgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*), bp(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_cpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_chpevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, rwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, m
+                    call stdlib_ctpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h*y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_ctpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_chpgvx
+
+     !> CHPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_chprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(sp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_ccopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_chpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=sp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=sp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_chptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_caxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_clacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_clacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_chptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_chptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_chprfs
+
+     !> CHPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is Hermitian and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_chpsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_chptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_chptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_chpsv
+
+     !> CHPSVX: uses the diagonal pivoting factorization A = U*D*U**H or
+     !> A = L*D*L**H to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_chpsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(sp), intent(inout) :: afp(*)
+           complex(sp), intent(in) :: ap(*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_ccopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_chptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_chpcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_chptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_chprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_chpsvx
+
+     !> CHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a complex upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_chsein( side, eigsrc, initv, select, n, h, ldh, w, vl,ldvl, vr, ldvr, mm, &
+               m, work, rwork, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(in) :: h(ldh,*)
+           complex(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), w(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ks, ldwork
+           real(sp) :: eps3, hnorm, smlnum, ulp, unfl
+           complex(sp) :: cdum, wk
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -12
+           else if( mm<m ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ldwork = n
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ks = 1
+           loop_100: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are czero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==czero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==czero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_clanhs( 'I', kr-kl+1, h( kl, kl ), ldh, rwork )
+                    if( stdlib_sisnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( (hnorm>rzero) ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wk = w( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. cabs1( w( i )-wk )<eps3 ) then
+                       wk = wk + eps3
+                       go to 60
+                    end if
+                 end do
+                 w( k ) = wk
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_claein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wk, vl( kl, ks )&
+                              , work, ldwork, rwork, eps3,smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifaill( ks ) = k
+                    else
+                       ifaill( ks ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ks ) = czero
+                    end do
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_claein( .true., noinit, kr, h, ldh, wk, vr( 1, ks ),work, ldwork, &
+                              rwork, eps3, smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifailr( ks ) = k
+                    else
+                       ifailr( ks ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ks ) = czero
+                    end do
+                 end if
+                 ks = ks + 1
+              end if
+           end do loop_100
+           return
+     end subroutine stdlib_chsein
+
+     !> Using the divide and conquer method, CLAED0: computes all eigenvalues
+     !> of a symmetric tridiagonal matrix which is one diagonal block of
+     !> those from reducing a dense or band Hermitian matrix and
+     !> corresponding eigenvectors of the dense or band matrix.
+
+     pure subroutine stdlib_claed0( qsiz, n, d, e, q, ldq, qstore, ldqs, rwork,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldqs, n, qsiz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: q(ldq,*)
+           complex(sp), intent(out) :: qstore(ldqs,*)
+        ! =====================================================================
+        ! warning:      n could be as big as qsiz!
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, ll, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, &
+                     subpbs, tlvls
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq < 0 .or. icompq > 2 ) then
+              ! info = -1
+           ! else if( ( icompq == 1 ) .and. ( qsiz < max( 0, n ) ) )
+          ! $        then
+           if( qsiz<max( 0, n ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( ldqs<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'CLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           ! set up workspaces for eigenvalues only/accumulate new vectors
+           ! routine
+           temp = log( real( n,KIND=sp) ) / log( two )
+           lgn = int( temp,KIND=ilp)
+           if( 2**lgn<n )lgn = lgn + 1
+           if( 2**lgn<n )lgn = lgn + 1
+           iprmpt = indxq + n + 1
+           iperm = iprmpt + n*lgn
+           iqptr = iperm + n*lgn
+           igivpt = iqptr + n + 2
+           igivcl = igivpt + n*lgn
+           igivnm = 1
+           iq = igivnm + 2*n*lgn
+           iwrem = iq + n**2 + 1
+           ! initialize pointers
+           do i = 0, subpbs
+              iwork( iprmpt+i ) = 1
+              iwork( igivpt+i ) = 1
+           end do
+           iwork( iqptr ) = 1
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              ll = iq - 1 + iwork( iqptr+curr )
+              call stdlib_ssteqr( 'I', matsiz, d( submat ), e( submat ),rwork( ll ), matsiz, &
+                        rwork, info )
+              call stdlib_clacrm( qsiz, matsiz, q( 1, submat ), ldq, rwork( ll ),matsiz, qstore( &
+                        1, submat ), ldqs,rwork( iwrem ) )
+              iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+              curr = curr + 1
+              if( info>0 ) then
+                 info = submat*( n+1 ) + submat + matsiz - 1
+                 return
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.  stdlib_claed7 handles the case
+           ! when the eigenvectors of a full or band hermitian matrix (which
+           ! was reduced to tridiagonal form) are desired.
+           ! i am free to use q as a valuable working space until loop 150.
+                 call stdlib_claed7( matsiz, msd2, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                 qstore( 1, submat ), ldqs,e( submat+msd2-1 ), iwork( indxq+submat ),rwork( iq ), &
+                 iwork( iqptr ), iwork( iprmpt ),iwork( iperm ), iwork( igivpt ),iwork( igivcl ), &
+                           rwork( igivnm ),q( 1, submat ), rwork( iwrem ),iwork( subpbs+1 ), info )
+                 if( info>0 ) then
+                    info = submat*( n+1 ) + submat + matsiz - 1
+                    return
+                 end if
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           do i = 1, n
+              j = iwork( indxq+i )
+              rwork( i ) = d( j )
+              call stdlib_ccopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+           end do
+           call stdlib_scopy( n, rwork, 1, d, 1 )
+           return
+     end subroutine stdlib_claed0
+
+     !> CLAMSWLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (CLASWLQ)
+
+     pure subroutine stdlib_clamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_cgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_ctpmlqt('L','C',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_ctpmlqt('L','C',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1),ldt, c(1,&
+                           1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_cgemlqt('L','C',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk  = mod((m-k),(nb-k))
+              ii  = m-kk+1
+              ctr = 1
+              call stdlib_cgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_ctpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1, ctr *k+1), ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_ctpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1, ctr*k+1), ldt, c(1,&
+                         1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_ctpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                   ctr = ctr - 1
+                   call stdlib_ctpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1,ctr*k+1), ldt,&
+                              c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_cgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_cgemlqt('R','C',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_ctpmlqt('R','C',m , nb-k, k, 0,mb, a(1,i), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_ctpmlqt('R','C',m , kk, k, 0,mb, a(1,ii), lda,t(1,ctr*k+1),ldt, c(1,1),&
+                          ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_clamswlq
+
+     !> CLAMTSQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (CLATSQR)
+
+     pure subroutine stdlib_clamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = m * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_cgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_ctpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1, ctr*k+1),ldt , c(&
+                          1,1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_ctpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr*k+1),ldt, c(1,&
+                          1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_cgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_cgemqrt('L','C',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_ctpmqrt('L','C',mb-k , n, k, 0,nb, a(i,1), lda,t(1, ctr*k+1),ldt, c(1,&
+                         1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_ctpmqrt('L','C',kk , n, k, 0,nb, a(ii,1), lda,t(1,ctr*k+1), ldt, c(1,1)&
+                         , ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_ctpmqrt('R','C',m , kk, k, 0, nb, a(ii,1), lda,t(1, ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_ctpmqrt('R','C',m , mb-k, k, 0,nb, a(i,1), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_cgemqrt('R','C',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_cgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_ctpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1,ctr*k+1),ldt, c(1,1)&
+                         , ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_ctpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1,ctr*k+1),ldt, c(1,1)&
+                         , ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_clamtsqr
+
+     !> CLAQR2: is identical to CLAQR3 except that it avoids
+     !> recursion by calling CLAHQR instead of CLAQR4.
+     !> Aggressive early deflation:
+     !> This subroutine accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_claqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0_sp
+           real(sp), parameter :: rone = 1.0_sp
+           
+           
+           ! Local Scalars 
+           complex(sp) :: beta, cdum, s, tau
+           real(sp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,int,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_cgehrd ====
+              call stdlib_cgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_cunmhr ====
+              call stdlib_cunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_clacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_ccopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_claset( 'A', jw, jw, czero, cone, v, ldv )
+           call stdlib_clahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                     infqr )
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_ctrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_ctrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_ctrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_ccopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_clarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_claset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_clarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_clarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_clarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_cgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_clacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_ccopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_cunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_cgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_clacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_cgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_clacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_cgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_clacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+     end subroutine stdlib_claqr2
+
+     !> CLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a complex M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_claswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<=0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_cgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_cgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_ctplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1,ctr*m+1),ldt, &
+                        work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_ctplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1,ctr*m+1), ldt,&
+                       work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_claswlq
+
+     !> CLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a complex M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_clatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_cgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_cgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_ctpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1,ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_ctpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1, ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_clatsqr
+
+     !> CPBSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_cpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h*u or a = l*l**h.
+           call stdlib_cpbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_cpbsv
+
+     !> CPBSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cpbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(inout) :: s(*)
+           complex(sp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_cpbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_claqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_ccopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_ccopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_cpbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanhb( '1', uplo, n, kd, ab, ldab, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cpbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, rwork,info )
+           ! compute the solution matrix x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cpbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_cpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cpbsvx
+
+     !> CPFTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_cpftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_cpotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'R', 'L', 'C', 'N', n2, n1, cone, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_cherk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_cpotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_cpotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'L', 'L', 'N', 'N', n1, n2, cone, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_cherk( 'U', 'C', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_cpotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_cpotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'L', 'U', 'C', 'N', n1, n2, cone, a( 0 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_cherk( 'L', 'C', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_cpotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_cpotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'R', 'U', 'N', 'N', n2, n1, cone, a( n2*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_cherk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_cpotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_cpotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'R', 'L', 'C', 'N', k, k, cone, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_cherk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_cpotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_cpotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'L', 'L', 'N', 'N', k, k, cone, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_cherk( 'U', 'C', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_cpotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_cpotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'L', 'U', 'C', 'N', k, k, cone, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_cherk( 'L', 'C', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_cpotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_cpotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_ctrsm( 'R', 'U', 'N', 'N', k, k, cone,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_cherk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_cpotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_cpftrf
+
+     !> CPFTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by CPFTRF.
+
+     pure subroutine stdlib_cpftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ctftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_clauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_cherk( 'L', 'C', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_ctrmm( 'L', 'U', 'N', 'N', n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_clauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_clauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_cherk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_ctrmm( 'R', 'U', 'C', 'N', n1, n2, cone, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_clauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_clauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_cherk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_ctrmm( 'R', 'L', 'N', 'N', n1, n2, cone, a( 1 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_clauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_clauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_cherk( 'U', 'C', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_ctrmm( 'L', 'L', 'C', 'N', n2, n1, cone, a( n1*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_clauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_clauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_cherk( 'L', 'C', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_ctrmm( 'L', 'U', 'N', 'N', k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_clauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_clauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_cherk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_ctrmm( 'R', 'U', 'C', 'N', k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_clauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_clauum( 'U', k, a( k ), k, info )
+                    call stdlib_cherk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_ctrmm( 'R', 'L', 'N', 'N', k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_clauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_clauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_cherk( 'U', 'C', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_ctrmm( 'L', 'L', 'C', 'N', k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_clauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_cpftri
+
+     !> CPOSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H* U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and  L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_cposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h*u or a = l*l**h.
+           call stdlib_cpotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_cposv
+
+     !> CPOSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(inout) :: s(*)
+           complex(sp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_cpoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_claqhe( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              call stdlib_clacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_cpotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanhe( '1', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cpocon( uplo, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cpotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_cporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cposvx
+
+     !> CPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_cptrfs( uplo, n, nrhs, d, e, df, ef, b, ldb, x, ldx,ferr, berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(in) :: d(*), df(*)
+           complex(sp), intent(in) :: b(ldb,*), e(*), ef(*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ix, j, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(sp) :: bi, cx, dx, ex, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,max,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=sp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_100: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( upper ) then
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = e( 1 )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = conjg( e( i-1 ) )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = e( i )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = conjg( e( n-1 ) )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              else
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = conjg( e( 1 ) )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = e( i-1 )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = conjg( e( i ) )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = e( n-1 )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_cpttrs( uplo, n, 1, df, ef, work, n, info )
+                 call stdlib_caxpy( n, cmplx( one,KIND=sp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              ix = stdlib_isamax( n, rwork, 1 )
+              ferr( j ) = rwork( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+              ! solve m(l) * x = e.
+              rwork( 1 ) = one
+              do i = 2, n
+                 rwork( i ) = one + rwork( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**h * x = b.
+              rwork( n ) = rwork( n ) / df( n )
+              do i = n - 1, 1, -1
+                 rwork( i ) = rwork( i ) / df( i ) +rwork( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_isamax( n, rwork, 1 )
+              ferr( j ) = ferr( j )*abs( rwork( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_100
+           return
+     end subroutine stdlib_cptrfs
+
+     !> CPTSV: computes the solution to a complex system of linear equations
+     !> A*X = B, where A is an N-by-N Hermitian positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**H, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_cptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*)
+           complex(sp), intent(inout) :: b(ldb,*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+           call stdlib_cpttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cpttrs( 'LOWER', n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_cptsv
+
+     !> CPTSVX: uses the factorization A = L*D*L**H to compute the solution
+     !> to a complex system of linear equations A*X = B, where A is an
+     !> N-by-N Hermitian positive definite tridiagonal matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_cptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(in) :: d(*)
+           real(sp), intent(inout) :: df(*)
+           complex(sp), intent(in) :: b(ldb,*), e(*)
+           complex(sp), intent(inout) :: ef(*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+              call stdlib_scopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_ccopy( n-1, e, 1, ef, 1 )
+              call stdlib_cpttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_clanht( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cptcon( n, df, ef, anorm, rcond, rwork, info )
+           ! compute the solution vectors x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cpttrs( 'LOWER', n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_cptrfs( 'LOWER', n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_cptsvx
+
+     !> CSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See SLAED3 for details.
+
+     pure subroutine stdlib_cstedc( compz, n, d, e, z, ldz, work, lwork, rwork,lrwork, iwork, &
+               liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: work(*)
+           complex(sp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, ll, lrwmin, lwmin, m, smlsiz,&
+                      start
+           real(sp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,mod,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'CSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else if( n<=smlsiz ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 2*( n - 1 )
+              else if( icompz==1 ) then
+                 lgn = int( log( real( n,KIND=sp) ) / log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 lwmin = n*n
+                 lrwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                 liwmin = 6 + 6*n + 5*n*lgn
+              else if( icompz==2 ) then
+                 lwmin = 1
+                 lrwmin = 1 + 4*n + 2*n**2
+                 liwmin = 3 + 5*n
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSTEDC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_ssterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_ssterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_ssterf( n, d, e, info )
+              go to 70
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_csteqr( compz, n, d, e, z, ldz, rwork, info )
+           else
+              ! if compz = 'i', we simply call stdlib_sstedc instead.
+              if( icompz==2 ) then
+                 call stdlib_slaset( 'FULL', n, n, zero, one, rwork, n )
+                 ll = n*n + 1
+                 call stdlib_sstedc( 'I', n, d, e, rwork, n,rwork( ll ), lrwork-ll+1, iwork, &
+                           liwork, info )
+                 do j = 1, n
+                    do i = 1, n
+                       z( i, j ) = rwork( ( j-1 )*n+i )
+                    end do
+                 end do
+                 go to 70
+              end if
+              ! from now on, only option left to be handled is compz = 'v',
+              ! i.e. icompz = 1.
+              ! scale.
+              orgnrm = stdlib_slanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 70
+              eps = stdlib_slamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              30 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 40 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 40
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_slanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_slascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_slascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    call stdlib_claed0( n, m, d( start ), e( start ), z( 1, start ),ldz, work, n, &
+                              rwork, iwork, info )
+                    if( info>0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 70
+                    end if
+                    ! scale back.
+                    call stdlib_slascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    call stdlib_ssteqr( 'I', m, d( start ), e( start ), rwork, m,rwork( m*m+1 ), &
+                              info )
+                    call stdlib_clacrm( n, m, z( 1, start ), ldz, rwork, m, work, n,rwork( m*m+1 )&
+                               )
+                    call stdlib_clacpy( 'A', n, m, work, n, z( 1, start ), ldz )
+                    if( info>0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 70
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 30
+              end if
+              ! endwhile
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                i = ii - 1
+                k = i
+                p = d( i )
+                do j = ii, n
+                   if( d( j )<p ) then
+                      k = j
+                      p = d( j )
+                   end if
+                end do
+                if( k/=i ) then
+                   d( k ) = d( i )
+                   d( i ) = p
+                   call stdlib_cswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                end if
+              end do
+           end if
+           70 continue
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_cstedc
+
+     !> CSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> CSTEGR is a compatibility wrapper around the improved CSTEMR routine.
+     !> See SSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : CSTEGR and CSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_cstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*)
+           complex(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_cstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_cstegr
+
+     !> CTGSEN: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A, B) (in terms of an unitary equivalence trans-
+     !> formation Q**H * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the pair (A,B). The leading
+     !> columns of Q and Z form unitary bases of the corresponding left and
+     !> right eigenspaces (deflating subspaces). (A, B) must be in
+     !> generalized Schur canonical form, that is, A and B are both upper
+     !> triangular.
+     !> CTGSEN also computes the generalized eigenvalues
+     !> w(j)= ALPHA(j) / BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, the routine computes estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_ctgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alpha, beta, q, &
+               ldq, z, ldz, m, pl, pr, dif,work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: dif(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, ks, liwmin, lwmin, mn2, n1, n2
+           real(sp) :: dscale, dsum, rdscal, safmin
+           complex(sp) :: temp1, temp2
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -13
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSEN', -info )
+              return
+           end if
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              if( k<n ) then
+                 if( select( k ) )m = m + 1
+              else
+                 if( select( n ) )m = m + 1
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 2*m*(n-m) )
+              liwmin = max( 1, n+2 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*m*(n-m) )
+              liwmin = max( 1, 2*m*(n-m), n+2 )
+           else
+              lwmin = 1
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -21
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -23
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_classq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_classq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 70
+           end if
+           ! get machine constant
+           safmin = stdlib_slamch( 'S' )
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           do k = 1, n
+              swap = select( k )
+              if( swap ) then
+                 ks = ks + 1
+                 ! swap the k-th block to position ks. compute unitary q
+                 ! and z that will swap adjacent diagonal blocks in (a, b).
+                 if( k/=ks )call stdlib_ctgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, k,&
+                            ks, ierr )
+                 if( ierr>0 ) then
+                    ! swap is rejected: exit.
+                    info = 1
+                    if( wantp ) then
+                       pl = zero
+                       pr = zero
+                    end if
+                    if( wantd ) then
+                       dif( 1 ) = zero
+                       dif( 2 ) = zero
+                    end if
+                    go to 70
+                 end if
+              end if
+           end do
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l:
+                         ! a11 * r - l * a22 = a12
+                         ! b11 * r - l * b22 = b12
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              call stdlib_clacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_clacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              ijb = 0
+              call stdlib_ctgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto
+              ! left and right eigenspaces
+              rdscal = zero
+              dsum = one
+              call stdlib_classq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_classq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu estimate.
+                 call stdlib_ctgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl estimate.
+                 call stdlib_ctgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_clacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_clacn2( mn2, work( mn2+1 ), work, dif( 1 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_ctgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_ctgsyl( 'C', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_clacn2( mn2, work( mn2+1 ), work, dif( 2 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_ctgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_ctgsyl( 'C', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           ! if b(k,k) is complex, make it real and positive (normalization
+           ! of the generalized schur form) and store the generalized
+           ! eigenvalues of reordered pair (a, b)
+           do k = 1, n
+              dscale = abs( b( k, k ) )
+              if( dscale>safmin ) then
+                 temp1 = conjg( b( k, k ) / dscale )
+                 temp2 = b( k, k ) / dscale
+                 b( k, k ) = dscale
+                 call stdlib_cscal( n-k, temp1, b( k, k+1 ), ldb )
+                 call stdlib_cscal( n-k+1, temp1, a( k, k ), lda )
+                 if( wantq )call stdlib_cscal( n, temp2, q( 1, k ), 1 )
+              else
+                 b( k, k ) = cmplx( zero, zero,KIND=sp)
+              end if
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+           end do
+           70 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_ctgsen
+
+     !> CTGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B).
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+
+     pure subroutine stdlib_ctgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: dif(*), s(*)
+           complex(sp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, k, ks, lwmin, n1, n2
+           real(sp) :: bignum, cond, eps, lnrm, rnrm, scale, smlnum
+           complex(sp) :: yhax, yhbx
+           ! Local Arrays 
+           complex(sp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 do k = 1, n
+                    if( select( k ) )m = m + 1
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*n
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ks = 0
+           loop_20: do k = 1, n
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_20
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 rnrm = stdlib_scnrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_scnrm2( n, vl( 1, ks ), 1 )
+                 call stdlib_cgemv( 'N', n, n, cmplx( one, zero,KIND=sp), a, lda,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=sp), work, 1 )
+                 yhax = stdlib_cdotc( n, work, 1, vl( 1, ks ), 1 )
+                 call stdlib_cgemv( 'N', n, n, cmplx( one, zero,KIND=sp), b, ldb,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=sp), work, 1 )
+                 yhbx = stdlib_cdotc( n, work, 1, vl( 1, ks ), 1 )
+                 cond = stdlib_slapy2( abs( yhax ), abs( yhbx ) )
+                 if( cond==zero ) then
+                    s( ks ) = -one
+                 else
+                    s( ks ) = cond / ( rnrm*lnrm )
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_slapy2( abs( a( 1, 1 ) ), abs( b( 1, 1 ) ) )
+                 else
+                    ! estimate the reciprocal condition number of the k-th
+                    ! eigenvectors.
+                    ! copy the matrix (a, b) to the array work and move the
+                    ! (k,k)th pair to the (1,1) position.
+                    call stdlib_clacpy( 'FULL', n, n, a, lda, work, n )
+                    call stdlib_clacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                    ifst = k
+                    ilst = 1
+                    call stdlib_ctgexc( .false., .false., n, work, n, work( n*n+1 ),n, dummy, 1, &
+                              dummy1, 1, ifst, ilst, ierr )
+                    if( ierr>0 ) then
+                       ! ill-conditioned problem - swap rejected.
+                       dif( ks ) = zero
+                    else
+                       ! reordering successful, solve generalized sylvester
+                       ! equation for r and l,
+                                  ! a22 * r - l * a11 = a12
+                                  ! b22 * r - l * b11 = b12,
+                       ! and compute estimate of difl[(a11,b11), (a22, b22)].
+                       n1 = 1
+                       n2 = n - n1
+                       i = n*n + 1
+                       call stdlib_ctgsyl( 'N', idifjb, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ), dummy,1, iwork, ierr )
+                    end if
+                 end if
+              end if
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ctgsna
+
+     !> CTRSEN: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that a selected cluster of eigenvalues appears in
+     !> the leading positions on the diagonal of the upper triangular matrix
+     !> T, and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+
+     subroutine stdlib_ctrsen( job, compq, select, n, t, ldt, q, ldq, w, m, s,sep, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, lwork, n
+           real(sp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           complex(sp), intent(inout) :: q(ldq,*), t(ldt,*)
+           complex(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, ks, lwmin, n1, n2, nn
+           real(sp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(sp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           ! set m to the number of selected eigenvalues.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           n1 = m
+           n2 = n - m
+           nn = n1*n2
+           info = 0
+           lquery = ( lwork==-1 )
+           if( wantsp ) then
+              lwmin = max( 1, 2*nn )
+           else if( stdlib_lsame( job, 'N' ) ) then
+              lwmin = 1
+           else if( stdlib_lsame( job, 'E' ) ) then
+              lwmin = max( 1, nn )
+           end if
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -14
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CTRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_clange( '1', n, n, t, ldt, rwork )
+              go to 40
+           end if
+           ! collect the selected eigenvalues at the top left corner of t.
+           ks = 0
+           do k = 1, n
+              if( select( k ) ) then
+                 ks = ks + 1
+                 ! swap the k-th eigenvalue to position ks.
+                 if( k/=ks )call stdlib_ctrexc( compq, n, t, ldt, q, ldq, k, ks, ierr )
+              end if
+           end do
+           if( wants ) then
+              ! solve the sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_clacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_ctrsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_clange( 'F', n1, n2, work, n1, rwork )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_clacn2( nn, work( nn+1 ), work, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve t11*r - r*t22 = scale*x.
+                    call stdlib_ctrsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**h*r - r*t22**h = scale*x.
+                    call stdlib_ctrsyl( 'C', 'C', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! copy reordered eigenvalues to w.
+           do k = 1, n
+              w( k ) = t( k, k )
+           end do
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ctrsen
+
+     !> CUNBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines CUNBDB2, CUNBDB3, and CUNBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( real( x21(i,i),KIND=sp), real( x11(i,i),KIND=sp) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = cone
+              x21(i,i) = cone
+              call stdlib_clarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+              call stdlib_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+              if( i < q ) then
+                 call stdlib_csrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
+                 call stdlib_clacgv( q-i, x21(i,i+1), ldx21 )
+                 call stdlib_clarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = real( x21(i,i+1),KIND=sp)
+                 x21(i,i+1) = cone
+                 call stdlib_clarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_clarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 call stdlib_clacgv( q-i, x21(i,i+1), ldx21 )
+                 c = sqrt( stdlib_scnrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_scnrm2( m-p-i, x21(i+&
+                           1,i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_cunbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_cunbdb1
+
+     !> CUNBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines CUNBDB1, CUNBDB3, and CUNBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_csrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
+              end if
+              call stdlib_clacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_clarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = real( x11(i,i),KIND=sp)
+              x11(i,i) = cone
+              call stdlib_clarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_clarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_clacgv( q-i+1, x11(i,i), ldx11 )
+              s = sqrt( stdlib_scnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_scnrm2( m-p-i+1, x21(i,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_cunbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_cscal( p-i, cnegone, x11(i+1,i), 1 )
+              call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_clarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( real( x11(i+1,i),KIND=sp), real( x21(i,i),KIND=sp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = cone
+                 call stdlib_clarf( 'L', p-i, q-i, x11(i+1,i), 1, conjg(taup1(i)),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+              end if
+              x21(i,i) = cone
+              call stdlib_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_clarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = cone
+              call stdlib_clarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_cunbdb2
+
+     !> CUNBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines CUNBDB1, CUNBDB2, and CUNBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           complex(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_csrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
+              end if
+              call stdlib_clacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_clarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = real( x21(i,i),KIND=sp)
+              x21(i,i) = cone
+              call stdlib_clarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_clarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_clacgv( q-i+1, x21(i,i), ldx21 )
+              c = sqrt( stdlib_scnrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_scnrm2( m-p-i, x21(i+1,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_cunbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_clarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( real( x21(i+1,i),KIND=sp), real( x11(i,i),KIND=sp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = cone
+                 call stdlib_clarf( 'L', m-p-i, q-i, x21(i+1,i), 1, conjg(taup2(i)),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+              end if
+              x11(i,i) = cone
+              call stdlib_clarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_clarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = cone
+              call stdlib_clarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_cunbdb3
+
+     !> CUNBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines CUNBDB1, CUNBDB2, and CUNBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           complex(sp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = czero
+                 end do
+                 call stdlib_cunbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_cscal( p, cnegone, phantom(1), 1 )
+                 call stdlib_clarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_clarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( real( phantom(1),KIND=sp), real( phantom(p+1),KIND=sp) )
+                           
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = cone
+                 phantom(p+1) = cone
+                 call stdlib_clarf( 'L', p, q, phantom(1), 1, conjg(taup1(1)), x11,ldx11, work(&
+                           ilarf) )
+                 call stdlib_clarf( 'L', m-p, q, phantom(p+1), 1, conjg(taup2(1)),x21, ldx21, &
+                           work(ilarf) )
+              else
+                 call stdlib_cunbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_cscal( p-i+1, cnegone, x11(i,i-1), 1 )
+                 call stdlib_clarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_clarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( real( x11(i,i-1),KIND=sp), real( x21(i,i-1),KIND=sp) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = cone
+                 x21(i,i-1) = cone
+                 call stdlib_clarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1,conjg(taup1(i)), x11(i,i), &
+                           ldx11, work(ilarf) )
+                 call stdlib_clarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1,conjg(taup2(i)), x21(i,i), &
+                           ldx21, work(ilarf) )
+              end if
+              call stdlib_csrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_clacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_clarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = real( x21(i,i),KIND=sp)
+              x21(i,i) = cone
+              call stdlib_clarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_clarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_clacgv( q-i+1, x21(i,i), ldx21 )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_scnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_scnrm2( m-p-i, x21(i+1,&
+                           i), 1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_clacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_clarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = cone
+              call stdlib_clarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_clarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_clacgv( q-i+1, x11(i,i), ldx11 )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_clacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+              call stdlib_clarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = cone
+              call stdlib_clarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+              call stdlib_clacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+           end do
+           return
+     end subroutine stdlib_cunbdb4
+
+     !> CUNCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_cuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           integer(ilp), intent(in) :: lrwork
+           integer(ilp) :: lrworkmin, lrworkopt
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           real(sp), intent(out) :: theta(*)
+           complex(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           complex(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           complex(sp) :: cdum(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = ( lwork==-1 ) .or. ( lrwork==-1 )
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-----------------------------------------|
+           ! | lworkopt (1)                            |
+           ! |-----------------------------------------|
+           ! | taup1 (max(1,p))                        |
+           ! | taup2 (max(1,m-p))                      |
+           ! | tauq1 (max(1,q))                        |
+           ! |-----------------------------------------|
+           ! | stdlib_cunbdb work | stdlib_cungqr work | stdlib_cunglq work |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |-----------------------------------------|
+             ! rwork layout:
+           ! |------------------|
+           ! | lrworkopt (1)    |
+           ! |------------------|
+           ! | phi (max(1,r-1)) |
+           ! |------------------|
+           ! | b11d (r)         |
+           ! | b11e (r-1)       |
+           ! | b12d (r)         |
+           ! | b12e (r-1)       |
+           ! | b21d (r)         |
+           ! | b21e (r-1)       |
+           ! | b22d (r)         |
+           ! | b22e (r-1)       |
+           ! | stdlib_cbbcsd rwork     |
+           ! |------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum, cdum, cdum, &
+                           cdum, work, -1,childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_cungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_cungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_cunglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_cbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum(1), u1, &
+                 ldu1, u2, ldu2, v1t, ldv1t, cdum,1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_cungqr( p-1, p-1, p-1, u1(2,2), ldu1, cdum, work(1),-1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_cungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_cunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_cbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
+                 ldv1t, cdum, 1, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_cungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_cungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2, cdum,work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_cunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_cbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
+                  1, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                            1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else
+                 call stdlib_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, cdum, work(1), -1, childinfo)
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_cungqr( p, p, m-q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_cungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_cunglq( q, q, q, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_cbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
+                 ldu2, u1, ldu1, cdum, 1, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              end if
+              lrworkmin = ibbcsd+lbbcsd-1
+              lrworkopt = lrworkmin
+              rwork(1) = lrworkopt
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+              if( lrwork < lrworkmin .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'CUNCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_cunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_clacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_cunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_cbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, cdum,1, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
+                        ibbcsd+1, childinfo )
+              ! permute rows and columns to place czero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_cunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = cone
+                 do j = 2, p
+                    u1(1,j) = czero
+                    u1(j,1) = czero
+                 end do
+                 call stdlib_clacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_cungqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_clacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_cungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_clacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_cunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_cbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
+               ldv1t, cdum, 1, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
+                         lbbcsd,childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_clapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_cunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_clacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_cungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = cone
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                    u2(j,1) = czero
+                 end do
+                 call stdlib_clacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_cungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_clacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_cunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_cbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
+              cdum, 1, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_clapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_clapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_cunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
+                        childinfo )
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_ccopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_ccopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = czero
+                 end do
+                 call stdlib_clacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_cungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                 end do
+                 call stdlib_clacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_cungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_clacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_clacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_clacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_cunglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_cbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
+              u2, ldu2, u1, ldu1, cdum, 1,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_clapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_clapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_cuncsd2by1
+
+     !> CUNGBR: generates one of the complex unitary matrices Q or P**H
+     !> determined by CGEBRD when reducing a complex matrix A to bidiagonal
+     !> form: A = Q * B * P**H.  Q and P**H are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and CUNGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and CUNGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+     !> is of order N:
+     !> if k < n, P**H = G(k) . . . G(2) G(1) and CUNGBR returns the first m
+     !> rows of P**H, where n >= m >= k;
+     !> if k >= n, P**H = G(n-1) . . . G(2) G(1) and CUNGBR returns P**H as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_cungbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_cungqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_cungqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_cunglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_cunglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = real( work( 1 ),KIND=sp)
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_cgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_cungqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = czero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = cone
+                 do i = 2, m
+                    a( i, 1 ) = czero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_cungqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**h, determined by a call to stdlib_cgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_cunglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! row downward, and set the first row and column of p**h to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = cone
+                 do i = 2, n
+                    a( i, 1 ) = czero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = czero
+                 end do
+                 if( n>1 ) then
+                    ! form p**h(2:n,2:n)
+                    call stdlib_cunglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cungbr
+
+     !> CUNGTSQR: generates an M-by-N complex matrix Q_out with orthonormal
+     !> columns, which are the first N columns of a product of comlpex unitary
+     !> matrices of order M which are returned by CLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for CLATSQR.
+
+     pure subroutine stdlib_cungtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(in) :: t(ldt,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_clamtsqr. see the documentation for stdlib_clamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_clamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_clamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_clamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_claset( 'F', m, n, czero, cone, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_clamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_ccopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_cungtsqr
+
+     !> If VECT = 'Q', CUNMBR: overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> If VECT = 'P', CUNMBR overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'C':      P**H * C       C * P**H
+     !> Here Q and P**H are the unitary matrices determined by CGEBRD when
+     !> reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+     !> and P**H are defined as products of elementary reflectors H(i) and
+     !> G(i) respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the unitary matrix Q or P**H that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_cunmbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(sp), intent(in) :: tau(*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( m>0 .and. n>0 ) then
+                 if( applyq ) then
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'CUNMQR', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 else
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'CUNMLQ', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'CUNMLQ', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 end if
+                 lwkopt = nw*nb
+              else
+                 lwkopt = 1
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CUNMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_cgebrd with nq >= k
+                 call stdlib_cunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_cgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_cunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_cgebrd with nq > k
+                 call stdlib_cunmlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_cgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_cunmlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cunmbr
+
+     !> CGELQ: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_cgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'CGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'CGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+               lminws = .true.
+               mb = 1
+               nb = n
+             end if
+             if( lwork<lwopt ) then
+               lminws = .true.
+               mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_cgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_claswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_cgelq
+
+     !> CGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_cgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), s(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, lrwork, &
+                     maxmn, maxwrk, minmn, minwrk, mm, mnthr, nlvl, nrwork, nwork, smlsiz
+           real(sp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: int,log,max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              liwork = 1
+              lrwork = 1
+              if( minmn>0 ) then
+                 smlsiz = stdlib_ilaenv( 9, 'CGELSD', ' ', 0, 0, 0, 0 )
+                 mnthr = stdlib_ilaenv( 6, 'CGELSD', ' ', m, n, nrhs, -1 )
+                 nlvl = max( int( log( real( minmn,KIND=sp) / real( smlsiz + 1,KIND=sp) ) /log( &
+                           two ),KIND=ilp) + 1, 0 )
+                 liwork = 3*minmn*nlvl + 11*minmn
+                 mm = m
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns.
+                    mm = n
+                    maxwrk = max( maxwrk, n*stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n,-1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, nrhs*stdlib_ilaenv( 1, 'CUNMQR', 'LC', m,nrhs, n, -1 ) )
+                              
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined.
+                    lrwork = 10*n + 2*n*smlsiz + 8*n*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    maxwrk = max( maxwrk, 2*n + ( mm + n )*stdlib_ilaenv( 1,'CGEBRD', ' ', mm, n, &
+                              -1, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + nrhs*stdlib_ilaenv( 1, 'CUNMBR','QLC', mm, nrhs, &
+                              n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'CUNMBR', 'PLN', n, &
+                              nrhs, n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + n*nrhs )
+                    minwrk = max( 2*n + mm, 2*n + n*nrhs )
+                 end if
+                 if( n>m ) then
+                    lrwork = 10*m + 2*m*smlsiz + 8*m*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                                 ! than rows.
+                       maxwrk = m + m*stdlib_ilaenv( 1, 'CGELQF', ' ', m, n, -1,-1 )
+                       maxwrk = max( maxwrk, m*m + 4*m + 2*m*stdlib_ilaenv( 1,'CGEBRD', ' ', m, m,&
+                                  -1, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + nrhs*stdlib_ilaenv( 1,'CUNMBR', 'QLC', m,&
+                                  nrhs, m, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + ( m - 1 )*stdlib_ilaenv( 1,'CUNMLQ', &
+                                 'LC', n, nrhs, m, -1 ) )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m*m + 4*m + m*nrhs )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                       maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                    else
+                       ! path 2 - underdetermined.
+                       maxwrk = 2*m + ( n + m )*stdlib_ilaenv( 1, 'CGEBRD', ' ', m,n, -1, -1 )
+                                 
+                       maxwrk = max( maxwrk, 2*m + nrhs*stdlib_ilaenv( 1, 'CUNMBR','QLC', m, nrhs,&
+                                  m, -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*stdlib_ilaenv( 1, 'CUNMBR','PLN', n, nrhs, m,&
+                                  -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*nrhs )
+                    end if
+                    minwrk = max( 2*m + n, 2*m + m*nrhs )
+                 end if
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              rwork( 1 ) = lrwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_slamch( 'P' )
+           sfmin = stdlib_slamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_clange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_claset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_slaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_clange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_clascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_clascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure b(m+1:n,:) = 0
+           if( m<n )call stdlib_claset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need n, prefer n*nb)
+                 call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need nrhs, prefer nrhs*nb)
+                 call stdlib_cunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_claset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                 end if
+              end if
+              itauq = 1
+              itaup = itauq + n
+              nwork = itaup + n
+              ie = 1
+              nrwork = ie + n
+              ! bidiagonalize r in a.
+              ! (rworkspace: need n)
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              call stdlib_cgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              call stdlib_cunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_clalsd( 'U', smlsiz, n, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_cunmbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              call stdlib_cgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_clacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_claset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize l in work(il).
+              ! (rworkspace: need m)
+              ! (cworkspace: need m*m+4*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_cgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (cworkspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_clalsd( 'U', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_cunmbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_claset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (cworkspace: need nrhs, prefer nrhs*nb)
+              call stdlib_cunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              itauq = 1
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize a.
+              ! (rworkspace: need m)
+              ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+              call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_clalsd( 'L', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_cunmbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           rwork( 1 ) = lrwork
+           return
+     end subroutine stdlib_cgelsd
+
+     !> CGELSS: computes the minimum norm solution to a complex linear
+     !> least squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_cgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), s(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bl, chunk, i, iascl, ibscl, ie, il, irwork, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_cgeqrf, lwork_cunmqr, lwork_cgebrd, lwork_cunmbr, lwork_cungbr, &
+                     lwork_cunmlq, lwork_cgelqf
+           real(sp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           complex(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace refers
+             ! to real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'CGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_cgeqrf
+                    call stdlib_cgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_cgeqrf = real( dum(1),KIND=sp)
+                    ! compute space needed for stdlib_cunmqr
+                    call stdlib_cunmqr( 'L', 'C', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_cunmqr = real( dum(1),KIND=sp)
+                    mm = n
+                    maxwrk = max( maxwrk, n + n*stdlib_ilaenv( 1, 'CGEQRF', ' ', m,n, -1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, n + nrhs*stdlib_ilaenv( 1, 'CUNMQR', 'LC',m, nrhs, n, -&
+                              1 ) )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute space needed for stdlib_cgebrd
+                    call stdlib_cgebrd( mm, n, a, lda, s, s, dum(1), dum(1), dum(1),-1, info )
+                              
+                    lwork_cgebrd = real( dum(1),KIND=sp)
+                    ! compute space needed for stdlib_cunmbr
+                    call stdlib_cunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_cunmbr = real( dum(1),KIND=sp)
+                    ! compute space needed for stdlib_cungbr
+                    call stdlib_cungbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_cungbr = real( dum(1),KIND=sp)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 2*n + lwork_cgebrd )
+                    maxwrk = max( maxwrk, 2*n + lwork_cunmbr )
+                    maxwrk = max( maxwrk, 2*n + lwork_cungbr )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = 2*n + max( nrhs, m )
+                 end if
+                 if( n>m ) then
+                    minwrk = 2*m + max( nrhs, n )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_cgelqf
+                       call stdlib_cgelqf( m, n, a, lda, dum(1), dum(1),-1, info )
+                       lwork_cgelqf = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cgebrd
+                       call stdlib_cgebrd( m, m, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_cgebrd = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cunmbr
+                       call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_cunmbr = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cungbr
+                       call stdlib_cungbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_cungbr = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cunmlq
+                       call stdlib_cunmlq( 'L', 'C', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_cunmlq = real( dum(1),KIND=sp)
+                       ! compute total workspace needed
+                       maxwrk = m + lwork_cgelqf
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_cgebrd )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_cunmbr )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_cungbr )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_cunmlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_cgebrd
+                       call stdlib_cgebrd( m, n, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_cgebrd = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cunmbr
+                       call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_cunmbr = real( dum(1),KIND=sp)
+                       ! compute space needed for stdlib_cungbr
+                       call stdlib_cungbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_cungbr = real( dum(1),KIND=sp)
+                       maxwrk = 2*m + lwork_cgebrd
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr )
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_slamch( 'P' )
+           sfmin = stdlib_slamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_claset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_slaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (cworkspace: need 2*n, prefer n+n*nb)
+                 ! (rworkspace: none)
+                 call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (cworkspace: need n+nrhs, prefer n+nrhs*nb)
+                 ! (rworkspace: none)
+                 call stdlib_cunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_claset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+              end if
+              ie = 1
+              itauq = 1
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_cgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_cunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_cbdsqr( 'U', n, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_csrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_claset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_cgemm( 'C', 'N', n, nrhs, n, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_clacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_cgemm( 'C', 'N', n, bl, n, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_clacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_cgemv( 'C', n, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_ccopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=3*m+m*m+max( m, nrhs, n-2*m ) )then
+              ! underdetermined case, m much less than n
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=3*m+m*lda+max( m, nrhs, n-2*m ) )ldwork = lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_cgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_clacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_claset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              ie = 1
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (cworkspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+              ! (rworkspace: need m)
+              call stdlib_cgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (cworkspace: need m*m+3*m+nrhs, prefer m*m+3*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (cworkspace: need m*m+4*m-1, prefer m*m+3*m+(m-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cungbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration, computing right singular
+              ! vectors of l in work(il) and multiplying b by transpose of
+              ! left singular vectors
+              ! (cworkspace: need m*m)
+              ! (rworkspace: need bdspac)
+              call stdlib_cbdsqr( 'U', m, m, 0, nrhs, s, rwork( ie ), work( il ),ldwork, a, lda, &
+                        b, ldb, rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_csrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_claset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = il + m*ldwork
+              ! multiply b by right singular vectors of l in work(il)
+              ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_cgemm( 'C', 'N', m, nrhs, m, cone, work( il ), ldwork,b, ldb, czero, &
+                           work( iwork ), ldb )
+                 call stdlib_clacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_cgemm( 'C', 'N', m, bl, m, cone, work( il ), ldwork,b( 1, i ), &
+                              ldb, czero, work( iwork ), m )
+                    call stdlib_clacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_cgemv( 'C', m, m, cone, work( il ), ldwork, b( 1, 1 ),1, czero, work(&
+                            iwork ), 1 )
+                 call stdlib_ccopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_claset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (cworkspace: need m+nrhs, prefer m+nhrs*nb)
+              ! (rworkspace: none)
+              call stdlib_cunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = 1
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (cworkspace: need 3*m, prefer 2*m+(m+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_cunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_cungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_cbdsqr( 'L', m, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_csrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_claset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_cgemm( 'C', 'N', n, nrhs, m, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_clacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_cgemm( 'C', 'N', n, bl, m, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_clacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_cgemv( 'C', m, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_ccopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgelss
+
+     !> CGELSY: computes the minimum-norm solution to a complex linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by unitary transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**H [ inv(T11)*Q1**H*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+
+     subroutine stdlib_cgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkopt, mn, nb, nb1, nb2, nb3, &
+                     nb4
+           real(sp) :: anrm, bignum, bnrm, smax, smaxpr, smin, sminpr, smlnum, wsize
+           complex(sp) :: c1, c2, s1, s2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,cmplx
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'CGEQRF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'CGERQF', ' ', m, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'CUNMQR', ' ', m, n, nrhs, -1 )
+           nb4 = stdlib_ilaenv( 1, 'CUNMRQ', ' ', m, n, nrhs, -1 )
+           nb = max( nb1, nb2, nb3, nb4 )
+           lwkopt = max( 1, mn+2*n+nb*(n+1), 2*mn+nb*nrhs )
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           else if( lwork<( mn+max( 2*mn, n+1, mn+nrhs ) ) .and..not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_claset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_clange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_cgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, rwork, info )
+                     
+           wsize = mn + real( work( mn+1 ),KIND=sp)
+           ! complex workspace: mn+nb*(n+1). real workspace 2*n.
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = cone
+           work( ismax ) = cone
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_claset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_claic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_claic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! complex workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_ctzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! complex workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+           call stdlib_cunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, mn, a, lda,work( 1 ), b, &
+                     ldb, work( 2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+real( work( 2*mn+1 ),KIND=sp) )
+           ! complex workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_ctrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, cone, a, &
+                     lda, b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = czero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**h * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_cunmrz( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, rank,n-rank, a, lda, &
+                        work( mn+1 ), b, ldb,work( 2*mn+1 ), lwork-2*mn, info )
+           end if
+           ! complex workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_ccopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! complex workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_clascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_clascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_clascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_clascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cgelsy
+
+     !> CGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide
+     !> LQ factorization (CGELQ)
+
+     pure subroutine stdlib_cgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), t(*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = real( lw,KIND=sp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_cgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_clamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = real( lw,KIND=sp)
+           return
+     end subroutine stdlib_cgemlq
+
+     !> CGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (CGEQR)
+
+     pure subroutine stdlib_cgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(sp), intent(in) :: a(lda,*), t(*)
+           complex(sp), intent(inout) :: c(ldc,*)
+           complex(sp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_cgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_clamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_cgemqr
+
+     !> CGEQR: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_cgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'CGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'CGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_cgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_clatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_cgeqr
+
+     !> CGESDD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors, by using divide-and-conquer method. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**H, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_cgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), s(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: blk, chunk, i, ie, ierr, il, ir, iru, irvt, iscl, itau, itaup, itauq, &
+           iu, ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr1, mnthr2, nrwork,&
+                      nwork, wrkbl
+           integer(ilp) :: lwork_cgebrd_mn, lwork_cgebrd_mm, lwork_cgebrd_nn, lwork_cgelqf_mn, &
+           lwork_cgeqrf_mn, lwork_cungbr_p_mn, lwork_cungbr_p_nn, lwork_cungbr_q_mn, &
+           lwork_cungbr_q_mm, lwork_cunglq_mn, lwork_cunglq_nn, lwork_cungqr_mm, lwork_cungqr_mn, &
+           lwork_cunmbr_prc_mm, lwork_cunmbr_qln_mm, lwork_cunmbr_prc_mn, lwork_cunmbr_qln_mn, &
+                     lwork_cunmbr_prc_nn, lwork_cunmbr_qln_nn
+           real(sp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dum(1)
+           complex(sp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           mnthr1 = int( minmn*17.0_sp / 9.0_sp,KIND=ilp)
+           mnthr2 = int( minmn*5.0_sp / 3.0_sp,KIND=ilp)
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           minwrk = 1
+           maxwrk = 1
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_sbdsdc,KIND=sp) is
+                 ! bdspac = 3*n*n + 4*n for singular values and vectors;
+                 ! bdspac = 4*n         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_cgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_cgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cgebrd( n, n, cdum(1), n, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_cgebrd_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cgeqrf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_cgeqrf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'P', n, n, n, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'Q', m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_q_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungqr( m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungqr_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cungqr( m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungqr_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'Q', 'L', 'N', m, m, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'Q', 'L', 'N', m, n, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_qln_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'Q', 'L', 'N', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_qln_nn = int( cdum(1),KIND=ilp)
+                 if( m>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       maxwrk = n + lwork_cgeqrf_mn
+                       maxwrk = max( maxwrk, 2*n + lwork_cgebrd_nn )
+                       minwrk = 3*n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_cgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_cungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_prc_nn )
+                       maxwrk = m*n + n*n + wrkbl
+                       minwrk = 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_cgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_cungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_cgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_cungqr_mm )
+                       wrkbl = max( wrkbl, 2*n + lwork_cgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_cunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + max( 3*n, n + m )
+                    end if
+                 else if( m>=mnthr2 ) then
+                    ! path 5 (m >> n, but not as much as mnthr1)
+                    maxwrk = 2*n + lwork_cgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 5o (m >> n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_q_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 5s (m >> n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_q_mn )
+                    else if( wntqa ) then
+                       ! path 5a (m >> n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_cungbr_q_mm )
+                    end if
+                 else
+                    ! path 6 (m >= n, but not much larger)
+                    maxwrk = 2*n + lwork_cgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 6o (m >= n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_prc_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_qln_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 6s (m >= n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_qln_mn )
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_prc_nn )
+                    else if( wntqa ) then
+                       ! path 6a (m >= n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*n + lwork_cunmbr_prc_nn )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_sbdsdc,KIND=sp) is
+                 ! bdspac = 3*m*m + 4*m for singular values and vectors;
+                 ! bdspac = 4*m         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_cgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_cgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cgebrd( m, m, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_cgebrd_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cgelqf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_cgelqf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'P', m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_p_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'P', n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cunglq( m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_cunglq_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunglq( n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_cunglq_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'P', 'R', 'C', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_prc_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'P', 'R', 'C', m, n, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_prc_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'P', 'R', 'C', n, n, m, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_cunmbr( 'Q', 'L', 'N', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_cunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       maxwrk = m + lwork_cgelqf_mn
+                       maxwrk = max( maxwrk, 2*m + lwork_cgebrd_mm )
+                       minwrk = 3*m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_cgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_cunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_cgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_prc_mm )
+                       maxwrk = m*n + m*m + wrkbl
+                       minwrk = 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_cgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_cunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_cgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_cgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_cunglq_nn )
+                       wrkbl = max( wrkbl, 2*m + lwork_cgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_cunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + max( 3*m, m + n )
+                    end if
+                 else if( n>=mnthr2 ) then
+                    ! path 5t (n >> m, but not as much as mnthr1)
+                    maxwrk = 2*m + lwork_cgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 5to (n >> m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_p_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 5ts (n >> m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_p_mn )
+                    else if( wntqa ) then
+                       ! path 5ta (n >> m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cungbr_p_nn )
+                    end if
+                 else
+                    ! path 6t (n > m, but not much larger)
+                    maxwrk = 2*m + lwork_cgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 6to (n > m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_prc_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 6ts (n > m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_prc_mn )
+                    else if( wntqa ) then
+                       ! path 6ta (n > m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_cunmbr_prc_nn )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+           end if
+           if( info==0 ) then
+              work( 1 ) = stdlib_sroundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not. lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = sqrt( stdlib_slamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, dum )
+           if( stdlib_sisnan ( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n [tau] + n    [work]
+                    ! cworkspace: prefer n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out below r
+                    call stdlib_claset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + n
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz='o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    ir = iu + ldwrku*n
+                    if( lwork >= m*n + n*n + 3*n ) then
+                       ! work(ir) is m by n
+                       ldwrkr = m
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work( ir ), zeroing out below it
+                    call stdlib_clacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_claset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_cgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of r in work(iru) and computing right singular vectors
+                    ! of r in work(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by the right singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! cworkspace: need   n*n [u] + n*n [r]
+                    ! cworkspace: prefer n*n [u] + m*n [r]
+                    ! rworkspace: need   0
+                    do i = 1, m, ldwrkr
+                       chunk = min( m-i+1, ldwrkr )
+                       call stdlib_cgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( iu ), &
+                                 ldwrku, czero,work( ir ), ldwrkr )
+                       call stdlib_clacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_clacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_claset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_cgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! cworkspace: need   n*n [r]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_cgemm( 'N', 'N', m, n, n, cone, a, lda, work( ir ),ldwrkr, czero, &
+                              u, ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! cworkspace: need   n*n [u] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! cworkspace: need   n*n [u] + n [tau] + m    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ! produce r in a, zeroing out below it
+                    call stdlib_claset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix work(iu)
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! cworkspace: need   n*n [u]
+                    ! rworkspace: need   0
+                    call stdlib_cgemm( 'N', 'N', m, n, n, cone, u, ldu, work( iu ),ldwrku, czero, &
+                              a, lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else if( m>=mnthr2 ) then
+                 ! mnthr2 <= m < mnthr1
+                 ! path 5 (m >> n, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_cungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >> n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', n, s, rwork( ie ), dum, 1,dum,1,dum, idum, &
+                              rwork( nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! path 5o (m >> n, jobz='o')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! generate q in a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work(iu) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=sp) by p**h in vt,
+                    ! storing the result in work(iu), copying to vt
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_clarcm( n, n, rwork( irvt ), n, vt, ldvt,work( iu ), ldwrku, &
+                              rwork( nrwork ) )
+                    call stdlib_clacpy( 'F', n, n, work( iu ), ldwrku, vt, ldvt )
+                    ! multiply q in a by realmatrix rwork(iru,KIND=sp), storing the
+                    ! result in work(iu), copying to a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                    ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    do i = 1, m, ldwrku
+                       chunk = min( m-i+1, ldwrku )
+                       call stdlib_clacrm( chunk, n, a( i, 1 ), lda, rwork( iru ),n, work( iu ), &
+                                 ldwrku, rwork( nrwork ) )
+                       call stdlib_clacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5s (m >> n, jobz='s')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, n, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=sp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_clarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=sp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_clacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                 else
+                    ! path 5a (m >> n, jobz='a')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=sp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_clarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=sp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_clacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr2
+                 ! path 6 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ! use stdlib_cunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6n (m >= n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work( iu ) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! path 6o (m >= n, jobz='o')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! path 6o-fast
+                       ! copy realmatrix rwork(iru,KIND=sp) to complex matrix work(iu)
+                       ! overwrite work(iu) by left singular vectors of a, copying
+                       ! to a
+                       ! cworkspace: need   2*n [tauq, taup] + m*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u] + n*nb [work]
+                       ! rworkspace: need   n [e] + n*n [ru]
+                       call stdlib_claset( 'F', m, n, czero, czero, work( iu ),ldwrku )
+                       call stdlib_clacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                       call stdlib_cunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_clacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 6o-slow
+                       ! generate q in a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_cungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=sp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                       ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                       ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                       nrwork = irvt
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_clacrm( chunk, n, a( i, 1 ), lda,rwork( iru ), n, work( iu )&
+                                    , ldwrku,rwork( nrwork ) )
+                          call stdlib_clacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_claset( 'F', m, n, czero, czero, u, ldu )
+                    call stdlib_clacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_sbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! set the right corner of u to identity matrix
+                    call stdlib_claset( 'F', m, m, czero, czero, u, ldu )
+                    if( m>n ) then
+                       call stdlib_claset( 'F', m-n, m-n, czero, cone,u( n+1, n+1 ), ldu )
+                    end if
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_clacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_clacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m [tau] + m    [work]
+                    ! cworkspace: prefer m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_claset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + m
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ldwkvt = m
+                    ! work(ivt) is m by m
+                    il = ivt + ldwkvt*m
+                    if( lwork >= m*n + m*m + 3*m ) then
+                       ! work(il) m by n
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ! work(il) is m by chunk
+                       ldwrkl = m
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    itau = il + ldwrkl*chunk
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_clacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_claset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_cgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_sbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by the right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(il) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! cworkspace: need   m*m [vt] + m*m [l]
+                    ! cworkspace: prefer m*m [vt] + m*n [l]
+                    ! rworkspace: need   0
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_cgemm( 'N', 'N', m, blk, m, cone, work( ivt ), m,a( 1, i ), &
+                                 lda, czero, work( il ),ldwrkl )
+                       call stdlib_clacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_clacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_claset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_cgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_sbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy vt to work(il), multiply right singular vectors of l
+                    ! in work(il) by q in a, storing result in vt
+                    ! cworkspace: need   m*m [l]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( il ), ldwrkl,a, lda, czero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! produce l in a, zeroing out above it
+                    call stdlib_claset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_sbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! cworkspace: need   m*m [vt]
+                    ! rworkspace: need   0
+                    call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( ivt ), ldwkvt,vt, ldvt, &
+                              czero, a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else if( n>=mnthr2 ) then
+                 ! mnthr2 <= n < mnthr1
+                 ! path 5t (n >> m, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_cungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n >> m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_sbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    ivt = nwork
+                    ! path 5to (n >> m, jobz='o')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! generate p**h in a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_cungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ldwkvt = m
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       nwork = ivt + ldwkvt*n
+                       chunk = n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(irvt,KIND=sp)
+                    ! storing the result in work(ivt), copying to u
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_clacrm( m, m, u, ldu, rwork( iru ), m, work( ivt ),ldwkvt, rwork( &
+                              nrwork ) )
+                    call stdlib_clacpy( 'F', m, m, work( ivt ), ldwkvt, u, ldu )
+                    ! multiply rwork(irvt) by p**h in a, storing the
+                    ! result in work(ivt), copying to a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                    ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_clarcm( m, blk, rwork( irvt ), m, a( 1, i ), lda,work( ivt ), &
+                                 ldwkvt, rwork( nrwork ) )
+                       call stdlib_clacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5ts (n >> m, jobz='s')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', m, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=sp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_clacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=sp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_clarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                 else
+                    ! path 5ta (n >> m, jobz='a')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', n, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=sp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_clacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=sp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_clarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr2
+                 ! path 6t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ! use stdlib_cunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6tn (n > m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_sbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 6to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_claset( 'F', m, n, czero, czero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*m ) then
+                       ! path 6to-fast
+                       ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix work(ivt)
+                       ! overwrite work(ivt) by right singular vectors of a,
+                       ! copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*n [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt] + m*nb [work]
+                       ! rworkspace: need   m [e] + m*m [rvt]
+                       call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                                 
+                       call stdlib_cunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_clacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 6to-slow
+                       ! generate p**h in a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_cungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=sp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                       ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                       ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                       nrwork = iru
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_clarcm( m, blk, rwork( irvt ), m, a( 1, i ),lda, work( ivt )&
+                                    , ldwkvt,rwork( nrwork ) )
+                          call stdlib_clacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_claset( 'F', m, n, czero, czero, vt, ldvt )
+                    call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_sbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=sp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_clacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_cunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! set all of vt to identity matrix
+                    call stdlib_claset( 'F', n, n, czero, cone, vt, ldvt )
+                    ! copy realmatrix rwork(irvt,KIND=sp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_clacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_cunmbr( 'P', 'R', 'C', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_sroundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_cgesdd
+
+     !> CGESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_cgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_cgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_cgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_cgesv
+
+     !> CGESVD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**H, not V.
+
+     subroutine stdlib_cgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), s(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: blk, chunk, i, ie, ierr, ir, irwork, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_cgeqrf, lwork_cungqr_n, lwork_cungqr_m, lwork_cgebrd, &
+                     lwork_cungbr_p, lwork_cungbr_q, lwork_cgelqf, lwork_cunglq_n, lwork_cunglq_m
+           real(sp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           complex(sp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! space needed for stdlib_zbdsqr is bdspac = 5*n
+                 mnthr = stdlib_ilaenv( 6, 'CGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_cgeqrf
+                 call stdlib_cgeqrf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_cgeqrf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cungqr
+                 call stdlib_cungqr( m, n, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_cungqr_n = int( cdum(1),KIND=ilp)
+                 call stdlib_cungqr( m, m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_cungqr_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cgebrd
+                 call stdlib_cgebrd( n, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_cgebrd = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cungbr
+                 call stdlib_cungbr( 'P', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_cungbr_p = int( cdum(1),KIND=ilp)
+                 call stdlib_cungbr( 'Q', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_cungbr_q = int( cdum(1),KIND=ilp)
+                 mnthr = stdlib_ilaenv( 6, 'CGESVD', jobu // jobvt, m, n, 0, 0 )
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_cgeqrf
+                       maxwrk = max( maxwrk, 2*n+lwork_cgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 2*n+lwork_cungbr_p )
+                       minwrk = 3*n
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_p )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_cgeqrf
+                       wrkbl = max( wrkbl, n+lwork_cungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_cungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_cgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_cgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*n + lwork_cgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_cungbr( 'Q', m, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_cungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_cungbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_cungbr( 'Q', m, m, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_cungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_cungbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                       maxwrk = max( maxwrk, 2*n+lwork_cungbr_p )
+                    end if
+                    minwrk = 2*n + m
+                 end if
+              else if( minmn>0 ) then
+                 ! space needed for stdlib_cbdsqr is bdspac = 5*m
+                 mnthr = stdlib_ilaenv( 6, 'CGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_cgelqf
+                 call stdlib_cgelqf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_cgelqf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cunglq
+                 call stdlib_cunglq( n, n, m, cdum(1), n, cdum(1), cdum(1), -1,ierr )
+                 lwork_cunglq_n = int( cdum(1),KIND=ilp)
+                 call stdlib_cunglq( m, n, m, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_cunglq_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cgebrd
+                 call stdlib_cgebrd( m, m, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_cgebrd = int( cdum(1),KIND=ilp)
+                  ! compute space needed for stdlib_cungbr p
+                 call stdlib_cungbr( 'P', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_cungbr_p = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_cungbr q
+                 call stdlib_cungbr( 'Q', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_cungbr_q = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_cgelqf
+                       maxwrk = max( maxwrk, 2*m+lwork_cgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 2*m+lwork_cungbr_q )
+                       minwrk = 3*m
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_q )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_cgelqf
+                       wrkbl = max( wrkbl, m+lwork_cunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_cgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_cungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_cgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_cgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*m + lwork_cgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_cungbr p
+                      call stdlib_cungbr( 'P', m, n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_cungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_cungbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_cungbr( 'P', n,  n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_cungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_cungbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 2*m+lwork_cungbr_q )
+                    end if
+                    minwrk = 2*m + n
+                 end if
+              end if
+              maxwrk = max( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = sqrt( stdlib_slamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (cworkspace: need 2*n, prefer n+n*nb)
+                    ! (rworkspace: need 0)
+                    call stdlib_cgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_claset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                    ! (rworkspace: need n)
+                    call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    irwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'U', n, ncvt, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_clacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_clacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_claset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_cgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: need 0)
+                       call stdlib_cungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum, 1,work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_cgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_clacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                       ! (rworkspace: n)
+                       call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (cworkspace: need 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_claset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_cgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_clacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need n*n+3*n-1, prefer n*n+2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_cgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_clacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_claset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                       ! (rworkspace: n)
+                       call stdlib_cgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, cdum,&
+                                  1, rwork( irwork ),info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, a, lda,work( ir ), ldwrkr, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_claset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_cgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_clacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_claset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_claset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, u, ldu,work( ir ), ldwrkr, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_claset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n*n+n+m, prefer 2*n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_cgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_clacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_claset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: need   0)
+                          call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_clacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_claset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_cgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                 ! (rworkspace: need n)
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 2*n+ncu, prefer 2*n+ncu*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_clacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_cungbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_clacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_cungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_cungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_cungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (cworkspace: need 2*m, prefer m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_cgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_claset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                    ! (rworkspace: need m)
+                    call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    irwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'U', m, 0, nru, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_clacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_clacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_claset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_cgebrd( m, m, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                 cdum, 1, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_cgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_clacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'L', m, n, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_claset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_cgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_clacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n))
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_cgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_clacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_claset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_cgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_cbdsqr( 'U', m, n, m, 0, s, rwork( ie ), a, lda,u, ldu, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_cgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_clacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m*m+m+n, prefer 2*m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_cgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_clacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_clacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_cgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_clacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_claset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_cgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_cungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_cbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                 ! (rworkspace: m)
+                 call stdlib_cgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_clacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_cungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 2*m+nrvt, prefer 2*m+nrvt*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_clacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_cungbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_cungbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_cungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_cbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgesvd
+
+     !> CGESVDQ: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_cgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,cwork, lcwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lcwork
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: u(ldu,*), v(ldv,*), cwork(*)
+           real(sp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_cgelqf, lwrk_cgesvd, lwrk_cgesvd2, lwrk_cgeqp3, &
+           lwrk_cgeqrf, lwrk_cunmlq, lwrk_cunmqr, lwrk_cunmqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lwunq, lwunq2, lwunlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(sp) :: big, epsln, rtmp, sconda, sfmin
+           complex(sp) :: ctmp
+           ! Local Arrays
+           complex(sp) :: cdummy(1)
+           real(sp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              iminwrk = max( 1, n + m - 1 )
+              rminwrk = max( 2, m, 5*n )
+           else
+              iminwrk = max( 1, n )
+              rminwrk = max( 2, 5*n )
+           end if
+           lquery = (liwork == -1 .or. lcwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+           ! compute workspace
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lcwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Cgeqp3 Of An M X N Matrix
+              lwqp3 = n+1
+              ! Minimal Workspace Length For Stdlib_Cunmqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lwunq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lwunq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Cpocon Of An N X N Matrix
+              lwcon = 2 * n
+              ! Stdlib_Cgesvd Of An N X N Matrix
+              lwsvd = max( 3 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_cgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                            
+                  lwrk_cgeqp3 = int( cdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_cunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmqr = int( cdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_cunmqr( 'L', 'N', m, m, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmqr = int( cdummy(1),KIND=ilp)
+                  else
+                      lwrk_cunmqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec )) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_cgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                rdummy, ierr )
+                      lwrk_cgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_cgeqp3, n+lwcon, lwrk_cgesvd )
+                      else
+                         optwrk = max( n+lwrk_cgeqp3, lwrk_cgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lwunq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lwunq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_cgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     else
+                        call stdlib_cgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     end if
+                     lwrk_cgesvd = int( cdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_cgeqp3, lwcon, lwrk_cgesvd,lwrk_cunmqr )
+                     else
+                         optwrk = n + max( lwrk_cgeqp3, lwrk_cgesvd,lwrk_cunmqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_cgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      else
+                          call stdlib_cgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      end if
+                      lwrk_cgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_cgeqp3, lwcon, lwrk_cgesvd )
+                      else
+                          optwrk = n + max( lwrk_cgeqp3, lwrk_cgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_cgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_cgesvd
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lwunq2, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_cgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lwunlq, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_cgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_cgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_cgeqp3,lwrk_cgesvd,lwrk_cunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_cgeqrf(n,n/2,u,ldu,cdummy,cdummy,-1,ierr)
+                            lwrk_cgeqrf = int( cdummy(1),KIND=ilp)
+                            call stdlib_cgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, cdummy,&
+                                       -1, rdummy, ierr )
+                            lwrk_cgesvd2 = int( cdummy(1),KIND=ilp)
+                            call stdlib_cunmqr( 'R', 'C', n, n, n/2, u, ldu, cdummy,v, ldv, &
+                                      cdummy, -1, ierr )
+                            lwrk_cunmqr2 = int( cdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_cgeqp3, n/2+lwrk_cgeqrf,n/2+lwrk_cgesvd2, n/2+&
+                                      lwrk_cunmqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_cgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_cgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_cgeqp3,lwrk_cgesvd,lwrk_cunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_cgelqf(n/2,n,u,ldu,cdummy,cdummy,-1,ierr)
+                           lwrk_cgelqf = int( cdummy(1),KIND=ilp)
+                           call stdlib_cgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, cdummy,&
+                                      -1, rdummy, ierr )
+                           lwrk_cgesvd2 = int( cdummy(1),KIND=ilp)
+                           call stdlib_cunmlq( 'R', 'N', n, n, n/2, u, ldu, cdummy,v, ldv, cdummy,&
+                                     -1,ierr )
+                           lwrk_cunmlq = int( cdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_cgeqp3, n/2+lwrk_cgelqf,n/2+lwrk_cgesvd2, n/2+&
+                                     lwrk_cunmlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lcwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_slamch('O')
+           ascaled = .false.
+           if ( rowprm ) then
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_icamax(n,a(p,1),lda)) )
+                     ! [[stdlib_clange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_clange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = - 8
+                         call stdlib_xerbla( 'CGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_isamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_slaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_claset('G', m, n, czero, cone, u, ldu)
+                    if ( wntua ) call stdlib_claset('G', m, m, czero, cone, u, ldu)
+                    if ( wntva ) call stdlib_claset('G', n, n, czero, cone, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_claset( 'G', n, 1, czero, czero, cwork, n )
+                        call stdlib_claset( 'G', m, n, czero, cone, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=sp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_clascl('G',0,0,sqrt(real(m,KIND=sp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_claswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_clange( 'M', m, n, a, lda, rwork )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = - 8
+                    call stdlib_xerbla( 'CGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=sp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_clascl('G',0,0, sqrt(real(m,KIND=sp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_cgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lcwork-n,rwork, ierr )
+                     
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_slamch('E')
+           sfmin = stdlib_slamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=sp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_slamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_clacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_scnrm2( p, v(1,p), 1 )
+                       call stdlib_csscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_cpocon( 'U', nr, v, ldv, one, rtmp,cwork, rwork, ierr )
+                                  
+                    else
+                        call stdlib_cpocon( 'U', nr, v, ldv, one, rtmp,cwork(n+1), rwork, ierr )
+                                  
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**h = [a](1:nr,1:n)**h
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**h and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                       a(q,p) = conjg(a(p,q))
+                       if ( q <= nr ) a(p,q) = czero
+                    end do
+                 end do
+                 call stdlib_cgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_claset( 'L', nr-1,nr-1, czero,czero, a(2,1), lda )
+                           
+                 call stdlib_cgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_cgesvd to r**h
+                  ! .. copy r**h into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_claset( 'U', nr-1,nr-1, czero,czero, u(1,2), ldu )
+                           
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as conjugate transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_cgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Cgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_clacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_claset( 'L', nr-1, nr-1, czero, czero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_cgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_claset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_claset( 'A',nr,n1-nr,czero,czero,u(1,nr+1), ldu )
+                     call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_cgesvd to r**h
+                  ! .. copy r**h into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_claset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_cgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_clapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_claset('G', n, n-nr, czero, czero, v(1,nr+1), ldv)
+                     call stdlib_cgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     do p = 1, n
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, n
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                     end do
+                     call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Cgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_clacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_claset( 'L', nr-1, nr-1, czero, czero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_cgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_clapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_claset('G', n-nr, n, czero,czero, v(nr+1,1), ldv)
+                      call stdlib_cgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                lcwork-n, rwork, info )
+                      call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the adjoint of the matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_cgesvd to r**h [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_claset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite [v], the nr right
+                 ! singular vectors of r**h stored in [u](1:nr,1:nr) as conjugate
+                 ! transposed
+                    call stdlib_cgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       v(p,p) = conjg(v(p,p))
+                       do q = p + 1, nr
+                          ctmp   = conjg(v(q,p))
+                          v(q,p) = conjg(v(p,q))
+                          v(p,q) = ctmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_clapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_claset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_claset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'cgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_claset('U',nr-1,nr-1, czero,czero, v(1,2),ldv)
+                                  
+                        call stdlib_claset('A',n,n-nr,czero,czero,v(1,nr+1),ldv)
+                        call stdlib_cgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                                  lcwork-n, rwork, info )
+                        do p = 1, n
+                           v(p,p) = conjg(v(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(v(q,p))
+                              v(q,p) = conjg(v(p,q))
+                              v(p,q) = ctmp
+                           end do
+                        end do
+                        call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           u(p,p) = conjg(u(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(u(q,p))
+                              u(q,p) = conjg(u(p,q))
+                              u(p,q) = ctmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_claset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_claset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_claset('A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**h into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_claset('U',nr-1,nr-1,czero,czero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_cgeqrf( n, nr, u(1,nr+1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                  lcwork-n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = conjg(u(p,nr+q))
+                            end do
+                        end do
+                       call stdlib_claset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                       call stdlib_cgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, cwork(n+nr+&
+                                 1),lcwork-n-nr,rwork, info )
+                       call stdlib_claset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_claset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_claset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_cunmqr('R','C', n, n, nr, u(1,nr+1), ldu,cwork(n+1),v,ldv,&
+                                 cwork(n+nr+1),lcwork-n-nr,ierr)
+                       call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_claset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_claset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_cgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_clacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_claset( 'L', nr-1,nr-1, czero,czero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_cgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_clapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_claset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_claset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'cgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_clacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_claset('L', nr-1,nr-1, czero,czero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_claset('A', n-nr,n, czero,czero, v(nr+1,1),ldv)
+                       call stdlib_cgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                 lcwork-n, rwork, info )
+                       call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the adjoint of the matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_claset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_claset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_claset( 'A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_clacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_claset('L',nr-1,nr-1,czero,czero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_cgelqf( nr, n, u(nr+1,1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                 lcwork-n-nr, ierr )
+                       call stdlib_clacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_claset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                                 
+                       call stdlib_cgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, cwork(n+nr+&
+                                 1), lcwork-n-nr, rwork, info )
+                       call stdlib_claset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_claset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_claset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_cunmlq('R','N',n,n,nr,u(nr+1,1),ldu,cwork(n+1),v, ldv, cwork(n+&
+                                 nr+1),lcwork-n-nr,ierr)
+                       call stdlib_clapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_claset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_claset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**h or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_slaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_slascl( 'G',0,0, one,sqrt(real(m,KIND=sp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_cgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_cgesvdq
+
+     !> CGESVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_cgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(sp), intent(inout) :: c(*), r(*)
+           complex(sp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_cgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_claqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_clacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_cgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_clantr( 'M', 'U', 'N', info, info, af, ldaf,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_clange( 'M', n, info, a, lda, rwork ) /rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_clange( norm, n, n, a, lda, rwork )
+           rpvgrw = stdlib_clantr( 'M', 'U', 'N', n, n, af, ldaf, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_clange( 'M', n, n, a, lda, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_cgecon( norm, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_clacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_cgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_cgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_cgesvx
+
+     !> CGETSLS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_cgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(sp) :: anrm, bignum, bnrm, smlnum, dum(1)
+           complex(sp) :: tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'C' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_cgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_cgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_cgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_cgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_cgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_cgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_cgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_cgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=sp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'CGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=sp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_claset( 'FULL', max( m, n ), nrhs, czero, czero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', m, n, a, lda, dum )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_claset( 'F', maxmn, nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_clange( 'M', brow, nrhs, b, ldb, dum )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_clascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_clascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_cgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_cgemqr( 'L' , 'C', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_ctrtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_ctrtrs( 'U', 'C', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = czero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_cgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_cgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_ctrtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_cgemlq( 'L', 'C', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_cgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_ctrtrs( 'L', 'C', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_clascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_clascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_clascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_clascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=sp)
+           return
+     end subroutine stdlib_cgetsls
+
+     !> CGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a complex M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in CGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of CGEQRT for more details on the format.
+
+     pure subroutine stdlib_cgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,real,cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_clatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_cungtsqr_row;
+              ! d) diagonal d for stdlib_cunhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=sp) / real( mb1 - n,&
+                           KIND=sp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_cungtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=sp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_clatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_ccopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_cungtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_cunhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_cunhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-cone ) then
+                 do j = i, n
+                    a( i, j ) = -cone * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_ccopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_cgetsqrhrt
+
+     !> CGGES: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> CGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_cgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr, work,lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_c) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkmin, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'CGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'CUNMQR', ' ', n, 1, n, -1 ) )
+              if( ilvsl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'CUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_cggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_claset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_cgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           call stdlib_chgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: none needed)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_ctgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_cggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_cggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_clascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_clascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cgges
+
+     !> CGGESX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the complex Schur form (S,T),
+     !> and, optionally, the left and/or right matrices of Schur vectors (VSL
+     !> and VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**H, (VSL) T (VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if T is
+     !> upper triangular with non-negative diagonal and S is upper
+     !> triangular.
+
+     subroutine stdlib_cggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, alpha,&
+      beta, vsl, ldvsl, vsr,ldvsr, rconde, rcondv, work, lwork, rwork,iwork, liwork, bwork, info )
+                
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rconde(2), rcondv(2), rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_c) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantsb, wantse, &
+                     wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, iright, irows, &
+                     irwrk, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, smlnum
+           ! Local Arrays 
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = 2*n
+                 maxwrk = n*(1 + stdlib_ilaenv( 1, 'CGEQRF', ' ', n, 1, n, 0 ) )
+                 maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'CUNMQR', ' ', n, 1, n, -1 ) ) )
+                           
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'CUNGQR', ' ', n, 1, n, -1 ) ) &
+                              )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 2
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              else if( liwork<liwmin  .and. .not.lquery) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_cggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_claset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_cgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace:    need n)
+           iwrk = itau
+           call stdlib_chgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 40
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl )call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              ! (complex workspace: if ijob >= 1, need max(1, 2*sdim*(n-sdim))
+                                  ! otherwise, need 1 )
+              call stdlib_ctgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pl, pr,dif, work( iwrk ), lwork-iwrk+1, iwork, liwork,ierr &
+                        )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-21 ) then
+                  ! not enough complex workspace
+                 info = -21
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_cggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_cggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_clascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_clascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           40 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_cggesx
+
+     !> CGGEV: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_cggev( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkmin, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(sp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=sp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'CGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'CUNMQR', ' ', n, 1, n, 0 ) )
+              if( ilvl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'CUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'E' )*stdlib_slamch( 'B' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_cggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_claset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_cgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_cgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_chgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           ! (real workspace: need 2*n)
+           ! (complex workspace: need 2*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_ctgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_cggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_cggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_cggev
+
+     !> CGGEVX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B) the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> Optionally, it also computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_cggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alpha, beta, vl, &
+     ldvl, vr, ldvr, ilo, ihi,lscale, rscale, abnrm, bbnrm, rconde, rcondv,work, lwork, rwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(sp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: lscale(*), rconde(*), rcondv(*), rscale(*), rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, wantsb, wantse, wantsn, &
+                     wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(sp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=sp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( noscl .or. stdlib_lsame( balanc,'S' ) .or.stdlib_lsame( balanc, 'B' ) ) ) &
+                     then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -13
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -15
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 minwrk = 2*n
+                 if( wantse ) then
+                    minwrk = 4*n
+                 else if( wantsv .or. wantsb ) then
+                    minwrk = 2*n*( n + 1)
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'CGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'CUNMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'CUNGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -25
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (real workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_cggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,rwork, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_clange( '1', n, n, a, lda, rwork( 1 ) )
+           if( ilascl ) then
+              rwork( 1 ) = abnrm
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, 1, 1, rwork( 1 ), 1,ierr )
+              abnrm = rwork( 1 )
+           end if
+           bbnrm = stdlib_clange( '1', n, n, b, ldb, rwork( 1 ) )
+           if( ilbscl ) then
+              rwork( 1 ) = bbnrm
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, rwork( 1 ), 1,ierr )
+              bbnrm = rwork( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_claset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_cgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_cgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_chgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 90
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! stdlib_ctgevc: (complex workspace: need 2*n )
+                   ! (real workspace:    need 2*n )
+           ! stdlib_ctgsna: (complex workspace: need 2*n*n if sense='v' or 'b')
+                   ! (integer workspace: need n+2 )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_ctgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work( iwrk ), rwork,ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 90
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_ctgevc) and estimate condition
+                 ! numbers (stdlib_ctgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to
+                 ! re-calculate eigenvectors and estimate the condition numbers
+                 ! one at a time.
+                 do i = 1, n
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    bwork( i ) = .true.
+                    iwrk = n + 1
+                    iwrk1 = iwrk + n
+                    if( wantse .or. wantsb ) then
+                       call stdlib_ctgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, 1, m,work( iwrk1 ), rwork, ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 90
+                       end if
+                    end if
+                    call stdlib_ctgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), 1, m, work( iwrk1 ),lwork-iwrk1+1, iwork, &
+                              ierr )
+                 end do
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_cggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_50: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vl( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_50
+                 temp = one / temp
+                 do jr = 1, n
+                    vl( jr, jc ) = vl( jr, jc )*temp
+                 end do
+              end do loop_50
+           end if
+           if( ilvr ) then
+              call stdlib_cggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_80: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vr( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_80
+                 temp = one / temp
+                 do jr = 1, n
+                    vr( jr, jc ) = vr( jr, jc )*temp
+                 end do
+              end do loop_80
+           end if
+           ! undo scaling if necessary
+           90 continue
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cggevx
+
+     !> CHBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.
+
+     subroutine stdlib_chbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, iscale
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = real( ab( 1, 1 ),KIND=sp)
+              else
+                 w( 1 ) = real( ab( kd+1, 1 ),KIND=sp)
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_clascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_clascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_chbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           call stdlib_chbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_csteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              indrwk = inde + n
+              call stdlib_csteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_chbev
+
+     !> CHBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.  If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_chbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, rwork, lrwork, &
+               iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indwk2, indwrk, iscale, liwmin, llrwk, llwk2, &
+                     lrwmin, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 .or. lrwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else
+              if( wantz ) then
+                 lwmin = 2*n**2
+                 lrwmin = 1 + 5*n + 2*n**2
+                 liwmin = 3 + 5*n
+              else
+                 lwmin = n
+                 lrwmin = n
+                 liwmin = 1
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ab( 1, 1 ),KIND=sp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_clascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_clascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_chbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indwrk + 1
+           call stdlib_chbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_cstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_cstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_cgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_clacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_chbevd
+
+     !> CHBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian band matrix A.  Eigenvalues and eigenvectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_chbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*)
+           complex(sp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           complex(sp) :: ctmp1
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 ctmp1 = ab( 1, 1 )
+              else
+                 ctmp1 = ab( kd+1, 1 )
+              end if
+              tmp1 = real( ctmp1,KIND=sp)
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = real( ctmp1,KIND=sp)
+                 if( wantz )z( 1, 1 ) = cone
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if ( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           endif
+           anrm = stdlib_clanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_clascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_clascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_chbtrd to reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           call stdlib_chbtrd( jobz, uplo, n, kd, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_csteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_scopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_ssterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_clacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_csteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_cstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_cstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_cstein.
+              do j = 1, m
+                 call stdlib_ccopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_cgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_chbevx
+
+     !> CHBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_chbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_cpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_chbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork( &
+                     indwrk ), iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_chbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_csteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_csteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indwrk ), info )
+                        
+           end if
+           return
+     end subroutine stdlib_chbgv
+
+     !> CHBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_chbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, rwork, lrwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llrwk, llwk2, lrwmin, &
+                     lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1+n
+              lrwmin = 1+n
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n**2
+              lrwmin = 1 + 5*n + 2*n**2
+              liwmin = 3 + 5*n
+           else
+              lwmin = n
+              lrwmin = n
+              liwmin = 1
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_cpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 2
+           llrwk = lrwork - indwrk + 2
+           call stdlib_chbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork, &
+                     iinfo )
+           ! reduce hermitian band matrix to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_chbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_cstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_cstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_cgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_clacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_chbgvd
+
+     !> CHBGVX: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_chbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(sp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwk, indrwk, indwrk, &
+                     itmp1, j, jj, nsplit
+           real(sp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_cpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_chbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, rwork, &
+                     iinfo )
+           ! solve the standard eigenvalue problem.
+           ! reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_chbtrd( vect, uplo, n, ka, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_csteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_scopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              call stdlib_scopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_ssterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_clacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_csteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired,
+           ! call stdlib_cstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_sstebz( range, order, n, vl, vu, il, iu, abstol,rwork( indd ), rwork( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ), &
+                     info )
+           if( wantz ) then
+              call stdlib_cstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_cstein.
+              do j = 1, m
+                 call stdlib_ccopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_cgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_cswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_chbgvx
+
+     !> CHEEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_cheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork,lrwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwk2, indwrk, iscale, liopt, &
+                     liwmin, llrwk, llwork, llwrk2, lopt, lropt, lrwmin, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 lrwmin = 1
+                 liwmin = 1
+                 lopt = lwmin
+                 lropt = lrwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    lwmin = 2*n + n*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n + 1
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+                 lopt = max( lwmin, n +stdlib_ilaenv( 1, 'CHETRD', uplo, n, -1, -1, -1 ) )
+                 lropt = lrwmin
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=sp)
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_clascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_chetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           indrwk = inde + n
+           indwk2 = indwrk + n*n
+           llwork = lwork - indwrk + 1
+           llwrk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_chetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_cstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_cunmtr to multiply it to the
+           ! householder transformations represented as householder vectors in
+           ! a.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_cstedc( 'I', n, w, rwork( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, rwork( indrwk ), llrwk,iwork, liwork, info )
+              call stdlib_cunmtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_clacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_cheevd
+
+     !> CHEGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_chegvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lropt, lrwmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n + n*n
+              lrwmin = 1 + 5*n + 2*n*n
+              liwmin = 3 + 5*n
+           else
+              lwmin = n + 1
+              lrwmin = n
+              liwmin = 1
+           end if
+           lopt = lwmin
+           lropt = lrwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHEGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_cpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_cheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork, lrwork,iwork, liwork,&
+                      info )
+           lopt = max( real( lopt,KIND=sp), real( work( 1 ),KIND=sp) )
+           lropt = max( real( lropt,KIND=sp), real( rwork( 1 ),KIND=sp) )
+           liopt = max( real( liopt,KIND=sp), real( iwork( 1 ),KIND=sp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ctrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ctrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_chegvd
+
+     !> CHPEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian matrix A in packed storage.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_chpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,rwork, lrwork, iwork, &
+               liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale, liwmin, llrwk, &
+                     llwrk, lrwmin, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=sp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_clanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_csscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_chptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indrwk = inde + n
+           indwrk = indtau + n
+           llwrk = lwork - indwrk + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_chptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_cupgtr to generate the orthogonal matrix, then call stdlib_cstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_cstedc( 'I', n, w, rwork( inde ), z, ldz, work( indwrk ),llwrk, rwork( &
+                        indrwk ), llrwk, iwork, liwork,info )
+              call stdlib_cupmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_chpevd
+
+     !> CHPGVD: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_chpgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: rwork(*), w(*)
+           complex(sp), intent(inout) :: ap(*), bp(*)
+           complex(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lrwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_cpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_chpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_chpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, rwork,lrwork, iwork, &
+                     liwork, info )
+           lwmin = max( real( lwmin,KIND=sp), real( work( 1 ),KIND=sp) )
+           lrwmin = max( real( lrwmin,KIND=sp), real( rwork( 1 ),KIND=sp) )
+           liwmin = max( real( liwmin,KIND=sp), real( iwork( 1 ),KIND=sp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ctpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ctpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_chpgvd
+
+     !> CGEES: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_cgees( jobvs, sort, select, n, a, lda, sdim, w, vs,ldvs, work, lwork, &
+               rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_c) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, maxwrk, &
+                     minwrk
+           real(sp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_chseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'CGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_chseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=sp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'CUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_clascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_cgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_cgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_clacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_chseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_clascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (cworkspace: none)
+              ! (rworkspace: none)
+              call stdlib_ctrsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,s, sep, work( &
+                        iwrk ), lwork-iwrk+1, icond )
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_cgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_clascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_ccopy( n, a, lda+1, w, 1 )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgees
+
+     !> CGEESX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_sp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_cgeesx( jobvs, sort, select, sense, n, a, lda, sdim, w,vs, ldvs, rconde, &
+               rcondv, work, lwork, rwork,bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           real(sp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_c) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantsb, wantse, wantsn, wantst, wantsv, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, lwrk, &
+                     maxwrk, minwrk
+           real(sp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_chseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_ctrsen later
+             ! in the code.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'CGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_chseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=sp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'CUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, ( n*n )/2 )
+              end if
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_clascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_cgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_cgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_clacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_chseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_clascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (cworkspace: if sense is not 'n', need 2*sdim*(n-sdim)
+                           ! otherwise, need none )
+              ! (rworkspace: none)
+              call stdlib_ctrsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( icond==-14 ) then
+                 ! not enough complex workspace
+                 info = -15
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_cgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_clascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_ccopy( n, a, lda+1, w, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgeesx
+
+     !> CGEEV: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_cgeev( jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, irwork, itau, iwrk, k, lwork_trevc, &
+                     maxwrk, minwrk, nout
+           real(sp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(sp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_chseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'CGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 if( wantvl ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'CUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_ctrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_chseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'CUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_ctrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_chseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    call stdlib_chseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 maxwrk = max( maxwrk, hswork, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_clascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_cgebal( 'B', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_cgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_clacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_clacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_clacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( 'E', 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_chseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need 2*n)
+              irwork = ibal + n
+              call stdlib_ctrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork( irwork ), n, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_cgebak( 'B', 'L', n, ilo, ihi, rwork( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_scnrm2( n, vl( 1, i ), 1 )
+                 call stdlib_csscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vl( k, i ),KIND=sp)**2 +aimag( vl( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_isamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_cscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=sp), zero,KIND=sp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_cgebak( 'B', 'R', n, ilo, ihi, rwork( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_scnrm2( n, vr( 1, i ), 1 )
+                 call stdlib_csscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vr( k, i ),KIND=sp)**2 +aimag( vr( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_isamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_cscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=sp), zero,KIND=sp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_clascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info>0 ) then
+                 call stdlib_clascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgeev
+
+     !> CGEEVX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_sp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_cgeevx( balanc, jobvl, jobvr, sense, n, a, lda, w, vl,ldvl, vr, ldvr, ilo, &
+               ihi, scale, abnrm, rconde,rcondv, work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(sp), intent(out) :: abnrm
+           ! Array Arguments 
+           real(sp), intent(out) :: rconde(*), rcondv(*), rwork(*), scale(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(sp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(sp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ) &
+                     .or.stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) ) then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_chseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'CGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_ctrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_chseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    call stdlib_ctrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_chseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    if( wntsnn ) then
+                       call stdlib_chseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    else
+                       call stdlib_chseqr( 'S', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                 else
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'CUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, 2*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_clange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_clascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_cgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_clange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_cgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_clacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_clacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_clacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_cunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_chseqr( job, 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_chseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need n)
+              call stdlib_ctrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork, n, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (cworkspace: need n*n+2*n unless sense = 'e')
+           ! (rworkspace: need 2*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_ctrsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, rwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_cgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_scnrm2( n, vl( 1, i ), 1 )
+                 call stdlib_csscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vl( k, i ),KIND=sp)**2 +aimag( vl( k, i ) )**2
+                 end do
+                 k = stdlib_isamax( n, rwork, 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_cscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=sp), zero,KIND=sp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_cgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_scnrm2( n, vr( 1, i ), 1 )
+                 call stdlib_csscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vr( k, i ),KIND=sp)**2 +aimag( vr( k, i ) )**2
+                 end do
+                 k = stdlib_isamax( n, rwork, 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_cscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=sp), zero,KIND=sp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_clascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_slascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_clascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_cgeevx
+
+     !> CGEJSV: computes the singular value decomposition (SVD) of a complex M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^*,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+     !> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+
+     pure subroutine stdlib_cgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,cwork, lwork, rwork, lrwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, lrwork, m, n
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: u(ldu,*), v(ldv,*), cwork(lwork)
+           real(sp), intent(out) :: sva(n), rwork(lrwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: ctemp
+           real(sp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lquery, lsvec, l2aber, &
+                     l2kill, l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           integer(ilp) :: optwrk, minwrk, minrwrk, miniwrk
+           integer(ilp) :: lwcon, lwlqf, lwqp3, lwqrf, lwunmlq, lwunmqr, lwunmqrm, lwsvdj, &
+                     lwsvdjv, lrwqp3, lrwcon, lrwsvdj, iwoff
+           integer(ilp) :: lwrk_cgelqf, lwrk_cgeqp3, lwrk_cgeqp3n, lwrk_cgeqrf, lwrk_cgesvj, &
+                     lwrk_cgesvjv, lwrk_cgesvju, lwrk_cunmlq, lwrk_cunmqr, lwrk_cunmqrm
+           ! Local Arrays
+           complex(sp) :: cdummy(1)
+           real(sp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,log,max,min,real,nint,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' ) .and. ( m == n )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec .or. stdlib_lsame( jobu, 'N' ) .or.( stdlib_lsame( jobu, 'W' ) &
+                     .and. rsvec .and. l2tran ) ) ) then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.( stdlib_lsame( jobv, 'W' ) &
+                     .and. lsvec .and. l2tran ) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( stdlib_lsame(jobt,'T') .or. stdlib_lsame(jobt,'N') ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info == 0 ) then
+               ! Compute The Minimal And The Optimal Workspace Lengths
+               ! [[the expressions for computing the minimal and the optimal
+               ! values of lcwork, lrwork are written with a lot of redundancy and
+               ! can be simplified. however, this verbose form is useful for
+               ! maintenance and modifications of the code.]]
+              ! .. minimal workspace length for stdlib_cgeqp3 of an m x n matrix,
+               ! stdlib_cgeqrf of an n x n matrix, stdlib_cgelqf of an n x n matrix,
+               ! stdlib_cunmlq for computing n x n matrix, stdlib_cunmqr for computing n x n
+               ! matrix, stdlib_cunmqr for computing m x n matrix, respectively.
+               lwqp3 = n+1
+               lwqrf = max( 1, n )
+               lwlqf = max( 1, n )
+               lwunmlq  = max( 1, n )
+               lwunmqr  = max( 1, n )
+               lwunmqrm = max( 1, m )
+              ! Minimal Workspace Length For Stdlib_Cpocon Of An N X N Matrix
+               lwcon = 2 * n
+              ! .. minimal workspace length for stdlib_cgesvj of an n x n matrix,
+               ! without and with explicit accumulation of jacobi rotations
+               lwsvdj  = max( 2 * n, 1 )
+               lwsvdjv = max( 2 * n, 1 )
+               ! .. minimal real workspace length for stdlib_cgeqp3, stdlib_cpocon, stdlib_cgesvj
+               lrwqp3  = 2 * n
+               lrwcon  = n
+               lrwsvdj = n
+               if ( lquery ) then
+                   call stdlib_cgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                             
+                   lwrk_cgeqp3 = real( cdummy(1),KIND=sp)
+                   call stdlib_cgeqrf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_cgeqrf = real( cdummy(1),KIND=sp)
+                   call stdlib_cgelqf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_cgelqf = real( cdummy(1),KIND=sp)
+               end if
+               minwrk  = 2
+               optwrk  = 2
+               miniwrk = n
+               if ( .not. (lsvec .or. rsvec ) ) then
+                   ! Minimal And Optimal Sizes Of The Complex Workspace If
+                   ! only the singular values are requested
+                   if ( errest ) then
+                       minwrk = max( n+lwqp3, n**2+lwcon, n+lwqrf, lwsvdj )
+                   else
+                       minwrk = max( n+lwqp3, n+lwqrf, lwsvdj )
+                   end if
+                   if ( lquery ) then
+                       call stdlib_cgesvj( 'L', 'N', 'N', n, n, a, lda, sva, n, v,ldv, cdummy, -1,&
+                                  rdummy, -1, ierr )
+                       lwrk_cgesvj = real( cdummy(1),KIND=sp)
+                       if ( errest ) then
+                           optwrk = max( n+lwrk_cgeqp3, n**2+lwcon,n+lwrk_cgeqrf, lwrk_cgesvj )
+                                     
+                       else
+                           optwrk = max( n+lwrk_cgeqp3, n+lwrk_cgeqrf,lwrk_cgesvj )
+                       end if
+                   end if
+                   if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                   else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                   end if
+                   if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( errest ) then
+                      minwrk = max( n+lwqp3, lwcon, lwsvdj, n+lwlqf,2*n+lwqrf, n+lwsvdj, n+&
+                                lwunmlq )
+                  else
+                      minwrk = max( n+lwqp3, lwsvdj, n+lwlqf, 2*n+lwqrf,n+lwsvdj, n+lwunmlq )
+                                
+                  end if
+                  if ( lquery ) then
+                      call stdlib_cgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_cgesvj = real( cdummy(1),KIND=sp)
+                      call stdlib_cunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmlq = real( cdummy(1),KIND=sp)
+                      if ( errest ) then
+                      optwrk = max( n+lwrk_cgeqp3, lwcon, lwrk_cgesvj,n+lwrk_cgelqf, 2*n+&
+                                lwrk_cgeqrf,n+lwrk_cgesvj,  n+lwrk_cunmlq )
+                      else
+                      optwrk = max( n+lwrk_cgeqp3, lwrk_cgesvj,n+lwrk_cgelqf,2*n+lwrk_cgeqrf, n+&
+                                lwrk_cgesvj,n+lwrk_cunmlq )
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                  else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( errest ) then
+                      minwrk = n + max( lwqp3,lwcon,n+lwqrf,lwsvdj,lwunmqrm )
+                  else
+                      minwrk = n + max( lwqp3, n+lwqrf, lwsvdj, lwunmqrm )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_cgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_cgesvj = real( cdummy(1),KIND=sp)
+                      call stdlib_cunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmqrm = real( cdummy(1),KIND=sp)
+                      if ( errest ) then
+                      optwrk = n + max( lwrk_cgeqp3, lwcon, n+lwrk_cgeqrf,lwrk_cgesvj, &
+                                lwrk_cunmqrm )
+                      else
+                      optwrk = n + max( lwrk_cgeqp3, n+lwrk_cgeqrf,lwrk_cgesvj, lwrk_cunmqrm )
+                                
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      if ( errest ) then
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                      end if
+                  else
+                      if ( errest ) then
+                         minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, lrwqp3, lrwsvdj )
+                      end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( .not. jracc ) then
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon,  2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,   n+&
+                                   lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3,        2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,      &
+                                   n+lwunmqrm )
+                      end if
+                      miniwrk = miniwrk + n
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  else
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+&
+                                   lwunmqr,n+lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+lwunmqr,n+&
+                                   lwunmqrm )
+                      end if
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  end if
+                  if ( lquery ) then
+                      call stdlib_cunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmqrm = real( cdummy(1),KIND=sp)
+                      call stdlib_cunmqr( 'L', 'N', n, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_cunmqr = real( cdummy(1),KIND=sp)
+                      if ( .not. jracc ) then
+                          call stdlib_cgeqp3( n,n, a, lda, iwork, cdummy,cdummy, -1,rdummy, ierr )
+                                    
+                          lwrk_cgeqp3n = real( cdummy(1),KIND=sp)
+                          call stdlib_cgesvj( 'L', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_cgesvj = real( cdummy(1),KIND=sp)
+                          call stdlib_cgesvj( 'U', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_cgesvju = real( cdummy(1),KIND=sp)
+                          call stdlib_cgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_cgesvjv = real( cdummy(1),KIND=sp)
+                          call stdlib_cunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -&
+                                    1, ierr )
+                          lwrk_cunmlq = real( cdummy(1),KIND=sp)
+                          if ( errest ) then
+                            optwrk = max( n+lwrk_cgeqp3, n+lwcon,2*n+n**2+lwcon, 2*n+lwrk_cgeqrf,&
+                            2*n+lwrk_cgeqp3n,2*n+n**2+n+lwrk_cgelqf,2*n+n**2+n+n**2+lwcon,2*n+&
+                            n**2+n+lwrk_cgesvj,2*n+n**2+n+lwrk_cgesvjv,2*n+n**2+n+lwrk_cunmqr,2*n+&
+                                      n**2+n+lwrk_cunmlq,n+n**2+lwrk_cgesvju,n+lwrk_cunmqrm )
+                          else
+                            optwrk = max( n+lwrk_cgeqp3,2*n+n**2+lwcon, 2*n+lwrk_cgeqrf,2*n+&
+                            lwrk_cgeqp3n,2*n+n**2+n+lwrk_cgelqf,2*n+n**2+n+n**2+lwcon,2*n+n**2+n+&
+                            lwrk_cgesvj,2*n+n**2+n+lwrk_cgesvjv,2*n+n**2+n+lwrk_cunmqr,2*n+n**2+n+&
+                                      lwrk_cunmlq,n+n**2+lwrk_cgesvju,n+lwrk_cunmqrm )
+                          end if
+                      else
+                          call stdlib_cgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_cgesvjv = real( cdummy(1),KIND=sp)
+                          call stdlib_cunmqr( 'L', 'N', n, n, n, cdummy, n, cdummy,v, ldv, cdummy,&
+                                     -1, ierr )
+                          lwrk_cunmqr = real( cdummy(1),KIND=sp)
+                          call stdlib_cunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -&
+                                    1, ierr )
+                          lwrk_cunmqrm = real( cdummy(1),KIND=sp)
+                          if ( errest ) then
+                             optwrk = max( n+lwrk_cgeqp3, n+lwcon,2*n+lwrk_cgeqrf, 2*n+n**2,2*n+&
+                                       n**2+lwrk_cgesvjv,2*n+n**2+n+lwrk_cunmqr,n+lwrk_cunmqrm )
+                          else
+                             optwrk = max( n+lwrk_cgeqp3, 2*n+lwrk_cgeqrf,2*n+n**2, 2*n+n**2+&
+                                       lwrk_cgesvjv,2*n+n**2+n+lwrk_cunmqr,n+lwrk_cunmqrm )
+                          end if
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                  else
+                      minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                  end if
+               end if
+               minwrk = max( 2, minwrk )
+               optwrk = max( optwrk, minwrk )
+               if ( lwork  < minwrk  .and. (.not.lquery) ) info = - 17
+               if ( lrwork < minrwrk .and. (.not.lquery) ) info = - 19
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'CGEJSV', - info )
+              return
+           else if ( lquery ) then
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = minrwrk
+               iwork(1) = max( 4, miniwrk )
+               return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:4) = 0
+              rwork(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_slamch() does not fail on the target architecture.
+           epsln = stdlib_slamch('EPSILON')
+           sfmin = stdlib_slamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_slamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=sp)*real(n,KIND=sp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_classq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'CGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_sscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_claset( 'G', m, n1, czero, cone, u, ldu )
+              if ( rsvec ) call stdlib_claset( 'G', n, n,  czero, cone, v, ldv )
+              rwork(1) = one
+              rwork(2) = one
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              iwork(4) = -1
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill nonzero columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_clascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_clacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                   call stdlib_cgeqrf( m, n, u,ldu, cwork, cwork(n+1),lwork-n,ierr )
+                   call stdlib_cungqr( m,n1,1, u,ldu,cwork,cwork(n+1),lwork-n,ierr )
+                   call stdlib_ccopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = cone
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              rwork(1) = one / scalem
+              rwork(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              iwork(4) = -1
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^* and a^* * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_classq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_classq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    rwork(m+p)  = xsc * scalem
+                    rwork(p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, rwork(p) )
+                    if (rwork(p) /= zero)aatmin = min(aatmin,rwork(p))
+                 end do
+              else
+                 do p = 1, m
+                    rwork(m+p) = scalem*abs( a(p,stdlib_icamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, rwork(m+p) )
+                    aatmin = min( aatmin, rwork(m+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^*  would be better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_slassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=sp))
+              ! now, sva().^2/trace(a^* * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^* * a.  do the same with the
+              ! diagonal of a * a^*, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^* and a^* * a have the
+              ! same trace.
+              entrat = zero
+              do p = 1, m
+                 big1 = ( ( rwork(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=sp))
+              ! analyze the entropies and decide a or a^*. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^* is better than a, take the adjoint of a. this is allowed
+              ! only for square matrices, m=n.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                        ctemp = conjg(a(q,p))
+                       a(q,p) = conjg(a(p,q))
+                       a(p,q) = ctemp
+                    end do
+                 end do
+                 a(n,n) = conjg(a(n,n))
+                 do p = 1, n
+                    rwork(m+p) = sva(p)
+                    sva(p) = rwork(p)
+                    ! previously computed row 2-norms are now column 2-norms
+                    ! of the transposed matrix
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_cgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_cgesvj will compute them. so, in that case,
+           ! one should use stdlib_cgesvj instead of stdlib_cgejsv.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=sp) )
+           ! >> for future updates: allow bigger range, i.e. the largest column
+           ! will be allowed up to big/n and stdlib_cgesvj will do the rest. however, for
+           ! this all other (lapack) components must allow such a range.
+           ! temp1  = big/real(n,KIND=sp)
+           ! temp1  = big * epsln  this should 'almost' work with current lapack components
+           call stdlib_slascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_clascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_cgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_claset( 'A', m, 1, czero, czero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              if ( ( lsvec .and. rsvec ) .and. .not.( jracc ) ) then
+                   iwoff = 2*n
+              else
+                   iwoff = n
+              end if
+              do p = 1, m - 1
+                 q = stdlib_isamax( m-p+1, rwork(m+p), 1 ) + p - 1
+                 iwork(iwoff+p) = q
+                 if ( p /= q ) then
+                    temp1      = rwork(m+p)
+                    rwork(m+p) = rwork(m+q)
+                    rwork(m+q) = temp1
+                 end if
+              end do
+              call stdlib_claswp( n, a, lda, 1, m-1, iwork(iwoff+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_cgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in xgeqpx from toms # 782). good results will be obtained using
+           ! xgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_cgeqp3 improves overall performance of stdlib_cgejsv.
+           ! a * p1 = q1 * [ r1^* 0]^*:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_cgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lwork-n,rwork, ierr )
+                     
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_cgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=sp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=sp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_clacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_csscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    if ( lsvec )then
+                        call stdlib_cpocon( 'U', n, v, ldv, one, temp1,cwork(n+1), rwork, ierr )
+                                  
+                    else
+                        call stdlib_cpocon( 'U', n, v, ldv, one, temp1,cwork, rwork, ierr )
+                                  
+                    end if
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_clacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_csscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_cpocon( 'U', n, u, ldu, one, temp1,cwork(n+1), rwork, ierr )
+                              
+                 else
+                    call stdlib_clacpy( 'U', n, n, a, lda, cwork, n )
+      ! []            call stdlib_clacpy( 'u', n, n, a, lda, cwork(n+1), n )
+                    ! change: here index shifted by n to the left, cwork(1:n)
+                    ! not needed for sigma only computation
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+      ! []               call stdlib_csscal( p, one/temp1, cwork(n+(p-1)*n+1), 1 )
+                       call stdlib_csscal( p, one/temp1, cwork((p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+      ! []               call stdlib_cpocon( 'u', n, cwork(n+1), n, one, temp1,
+      ! []     $              cwork(n+n*n+1), rwork, ierr )
+                    call stdlib_cpocon( 'U', n, cwork, n, one, temp1,cwork(n*n+1), rwork, ierr )
+                              
+                 end if
+                 if ( temp1 /= zero ) then
+                    sconda = one / sqrt(temp1)
+                 else
+                    sconda = - one
+                 end if
+                 ! sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_ccopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 call stdlib_clacgv( n-p+1, a(p,p), 1 )
+              end do
+              if ( nr == n ) a(n,n) = conjg(a(n,n))
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=sp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=sp)
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                     a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'U', nr-1,nr-1, czero,czero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_cgeqrf( n,nr, a,lda, cwork, cwork(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_ccopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                    call stdlib_clacgv( nr-p+1, a(p,p), 1 )
+                 end do
+              end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=sp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=sp)
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                   a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'U', nr-1, nr-1, czero, czero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_cgesvj( 'L', 'N', 'N', nr, nr, a, lda, sva,n, v, ldv, cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+           else if ( ( rsvec .and. ( .not. lsvec ) .and. ( .not. jracc ) ).or.( jracc .and. ( &
+                     .not. lsvec ) .and. ( nr /= n ) ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_ccopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_clacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_claset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_cgesvj( 'L','U','N', n, nr, v, ldv, sva, nr, a, lda,cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_claset( 'L', nr-1,nr-1, czero, czero, a(2,1), lda )
+                 call stdlib_cgelqf( nr,n, a, lda, cwork, cwork(n+1), lwork-n, ierr)
+                 call stdlib_clacpy( 'L', nr, nr, a, lda, v, ldv )
+                 call stdlib_claset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_cgeqrf( nr, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_ccopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                    call stdlib_clacgv( nr-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_claset('U', nr-1, nr-1, czero, czero, v(1,2), ldv)
+                 call stdlib_cgesvj( 'L', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, cwork(n+1), &
+                           lwork-n, rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_claset( 'A',n-nr, nr, czero,czero, v(nr+1,1),  ldv )
+                    call stdlib_claset( 'A',nr, n-nr, czero,czero, v(1,nr+1),  ldv )
+                    call stdlib_claset( 'A',n-nr,n-nr,czero,cone, v(nr+1,nr+1),ldv )
+                 end if
+              call stdlib_cunmlq( 'L', 'C', n, n, nr, a, lda, cwork,v, ldv, cwork(n+1), lwork-n, &
+                        ierr )
+              end if
+               ! Permute The Rows Of V
+               ! do 8991 p = 1, n
+                  ! call stdlib_ccopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+                  8991 continue
+               ! call stdlib_clacpy( 'all', n, n, a, lda, v, ldv )
+              call stdlib_clapmr( .false., n, n, v, ldv, iwork )
+               if ( transp ) then
+                 call stdlib_clacpy( 'A', n, n, v, ldv, u, ldu )
+               end if
+           else if ( jracc .and. (.not. lsvec) .and. ( nr== n ) ) then
+              call stdlib_claset( 'L', n-1,n-1, czero, czero, a(2,1), lda )
+              call stdlib_cgesvj( 'U','N','V', n, n, a, lda, sva, n, v, ldv,cwork, lwork, rwork, &
+                        lrwork, info )
+               scalem  = rwork(1)
+               numrank = nint(rwork(2),KIND=ilp)
+               call stdlib_clapmr( .false., n, n, v, ldv, iwork )
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_ccopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+                 call stdlib_clacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_claset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_cgeqrf( n, nr, u, ldu, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              do p = 1, nr - 1
+                 call stdlib_ccopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+                 call stdlib_clacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_claset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_cgesvj( 'L', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, cwork(n+1), lwork-&
+                        n, rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_claset( 'A',  m-nr, nr,czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_claset( 'A',nr, n1-nr, czero, czero, u(1,nr+1),ldu )
+                    call stdlib_claset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+              if ( rowpiv )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_scnrm2( m, u(1,p), 1 )
+                 call stdlib_csscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_clacpy( 'A', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_cgejsv.
+                 do p = 1, nr
+                    call stdlib_ccopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_clacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=sp)
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    ctemp
+           ! $                   v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_clacpy( 'L', nr, nr, v, ldv, cwork(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_scnrm2(nr-p+1,cwork(2*n+(p-1)*nr+p),1)
+                    call stdlib_csscal(nr-p+1,one/temp1,cwork(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_cpocon('L',nr,cwork(2*n+1),nr,one,temp1,cwork(2*n+nr*nr+1),rwork,&
+                           ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=sp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=sp))
+                 cond_ok = sqrt(sqrt(real(nr,KIND=sp)))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^* = q2 * r2
+                    call stdlib_cgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=sp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_clacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                              
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_ccopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                       call stdlib_clacgv(nr-p+1, v(p,p), 1 )
+                    end do
+                    v(nr,nr)=conjg(v(nr,nr))
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_cgeqp3
+                    ! should be replaced with eg. call cgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^* * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_cgeqp3( n, nr, v, ldv, iwork(n+1), cwork(n+1),cwork(2*n+1), lwork-&
+                              2*n, rwork, ierr )
+      ! *               call stdlib_cgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=sp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_clacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=sp)
+                              ! v(p,q) = - temp1*( v(q,p) / abs(v(q,p)) )
+                             v(p,q) = - ctemp
+                          end do
+                       end do
+                    else
+                       call stdlib_claset( 'L',nr-1,nr-1,czero,czero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_cgelqf( nr, nr, v, ldv, cwork(2*n+n*nr+1),cwork(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_clacpy( 'L',nr,nr,v,ldv,cwork(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_scnrm2( p, cwork(2*n+n*nr+nr+p), nr )
+                       call stdlib_csscal( p, one/temp1, cwork(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_cpocon( 'L',nr,cwork(2*n+n*nr+nr+1),nr,one,temp1,cwork(2*n+n*nr+&
+                              nr+nr*nr+1),rwork,ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_clacpy( 'U', nr, nr, v, ldv, cwork(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       ctemp = xsc * v(q,q)
+                       do p = 1, q - 1
+                           ! v(p,q) = - temp1*( v(p,q) / abs(v(p,q)) )
+                          v(p,q) = - ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_cgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u, ldu,cwork(2*n+n*nr+nr+1)&
+                              ,lwork-2*n-n*nr-nr,rwork,lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_ccopy(  nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_csscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_cgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_ctrsm('L','U','N','N', nr,nr,cone, a,lda, v,ldv)
+                    else
+                       ! .. r1 is well conditioned, but non-square. adjoint of r2
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_cgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_ctrsm('L','U','C','N',nr,nr,cone,cwork(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                       call stdlib_claset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_claset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_claset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_cunmqr('L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(&
+                                 2*n+n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^* * v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_cgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_ccopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_csscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_ctrsm('L','U','N','N',nr,nr,cone,cwork(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_claset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_claset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_claset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_cunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_cgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_cgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_cgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_claset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_claset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_claset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_cunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_cunmlq( 'L', 'C', nr, nr, nr, cwork(2*n+1), n,cwork(2*n+n*nr+1), &
+                              u, ldu, cwork(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=sp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_scnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_csscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_claset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                    if ( nr < n1 ) then
+                       call stdlib_claset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                       call stdlib_claset('A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=sp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_scnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_csscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_clacpy( 'U', n, n, a, lda, cwork(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       ctemp = xsc * cwork( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                           ! cwork(n+(q-1)*n+p)=-temp1 * ( cwork(n+(p-1)*n+q) /
+           ! $                                        abs(cwork(n+(p-1)*n+q)) )
+                          cwork(n+(q-1)*n+p)=-ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_claset( 'L',n-1,n-1,czero,czero,cwork(n+2),n )
+                 end if
+                 call stdlib_cgesvj( 'U', 'U', 'N', n, n, cwork(n+1), n, sva,n, u, ldu, cwork(n+&
+                           n*n+1), lwork-n-n*n, rwork, lrwork,info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_ccopy( n, cwork(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_csscal( n, sva(p), cwork(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_ctrsm( 'L', 'U', 'N', 'N', n, n,cone, a, lda, cwork(n+1), n )
+                 do p = 1, n
+                    call stdlib_ccopy( n, cwork(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=sp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_scnrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_csscal( n, xsc,&
+                               v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_claset( 'A',  m-n, n, czero, czero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_claset('A',n,  n1-n, czero, czero,  u(1,n+1),ldu)
+                       call stdlib_claset( 'A',m-n,n1-n, czero, cone,u(n+1,n+1),ldu)
+                    end if
+                 end if
+                 call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 temp1 = sqrt(real(m,KIND=sp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_scnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_csscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values, e.g. when the singular values spread from
+              ! the underflow to the overflow threshold.
+              do p = 1, nr
+                 call stdlib_ccopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 call stdlib_clacgv( n-p+1, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=sp)
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                 ctemp
+           ! $                v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_claset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+              end if
+              call stdlib_cgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              call stdlib_clacpy( 'L', n, nr, v, ldv, cwork(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_ccopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+                 call stdlib_clacgv( nr-p+1, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       ctemp = cmplx(xsc * min(abs(u(p,p)),abs(u(q,q))),zero,KIND=sp)
+                        ! u(p,q) = - temp1 * ( u(q,p) / abs(u(q,p)) )
+                       u(p,q) = - ctemp
+                    end do
+                 end do
+              else
+                 call stdlib_claset('U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              end if
+              call stdlib_cgesvj( 'L', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, cwork(2*n+n*nr+1),&
+                         lwork-2*n-n*nr,rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_claset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                 call stdlib_claset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                 call stdlib_claset( 'A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_cunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+n*nr+&
+                        nr+1),lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=sp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_scnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_csscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_claset( 'A',  m-nr, nr, czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_claset('A',nr,  n1-nr, czero, czero,  u(1,nr+1),ldu)
+                    call stdlib_claset('A',m-nr,n1-nr, czero, cone,u(nr+1,nr+1),ldu)
+                 end if
+              end if
+              call stdlib_cunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+                 if ( rowpiv )call stdlib_claswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^*
+                 do p = 1, n
+                    call stdlib_cswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_slascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           rwork(1) = uscal2 * scalem
+           rwork(2) = uscal1
+           if ( errest ) rwork(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              rwork(4) = condr1
+              rwork(5) = condr2
+           end if
+           if ( l2tran ) then
+              rwork(6) = entra
+              rwork(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           if ( transp ) then
+               iwork(4) =  1
+           else
+               iwork(4) = -1
+           end if
+           return
+     end subroutine stdlib_cgejsv
+
+     !> CGESVJ: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     pure subroutine stdlib_cgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, cwork, lwork, &
+               rwork, lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, lrwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), v(ldv,*), cwork(lwork)
+           real(sp), intent(inout) :: rwork(lrwork)
+           real(sp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           
+           ! Local Scalars 
+           complex(sp) :: aapq, ompq
+           real(sp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, temp1, &
+                     theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lquery, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,conjg,real,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' ) .or. stdlib_lsame( jobv, 'J' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( rwork( 1 )<=one ) ) then
+              info = -12
+           else if( lwork<( m+n ) .and. ( .not.lquery ) ) then
+              info = -13
+           else if( lrwork<max( n, 6 ) .and. ( .not.lquery ) ) then
+              info = -15
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGESVJ', -info )
+              return
+           else if ( lquery ) then
+              cwork(1) = m + n
+              rwork(1) = max( n, 6 )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^* * a(:,j)| / (||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = rwork( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=sp) )
+              else
+                 ctol = real( m,KIND=sp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_slamch() works properly on the target machine.)
+           epsln = stdlib_slamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_slamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+            ! big = stdlib_slamch( 'overflow' )
+           big     = one  / sfmin
+           rootbig = one / rootsfmin
+           ! large = big / sqrt( real( m*n,KIND=sp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=sp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'CGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_claset( 'A', mvl, n, czero, cone, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl = one / sqrt( real( m,KIND=sp)*real( n,KIND=sp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_classq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'CGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_classq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'CGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_classq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'CGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl = one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_claset( 'G', m, n, czero, cone, a, lda )
+              rwork( 1 ) = one
+              rwork( 2 ) = zero
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_clascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              rwork( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 rwork( 2 ) = one
+              else
+                 rwork( 2 ) = zero
+              end if
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=sp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( aapp*sqrt( real( n,KIND=sp) ) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=sp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_slascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl = temp1*skl
+           if( skl/=one ) then
+              call stdlib_clascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl = one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           do q = 1, n
+              cwork( q ) = cone
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_cgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_cgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_cgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,cwork( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_cgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,cwork( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_cgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,cwork( n2+1 ), &
+                 sva( n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_cgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,cwork( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_cgsvj0( jobv, m, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_cgsvj1( jobv, m, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_cgsvj0( jobv, n4, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_cgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, cwork( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, cwork( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_cgsvj1( jobv, n2, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+                 call stdlib_cgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,cwork( n2+1 ), sva( n2+1 )&
+                 , mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, ierr )
+                           
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_cswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_cswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = cwork(p)
+                          cwork(p) = cwork(q)
+                          cwork(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+      ! [!]     caveat:
+              ! unfortunately, some blas implementations compute stdlib_scnrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_cdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_scnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_scnrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_scnrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_scnrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_classq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_ccopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_cdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_ccopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_cdotc( m, a(1, p ), 1,cwork(n+1), 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                    ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_ccopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp, one, m,1, cwork(n+1), &
+                                                lda,ierr )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_caxpy( m, -aapq, cwork(n+1), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_clascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_scnrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_classq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_scnrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_classq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+                                   ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+                                ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_ccopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_cdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_ccopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_cdotc( m, a( 1, p ), 1,cwork(n+1),  1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_ccopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                                   
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -aapq, cwork(n+1),1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_clascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_ccopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -conjg(aapq),cwork(n+1), 1, a( 1, &
+                                                   p ), 1 )
+                                         call stdlib_clascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_scnrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_classq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_scnrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_classq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_scnrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_classq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=sp) )*tol ) .and. ( real( n,&
+                        KIND=sp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 call stdlib_cswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_cswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n4
+                 ! call stdlib_csscal( m, one / sva( p ), a( 1, p ), 1 )
+                 call stdlib_clascl( 'G',0,0, sva(p), one, m, 1, a(1,p), m, ierr )
+              end do
+           end if
+           ! scale the product of jacobi rotations.
+           if( rsvec ) then
+                 do p = 1, n
+                    temp1 = one / stdlib_scnrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_csscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl ) ) ).or. ( ( skl<one ) .and. ( sva( &
+                     max( n2, 1 ) ) >( sfmin / skl ) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl = one
+           end if
+           rwork( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           rwork( 2 ) = real( n4,KIND=sp)
+           ! n4 is the number of computed nonzero singular values of a.
+           rwork( 3 ) = real( n2,KIND=sp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           rwork( 4 ) = real( i,KIND=sp)
+           ! i is the index of the last sweep before declaring convergence.
+           rwork( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           rwork( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_cgesvj
+
+     !> CGGES3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> CGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_cgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr,work, lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_c) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_cgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1,  n + int( work( 1 ),KIND=ilp) )
+              call stdlib_cunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_cungqr( n, n, n, vsl, ldvsl, work, work, -1,ierr )
+                 lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              end if
+              call stdlib_cgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              call stdlib_claqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alpha, beta, vsl, &
+                        ldvsl, vsr, ldvsr, work, -1,rwork, 0, ierr )
+              lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_ctgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+                           ldvsl, vsr, ldvsr, sdim,pvsl, pvsr, dif, work, -1, idum, 1, ierr )
+                 lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_cggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_claset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_cgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk, ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_claqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_ctgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_cggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_cggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_clascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_clascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cgges3
+
+     !> CGGEV3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_cggev3( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(out) :: rwork(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(sp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,max,real,sqrt
+           ! Statement Functions 
+           real(sp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=sp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -15
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_cgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( n,  n+int( work( 1 ),KIND=ilp) )
+              call stdlib_cunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_cungqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              if( ilv ) then
+                 call stdlib_cgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_claqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              else
+                 call stdlib_cgghd3( 'N', 'N', n, 1, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, work, -&
+                           1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_claqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'E' )*stdlib_slamch( 'B' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_clange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_clange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_cggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_cgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_cunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_claset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_clacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_cungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_claset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_cgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk,ierr )
+           else
+              call stdlib_cgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_claqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_ctgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_cggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_cggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_clascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_clascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_cggev3
+
+     !> CGSVJ0: is called from CGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as CGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_cgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(sp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(sp), intent(out) :: work(lwork)
+           real(sp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(sp) :: aapq, ompq
+           real(sp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,conjg,real,min,sign,sqrt
+           ! from lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! .. row-cyclic jacobi svd algorithm with column pivoting ..
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           swband = 0
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_cgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_cgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_cswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_cswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = d(p)
+                          d(p) = d(q)
+                          d(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute sncrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_cdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_scnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_scnrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_scnrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_scnrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_classq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_ccopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp, one, m,1, work, lda,&
+                                                ierr )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_caxpy( m, -aapq, work, 1,a( 1, q ), 1 )
+                                      call stdlib_clascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_scnrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_classq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_scnrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_classq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_ccopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_clascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_ccopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_clascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_scnrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_classq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_scnrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_classq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_scnrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_classq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=sp) )*tol ) .and. ( real( n,&
+                        KIND=sp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_cswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_cswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_cgsvj0
+
+     !> CGSVJ1: is called from CGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as CGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> CGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_cgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(sp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(sp), intent(out) :: work(lwork)
+           real(sp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(sp) :: aapq, ompq
+           real(sp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,conjg,real,min,sign,sqrt
+           ! From Lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           ! large = big / sqrt( real( m*n,KIND=sp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_cgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_cgejsv.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                  ! do 2010 jbc = ibr + 1, nbl
+                 loop_2010: do jbc = 1, nblc
+                    jgl = ( jbc-1 )*kbl + n1 + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_cdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_ccopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_clascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_cdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_crot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_crot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_ccopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_clascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_ccopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_clascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_clascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_caxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_clascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_scnrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_classq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_scnrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_classq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_scnrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_classq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=sp) )*tol ) .and. ( real( n,&
+                        KIND=sp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_cswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_cswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_cgsvj1
+
+     !> CHESV_AA: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**H * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is Hermitian and tridiagonal. The factored form
+     !> of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_chesv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_hetrf, lwkopt_hetrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 2*n, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_chetrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_hetrf = int( work(1),KIND=ilp)
+              call stdlib_chetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_hetrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_hetrf, lwkopt_hetrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHESV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**h*t*u or a = l*t*l**h.
+           call stdlib_chetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_chetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chesv_aa
+
+     !> CHETRF_AA: computes the factorization of a complex hermitian matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**H*T*U  or  A = L*T*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a hermitian tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_chetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'CHETRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<( 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CHETRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=sp)
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**h*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_ccopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_clahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_clahef_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_cswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j, j+1 ) )
+                    a( j, j+1 ) = cone
+                    call stdlib_ccopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_cscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_cgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',1, mj, jb+1,-cone,&
+                           a( j1-k2, j3 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j3, j3 ), lda )
+                                     
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_cgemm
+                       call stdlib_cgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-&
+                       cone, a( j1-k2, j2 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j2, j3 ), lda &
+                                 )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_ccopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_ccopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_clahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_clahef_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_cswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j+1, j ) )
+                    a( j+1, j ) = cone
+                    call stdlib_ccopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_cscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_cgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',mj, 1, jb+1,-&
+                          cone, work( (j3-j1+1)+k1*n ), n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), &
+                                    lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block column with stdlib_cgemm
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j3+1, nj, jb+1,-&
+                       cone, work( (j3-j1+1)+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda &
+                                 )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_ccopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_chetrf_aa
+
+     !> CHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*T*(QZ)**H.
+
+     pure subroutine stdlib_chseqr( job, compz, n, ilo, ihi, h, ldh, w, z, ldz,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           real(sp), parameter :: rzero = 0.0_sp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_clahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_clahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           
+           ! Local Arrays 
+           complex(sp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min,real
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = cmplx( real( max( 1, n ),KIND=sp), rzero,KIND=sp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -10
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'CHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_claqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi, z,ldz, work, &
+                        lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( work( 1 ),KIND=sp), real( max( 1,n ),KIND=sp) ), &
+                        rzero,KIND=sp)
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_cgebal ====
+              if( ilo>1 )call stdlib_ccopy( ilo-1, h, ldh+1, w, 1 )
+              if( ihi<n )call stdlib_ccopy( n-ihi, h( ihi+1, ihi+1 ), ldh+1, w( ihi+1 ), 1 )
+                        
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_claset( 'A', n, n, czero, cone, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 w( ilo ) = h( ilo, ilo )
+                 return
+              end if
+              ! ==== stdlib_clahqr/stdlib_claqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'CHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_claqr0 for big matrices; stdlib_clahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_claqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, work, &
+                           lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_clahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, info )
+                           
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_clahqr failure!  stdlib_claqr0 sometimes succeeds
+                    ! .    when stdlib_clahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_claqr0 directly. ====
+                       call stdlib_claqr0( wantt, wantz, n, ilo, kbot, h, ldh, w,ilo, ihi, z, ldz,&
+                                  work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_claqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_claqr0. ====
+                       call stdlib_clacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = czero
+                       call stdlib_claset( 'A', nl, nl-n, czero, czero, hl( 1, n+1 ),nl )
+                       call stdlib_claqr0( wantt, wantz, nl, ilo, kbot, hl, nl, w,ilo, ihi, z, &
+                                 ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_clacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_claset( 'L', n-2, n-2, czero, &
+                        czero, h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( max( 1, n ),KIND=sp),real( work( 1 ),KIND=sp) ), &
+                        rzero,KIND=sp)
+           end if
+     end subroutine stdlib_chseqr
+
+     !> CLAHEF_AA: factorizes a panel of a complex hermitian matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_clahef_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(sp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_chetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j, j:n) - h(j:n, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:n, j) has been initialized to be a(j, j:n)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_clacgv( j-k1, a( 1, j ), 1 )
+                 call stdlib_cgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+                 call stdlib_clacgv( j-k1, a( 1, j ), 1 )
+              end if
+              ! copy h(i:n, i) into work
+              call stdlib_ccopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:n) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:n) stores u(j-1, j:n)
+                 alpha = -conjg( a( k-1, j ) )
+                 call stdlib_caxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = real( work( 1 ),KIND=sp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l(j, (j+1):n)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):n) stores u(j, (j+1):n)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_caxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_icamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:n) with a(i1+1:n, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_cswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    call stdlib_clacgv( i2-i1, a( j1+i1-1, i1+1 ), lda )
+                    call stdlib_clacgv( i2-i1-1, a( j1+i1, i2 ), 1 )
+                    ! swap a(i1, i2+1:n) with a(i2, i2+1:n)
+                    if( i2<m )call stdlib_cswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_cswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_cswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j:n, j),
+                    call stdlib_ccopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_ccopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_cscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_claset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_chetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j:n, j) - h(j:n, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:n, j) has been initialized to be a(j:n, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_clacgv( j-k1, a( j, 1 ), lda )
+                 call stdlib_cgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+                 call stdlib_clacgv( j-k1, a( j, 1 ), lda )
+              end if
+              ! copy h(j:n, j) into work
+              call stdlib_ccopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:n, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -conjg( a( j, k-1 ) )
+                 call stdlib_caxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = real( work( 1 ),KIND=sp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l((j+1):n, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):n, j-1) = l((j+1):n, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_caxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_icamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:n, i1) with a(i2, i1+1:n)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_cswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    call stdlib_clacgv( i2-i1, a( i1+1, j1+i1-1 ), 1 )
+                    call stdlib_clacgv( i2-i1-1, a( i2, j1+i1 ), lda )
+                    ! swap a(i2+1:n, i1) with a(i2+1:n, i2)
+                    if( i2<m )call stdlib_cswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_cswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_cswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j+1:n, j),
+                    call stdlib_ccopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_ccopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_cscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_claset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_clahef_aa
+
+     !> CLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_claqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(sp), parameter :: wilk1 = 0.75_sp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_clahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(sp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(sp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(sp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,int,max,min,mod,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_clahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_clahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'CLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'CLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_claqr3 ====
+              call stdlib_claqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_claqr5, stdlib_claqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+                 return
+              end if
+              ! ==== stdlib_clahqr/stdlib_claqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'CLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'CLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'CLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_claqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_claqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_claqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_claqr4 or
+                       ! .    stdlib_clahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_clacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_claqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, work, lwork, inf )
+                          else
+                             call stdlib_clahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_claqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+     end subroutine stdlib_claqr0
+
+     !> Aggressive early deflation:
+     !> CLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_claqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0_sp
+           real(sp), parameter :: rone = 1.0_sp
+           
+           
+           ! Local Scalars 
+           complex(sp) :: beta, cdum, s, tau
+           real(sp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwk3, lwkopt, nmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,conjg,int,max,min,real
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_cgehrd ====
+              call stdlib_cgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_cunmhr ====
+              call stdlib_cunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_claqr4 ====
+              call stdlib_claqr4( .true., .true., jw, 1, jw, t, ldt, sh, 1, jw, v,ldv, work, -1, &
+                        infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_clacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_ccopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_claset( 'A', jw, jw, czero, cone, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'CLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_claqr4( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        work, lwork, infqr )
+           else
+              call stdlib_clahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        infqr )
+           end if
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_ctrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_ctrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_ctrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_ccopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_clarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_claset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_clarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_clarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_clarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_cgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_clacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_ccopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_cunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_cgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_clacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_cgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_clacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_cgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_clacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+     end subroutine stdlib_claqr3
+
+     !> CLAQR4: implements one level of recursion for CLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by CLAQR0 and, for large enough
+     !> deflation window size, it may be called by CLAQR3.  This
+     !> subroutine is identical to CLAQR0 except that it calls CLAQR2
+     !> instead of CLAQR3.
+     !> CLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_claqr4( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(sp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(sp), parameter :: wilk1 = 0.75_sp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_clahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(sp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(sp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(sp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,aimag,cmplx,int,max,min,mod,real,sqrt
+           ! Statement Functions 
+           real(sp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=sp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_clahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_clahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'CLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'CLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_claqr2 ====
+              call stdlib_claqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_claqr5, stdlib_claqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+                 return
+              end if
+              ! ==== stdlib_clahqr/stdlib_claqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'CLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'CLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'CLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_claqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_claqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_claqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_clahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_clacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_clahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( ks )&
+                                    , 1, 1, zdum,1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_claqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=sp)
+     end subroutine stdlib_claqr4
+
+     !> CLAQZ0: computes the eigenvalues of a matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by CGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices, P and S are an upper triangular
+     !> matrices.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from CGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the unitary factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_claqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alpha, &
+               beta, q, ldq, z,ldz, work, lwork, rwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           complex(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), &
+                     alpha( * ), beta( * ), work( * )
+           real(sp), intent( out ) :: rwork( * )
+           
+           
+           ! local scalars
+           real(sp) :: smlnum, ulp, safmin, safmax, c1, tempr
+           complex(sp) :: eshift, s1, temp
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_deflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=sp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'CLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'CLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'CLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'CLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'CLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=sp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_chgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q,&
+                         ldq, z, ldz, work, lwork, rwork,info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_claqz2
+           nw = max( nwr, nmin )
+           call stdlib_claqz2( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_deflated, alpha,beta, work, nw, work, nw, work, -1, rwork, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_claqz3
+           call stdlib_claqz3( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alpha,beta, a, lda, b, &
+                     ldb, q, ldq, z, ldz, work, nbr,work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=sp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_claset( 'FULL', n, n, czero, cone, q,ldq )
+           if( iwantz==3 ) call stdlib_claset( 'FULL', n, n, czero, cone, z,ldz )
+           ! get machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 30*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+abs( &
+                        a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = czero
+                 istop = istop-1
+                 ld = 0
+                 eshift = czero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart ) )+&
+                        abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = czero
+                 istart = istart+1
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = czero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 tempr = zero
+                 if( k < istop ) then
+                    tempr = tempr+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    tempr = tempr+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*tempr ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_clartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = czero
+                       call stdlib_crot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_crot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_crot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_clartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = czero
+                          call stdlib_crot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_crot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_crot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, conjg( s1 ) )
+                                       
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_clartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = czero
+                       call stdlib_crot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_crot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_crot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, conjg(&
+                                     s1 ) )
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = czero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using stdlib_chgeqz because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_claqz2( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_deflated,alpha, beta, work, nw, work( nw**2+1 ), nw,work( &
+                         2*nw**2+1 ), lwork-2*nw**2, rwork, rec,aed_info )
+              if ( n_deflated > 0 ) then
+                 istop = istop-n_deflated
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( 100*n_deflated > nibble*( n_deflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_deflated-n_undeflated+1
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=sp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+cone/( safmin*real( maxit,KIND=sp) )
+                 end if
+                 alpha( shiftpos ) = cone
+                 beta( shiftpos ) = eshift
+                 ns = 1
+              end if
+              ! time for a qz sweep
+              call stdlib_claqz3( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alpha( &
+              shiftpos ), beta( shiftpos ), a, lda, b,ldb, q, ldq, z, ldz, work, nblock, work( &
+                        nblock**2+1 ), nblock, work( 2*nblock**2+1 ),lwork-2*nblock**2, sweep_info )
+           end do
+           ! call stdlib_chgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_chgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_chgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q, &
+                  ldq, z, ldz, work, lwork, rwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_claqz0
+
+     !> CLAQZ2: performs AED
+
+     recursive subroutine stdlib_claqz2( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alpha, beta, qc, ldqc, zc, ldzc,work, lwork, rwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           complex(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), &
+                     alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           complex(sp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           complex(sp), intent(out) :: work(*)
+           real(sp), intent(out) :: rwork(*)
+           
+           
+           ! local scalars
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, ctgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(sp) :: smlnum, ulp, safmin, safmax, c1, tempr
+           complex(sp) :: s, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = czero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_claqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alpha, beta, qc, ldqc, zc,ldzc, work, -1, rwork, rec+1, qz_small_info )
+           lworkreq = int( work( 1 ),KIND=ilp)+2*jw**2
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CLAQZ2', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alpha( kwtop ) = a( kwtop, kwtop )
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = czero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_clacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_clacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_claset( 'FULL', jw, jw, czero, cone, qc, ldqc )
+           call stdlib_claset( 'FULL', jw, jw, czero, cone, zc, ldzc )
+           call stdlib_claqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alpha, beta, qc, ldqc, zc,ldzc, work( 2*jw**2+1 ), lwork-2*jw**2, rwork,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_clacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_clacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == czero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                    ! try to deflate eigenvalue
+                    tempr = abs( a( kwbot, kwbot ) )
+                    if( tempr == zero ) then
+                       tempr = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*tempr, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_ctgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                                 kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, ctgexc_info )
+                       k2 = k2+1
+                    end if
+                    k = k+1
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              k = k+1
+           end do
+           if ( kwtop /= ilo .and. s /= czero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 ) *conjg( qc( 1,1:jw-nd ) )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_clartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = czero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_crot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_crot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_crot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, conjg( &
+                           s1 ) )
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 ! move bulge down and remove it
+                 do k2 = k, kwbot-1
+                    call stdlib_claqz1( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b, &
+                              ldb, jw, kwtop, qc, ldqc,jw, kwtop, zc, ldzc )
+                 end do
+                 k = k-1
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_cgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, czero, work, jw )
+              call stdlib_clacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_cgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, czero, work, jw )
+              call stdlib_clacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_cgemm( 'N', 'N', n, jw, jw, cone, q( 1, kwtop ), ldq, qc,ldqc, czero, &
+                        work, n )
+              call stdlib_clacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_cgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, a( istartm,kwtop ), lda, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_clacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop )&
+                       , lda )
+              call stdlib_cgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_clacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop )&
+                       , ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_cgemm( 'N', 'N', n, jw, jw, cone, z( 1, kwtop ), ldz, zc,ldzc, czero, &
+                        work, n )
+              call stdlib_clacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_claqz2
+
+     !> DLATRF_AA factorizes a panel of a complex symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_clasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(sp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_csytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_cgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_ccopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_caxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_caxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_icamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_cswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_cswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_cswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_cswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_ccopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_ccopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_cscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_claset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_csytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_cgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_ccopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_caxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_caxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_icamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_cswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_cswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_cswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_cswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_ccopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_ccopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_cscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_claset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_clasyf_aa
+
+     !> CSYSV computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_csysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_csytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_csytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYSV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_csytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_csytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csysv_aa
+
+     !> CSYTRF_AA: computes the factorization of a complex symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a complex symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_csytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(sp), intent(inout) :: a(lda,*)
+           complex(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'CSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'CSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_ccopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_clasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_clasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_cswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = cone
+                    call stdlib_ccopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_cscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_cgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_cgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j1-k2, j3 ), 1,cone, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_cgemm
+                       call stdlib_cgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-cone, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,cone, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_ccopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_ccopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_clasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_clasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_cswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = cone
+                    call stdlib_ccopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_cscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_cgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_cgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_cgemm
+                       call stdlib_cgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-cone, &
+                       work( j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda )
+                                 
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_ccopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_csytrf_aa
+
+
+
+end module stdlib_linalg_lapack_c
diff --git a/src/stdlib_linalg_lapack_d.fypp b/src/stdlib_linalg_lapack_d.fypp
new file mode 100644
index 000000000..e1eed9a92
--- /dev/null
+++ b/src/stdlib_linalg_lapack_d.fypp
@@ -0,0 +1,85276 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack_d
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     use stdlib_linalg_lapack_c
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_dbbcsd
+     public :: stdlib_dbdsdc
+     public :: stdlib_dbdsqr
+     public :: stdlib_ddisna
+     public :: stdlib_dgbbrd
+     public :: stdlib_dgbcon
+     public :: stdlib_dgbequ
+     public :: stdlib_dgbequb
+     public :: stdlib_dgbrfs
+     public :: stdlib_dgbsv
+     public :: stdlib_dgbsvx
+     public :: stdlib_dgbtf2
+     public :: stdlib_dgbtrf
+     public :: stdlib_dgbtrs
+     public :: stdlib_dgebak
+     public :: stdlib_dgebal
+     public :: stdlib_dgebd2
+     public :: stdlib_dgebrd
+     public :: stdlib_dgecon
+     public :: stdlib_dgeequ
+     public :: stdlib_dgeequb
+     public :: stdlib_dgees
+     public :: stdlib_dgeesx
+     public :: stdlib_dgeev
+     public :: stdlib_dgeevx
+     public :: stdlib_dgehd2
+     public :: stdlib_dgehrd
+     public :: stdlib_dgejsv
+     public :: stdlib_dgelq
+     public :: stdlib_dgelq2
+     public :: stdlib_dgelqf
+     public :: stdlib_dgelqt
+     public :: stdlib_dgelqt3
+     public :: stdlib_dgels
+     public :: stdlib_dgelsd
+     public :: stdlib_dgelss
+     public :: stdlib_dgelsy
+     public :: stdlib_dgemlq
+     public :: stdlib_dgemlqt
+     public :: stdlib_dgemqr
+     public :: stdlib_dgemqrt
+     public :: stdlib_dgeql2
+     public :: stdlib_dgeqlf
+     public :: stdlib_dgeqp3
+     public :: stdlib_dgeqr
+     public :: stdlib_dgeqr2
+     public :: stdlib_dgeqr2p
+     public :: stdlib_dgeqrf
+     public :: stdlib_dgeqrfp
+     public :: stdlib_dgeqrt
+     public :: stdlib_dgeqrt2
+     public :: stdlib_dgeqrt3
+     public :: stdlib_dgerfs
+     public :: stdlib_dgerq2
+     public :: stdlib_dgerqf
+     public :: stdlib_dgesc2
+     public :: stdlib_dgesdd
+     public :: stdlib_dgesv
+     public :: stdlib_dgesvd
+     public :: stdlib_dgesvdq
+     public :: stdlib_dgesvj
+     public :: stdlib_dgesvx
+     public :: stdlib_dgetc2
+     public :: stdlib_dgetf2
+     public :: stdlib_dgetrf
+     public :: stdlib_dgetrf2
+     public :: stdlib_dgetri
+     public :: stdlib_dgetrs
+     public :: stdlib_dgetsls
+     public :: stdlib_dgetsqrhrt
+     public :: stdlib_dggbak
+     public :: stdlib_dggbal
+     public :: stdlib_dgges
+     public :: stdlib_dgges3
+     public :: stdlib_dggesx
+     public :: stdlib_dggev
+     public :: stdlib_dggev3
+     public :: stdlib_dggevx
+     public :: stdlib_dggglm
+     public :: stdlib_dgghd3
+     public :: stdlib_dgghrd
+     public :: stdlib_dgglse
+     public :: stdlib_dggqrf
+     public :: stdlib_dggrqf
+     public :: stdlib_dgsvj0
+     public :: stdlib_dgsvj1
+     public :: stdlib_dgtcon
+     public :: stdlib_dgtrfs
+     public :: stdlib_dgtsv
+     public :: stdlib_dgtsvx
+     public :: stdlib_dgttrf
+     public :: stdlib_dgttrs
+     public :: stdlib_dgtts2
+     public :: stdlib_dhgeqz
+     public :: stdlib_dhsein
+     public :: stdlib_dhseqr
+     public :: stdlib_disnan
+     public :: stdlib_dla_gbamv
+     public :: stdlib_dla_gbrcond
+     public :: stdlib_dla_gbrpvgrw
+     public :: stdlib_dla_geamv
+     public :: stdlib_dla_gercond
+     public :: stdlib_dla_gerpvgrw
+     public :: stdlib_dla_lin_berr
+     public :: stdlib_dla_porcond
+     public :: stdlib_dla_porpvgrw
+     public :: stdlib_dla_syamv
+     public :: stdlib_dla_syrcond
+     public :: stdlib_dla_syrpvgrw
+     public :: stdlib_dla_wwaddw
+     public :: stdlib_dlabad
+     public :: stdlib_dlabrd
+     public :: stdlib_dlacn2
+     public :: stdlib_dlacon
+     public :: stdlib_dlacpy
+     public :: stdlib_dladiv
+     public :: stdlib_dladiv1
+     public :: stdlib_dladiv2
+     public :: stdlib_dlae2
+     public :: stdlib_dlaebz
+     public :: stdlib_dlaed0
+     public :: stdlib_dlaed1
+     public :: stdlib_dlaed2
+     public :: stdlib_dlaed3
+     public :: stdlib_dlaed4
+     public :: stdlib_dlaed5
+     public :: stdlib_dlaed6
+     public :: stdlib_dlaed7
+     public :: stdlib_dlaed8
+     public :: stdlib_dlaed9
+     public :: stdlib_dlaeda
+     public :: stdlib_dlaein
+     public :: stdlib_dlaev2
+     public :: stdlib_dlaexc
+     public :: stdlib_dlag2
+     public :: stdlib_dlag2s
+     public :: stdlib_dlags2
+     public :: stdlib_dlagtf
+     public :: stdlib_dlagtm
+     public :: stdlib_dlagts
+     public :: stdlib_dlagv2
+     public :: stdlib_dlahqr
+     public :: stdlib_dlahr2
+     public :: stdlib_dlaic1
+     public :: stdlib_dlaisnan
+     public :: stdlib_dlaln2
+     public :: stdlib_dlals0
+     public :: stdlib_dlalsa
+     public :: stdlib_dlalsd
+     public :: stdlib_dlamch
+     public :: stdlib_dlamc3
+     public :: stdlib_dlamrg
+     public :: stdlib_dlamswlq
+     public :: stdlib_dlamtsqr
+     public :: stdlib_dlaneg
+     public :: stdlib_dlangb
+     public :: stdlib_dlange
+     public :: stdlib_dlangt
+     public :: stdlib_dlanhs
+     public :: stdlib_dlansb
+     public :: stdlib_dlansf
+     public :: stdlib_dlansp
+     public :: stdlib_dlanst
+     public :: stdlib_dlansy
+     public :: stdlib_dlantb
+     public :: stdlib_dlantp
+     public :: stdlib_dlantr
+     public :: stdlib_dlanv2
+     public :: stdlib_dlaorhr_col_getrfnp
+     public :: stdlib_dlaorhr_col_getrfnp2
+     public :: stdlib_dlapll
+     public :: stdlib_dlapmr
+     public :: stdlib_dlapmt
+     public :: stdlib_dlapy2
+     public :: stdlib_dlapy3
+     public :: stdlib_dlaqgb
+     public :: stdlib_dlaqge
+     public :: stdlib_dlaqp2
+     public :: stdlib_dlaqps
+     public :: stdlib_dlaqr0
+     public :: stdlib_dlaqr1
+     public :: stdlib_dlaqr2
+     public :: stdlib_dlaqr3
+     public :: stdlib_dlaqr4
+     public :: stdlib_dlaqr5
+     public :: stdlib_dlaqsb
+     public :: stdlib_dlaqsp
+     public :: stdlib_dlaqsy
+     public :: stdlib_dlaqtr
+     public :: stdlib_dlaqz0
+     public :: stdlib_dlaqz1
+     public :: stdlib_dlaqz2
+     public :: stdlib_dlaqz3
+     public :: stdlib_dlaqz4
+     public :: stdlib_dlar1v
+     public :: stdlib_dlar2v
+     public :: stdlib_dlarf
+     public :: stdlib_dlarfb
+     public :: stdlib_dlarfb_gett
+     public :: stdlib_dlarfg
+     public :: stdlib_dlarfgp
+     public :: stdlib_dlarft
+     public :: stdlib_dlarfx
+     public :: stdlib_dlarfy
+     public :: stdlib_dlargv
+     public :: stdlib_dlarnv
+     public :: stdlib_dlarra
+     public :: stdlib_dlarrb
+     public :: stdlib_dlarrc
+     public :: stdlib_dlarrd
+     public :: stdlib_dlarre
+     public :: stdlib_dlarrf
+     public :: stdlib_dlarrj
+     public :: stdlib_dlarrk
+     public :: stdlib_dlarrr
+     public :: stdlib_dlarrv
+     public :: stdlib_dlartg
+     public :: stdlib_dlartgp
+     public :: stdlib_dlartgs
+     public :: stdlib_dlartv
+     public :: stdlib_dlaruv
+     public :: stdlib_dlarz
+     public :: stdlib_dlarzb
+     public :: stdlib_dlarzt
+     public :: stdlib_dlas2
+     public :: stdlib_dlascl
+     public :: stdlib_dlasd0
+     public :: stdlib_dlasd1
+     public :: stdlib_dlasd2
+     public :: stdlib_dlasd3
+     public :: stdlib_dlasd4
+     public :: stdlib_dlasd5
+     public :: stdlib_dlasd6
+     public :: stdlib_dlasd7
+     public :: stdlib_dlasd8
+     public :: stdlib_dlasda
+     public :: stdlib_dlasdq
+     public :: stdlib_dlasdt
+     public :: stdlib_dlaset
+     public :: stdlib_dlasq1
+     public :: stdlib_dlasq2
+     public :: stdlib_dlasq3
+     public :: stdlib_dlasq4
+     public :: stdlib_dlasq5
+     public :: stdlib_dlasq6
+     public :: stdlib_dlasr
+     public :: stdlib_dlasrt
+     public :: stdlib_dlassq
+     public :: stdlib_dlasv2
+     public :: stdlib_dlaswlq
+     public :: stdlib_dlaswp
+     public :: stdlib_dlasy2
+     public :: stdlib_dlasyf
+     public :: stdlib_dlasyf_aa
+     public :: stdlib_dlasyf_rk
+     public :: stdlib_dlasyf_rook
+     public :: stdlib_dlat2s
+     public :: stdlib_dlatbs
+     public :: stdlib_dlatdf
+     public :: stdlib_dlatps
+     public :: stdlib_dlatrd
+     public :: stdlib_dlatrs
+     public :: stdlib_dlatrz
+     public :: stdlib_dlatsqr
+     public :: stdlib_dlauu2
+     public :: stdlib_dlauum
+     public :: stdlib_dopgtr
+     public :: stdlib_dopmtr
+     public :: stdlib_dorbdb
+     public :: stdlib_dorbdb1
+     public :: stdlib_dorbdb2
+     public :: stdlib_dorbdb3
+     public :: stdlib_dorbdb4
+     public :: stdlib_dorbdb5
+     public :: stdlib_dorbdb6
+     public :: stdlib_dorcsd
+     public :: stdlib_dorcsd2by1
+     public :: stdlib_dorg2l
+     public :: stdlib_dorg2r
+     public :: stdlib_dorgbr
+     public :: stdlib_dorghr
+     public :: stdlib_dorgl2
+     public :: stdlib_dorglq
+     public :: stdlib_dorgql
+     public :: stdlib_dorgqr
+     public :: stdlib_dorgr2
+     public :: stdlib_dorgrq
+     public :: stdlib_dorgtr
+     public :: stdlib_dorgtsqr
+     public :: stdlib_dorgtsqr_row
+     public :: stdlib_dorhr_col
+     public :: stdlib_dorm22
+     public :: stdlib_dorm2l
+     public :: stdlib_dorm2r
+     public :: stdlib_dormbr
+     public :: stdlib_dormhr
+     public :: stdlib_dorml2
+     public :: stdlib_dormlq
+     public :: stdlib_dormql
+     public :: stdlib_dormqr
+     public :: stdlib_dormr2
+     public :: stdlib_dormr3
+     public :: stdlib_dormrq
+     public :: stdlib_dormrz
+     public :: stdlib_dormtr
+     public :: stdlib_dpbcon
+     public :: stdlib_dpbequ
+     public :: stdlib_dpbrfs
+     public :: stdlib_dpbstf
+     public :: stdlib_dpbsv
+     public :: stdlib_dpbsvx
+     public :: stdlib_dpbtf2
+     public :: stdlib_dpbtrf
+     public :: stdlib_dpbtrs
+     public :: stdlib_dpftrf
+     public :: stdlib_dpftri
+     public :: stdlib_dpftrs
+     public :: stdlib_dpocon
+     public :: stdlib_dpoequ
+     public :: stdlib_dpoequb
+     public :: stdlib_dporfs
+     public :: stdlib_dposv
+     public :: stdlib_dposvx
+     public :: stdlib_dpotf2
+     public :: stdlib_dpotrf
+     public :: stdlib_dpotrf2
+     public :: stdlib_dpotri
+     public :: stdlib_dpotrs
+     public :: stdlib_dppcon
+     public :: stdlib_dppequ
+     public :: stdlib_dpprfs
+     public :: stdlib_dppsv
+     public :: stdlib_dppsvx
+     public :: stdlib_dpptrf
+     public :: stdlib_dpptri
+     public :: stdlib_dpptrs
+     public :: stdlib_dpstf2
+     public :: stdlib_dpstrf
+     public :: stdlib_dptcon
+     public :: stdlib_dpteqr
+     public :: stdlib_dptrfs
+     public :: stdlib_dptsv
+     public :: stdlib_dptsvx
+     public :: stdlib_dpttrf
+     public :: stdlib_dpttrs
+     public :: stdlib_dptts2
+     public :: stdlib_drscl
+     public :: stdlib_dsb2st_kernels
+     public :: stdlib_dsbev
+     public :: stdlib_dsbevd
+     public :: stdlib_dsbevx
+     public :: stdlib_dsbgst
+     public :: stdlib_dsbgv
+     public :: stdlib_dsbgvd
+     public :: stdlib_dsbgvx
+     public :: stdlib_dsbtrd
+     public :: stdlib_dsfrk
+     public :: stdlib_dsgesv
+     public :: stdlib_dspcon
+     public :: stdlib_dspev
+     public :: stdlib_dspevd
+     public :: stdlib_dspevx
+     public :: stdlib_dspgst
+     public :: stdlib_dspgv
+     public :: stdlib_dspgvd
+     public :: stdlib_dspgvx
+     public :: stdlib_dsposv
+     public :: stdlib_dsprfs
+     public :: stdlib_dspsv
+     public :: stdlib_dspsvx
+     public :: stdlib_dsptrd
+     public :: stdlib_dsptrf
+     public :: stdlib_dsptri
+     public :: stdlib_dsptrs
+     public :: stdlib_dstebz
+     public :: stdlib_dstedc
+     public :: stdlib_dstegr
+     public :: stdlib_dstein
+     public :: stdlib_dstemr
+     public :: stdlib_dsteqr
+     public :: stdlib_dsterf
+     public :: stdlib_dstev
+     public :: stdlib_dstevd
+     public :: stdlib_dstevr
+     public :: stdlib_dstevx
+     public :: stdlib_dsycon
+     public :: stdlib_dsycon_rook
+     public :: stdlib_dsyconv
+     public :: stdlib_dsyconvf
+     public :: stdlib_dsyconvf_rook
+     public :: stdlib_dsyequb
+     public :: stdlib_dsyev
+     public :: stdlib_dsyevd
+     public :: stdlib_dsyevr
+     public :: stdlib_dsyevx
+     public :: stdlib_dsygs2
+     public :: stdlib_dsygst
+     public :: stdlib_dsygv
+     public :: stdlib_dsygvd
+     public :: stdlib_dsygvx
+     public :: stdlib_dsyrfs
+     public :: stdlib_dsysv
+     public :: stdlib_dsysv_aa
+     public :: stdlib_dsysv_rk
+     public :: stdlib_dsysv_rook
+     public :: stdlib_dsysvx
+     public :: stdlib_dsyswapr
+     public :: stdlib_dsytd2
+     public :: stdlib_dsytf2
+     public :: stdlib_dsytf2_rk
+     public :: stdlib_dsytf2_rook
+     public :: stdlib_dsytrd
+     public :: stdlib_dsytrd_sb2st
+     public :: stdlib_dsytrd_sy2sb
+     public :: stdlib_dsytrf
+     public :: stdlib_dsytrf_aa
+     public :: stdlib_dsytrf_rk
+     public :: stdlib_dsytrf_rook
+     public :: stdlib_dsytri
+     public :: stdlib_dsytri_rook
+     public :: stdlib_dsytrs
+     public :: stdlib_dsytrs2
+     public :: stdlib_dsytrs_3
+     public :: stdlib_dsytrs_aa
+     public :: stdlib_dsytrs_rook
+     public :: stdlib_dtbcon
+     public :: stdlib_dtbrfs
+     public :: stdlib_dtbtrs
+     public :: stdlib_dtfsm
+     public :: stdlib_dtftri
+     public :: stdlib_dtfttp
+     public :: stdlib_dtfttr
+     public :: stdlib_dtgevc
+     public :: stdlib_dtgex2
+     public :: stdlib_dtgexc
+     public :: stdlib_dtgsen
+     public :: stdlib_dtgsja
+     public :: stdlib_dtgsna
+     public :: stdlib_dtgsy2
+     public :: stdlib_dtgsyl
+     public :: stdlib_dtpcon
+     public :: stdlib_dtplqt
+     public :: stdlib_dtplqt2
+     public :: stdlib_dtpmlqt
+     public :: stdlib_dtpmqrt
+     public :: stdlib_dtpqrt
+     public :: stdlib_dtpqrt2
+     public :: stdlib_dtprfb
+     public :: stdlib_dtprfs
+     public :: stdlib_dtptri
+     public :: stdlib_dtptrs
+     public :: stdlib_dtpttf
+     public :: stdlib_dtpttr
+     public :: stdlib_dtrcon
+     public :: stdlib_dtrevc
+     public :: stdlib_dtrevc3
+     public :: stdlib_dtrexc
+     public :: stdlib_dtrrfs
+     public :: stdlib_dtrsen
+     public :: stdlib_dtrsna
+     public :: stdlib_dtrsyl
+     public :: stdlib_dtrti2
+     public :: stdlib_dtrtri
+     public :: stdlib_dtrtrs
+     public :: stdlib_dtrttf
+     public :: stdlib_dtrttp
+     public :: stdlib_dtzrzf
+     public :: stdlib_dzsum1
+#:if WITH_QP
+     public :: stdlib_dlag2q
+#:endif
+
+     ! 64-bit real constants 
+     real(dp),    parameter, private ::     negone = -1.00_dp
+     real(dp),    parameter, private ::       zero = 0.00_dp
+     real(dp),    parameter, private ::       half = 0.50_dp
+     real(dp),    parameter, private ::        one = 1.00_dp
+     real(dp),    parameter, private ::        two = 2.00_dp
+     real(dp),    parameter, private ::      three = 3.00_dp
+     real(dp),    parameter, private ::       four = 4.00_dp
+     real(dp),    parameter, private ::      eight = 8.00_dp
+     real(dp),    parameter, private ::        ten = 10.00_dp
+
+     ! 64-bit complex constants 
+     complex(dp), parameter, private :: czero   = ( 0.0_dp,0.0_dp)
+     complex(dp), parameter, private :: chalf   = ( 0.5_dp,0.0_dp)
+     complex(dp), parameter, private :: cone    = ( 1.0_dp,0.0_dp)
+     complex(dp), parameter, private :: cnegone = (-1.0_dp,0.0_dp)
+
+     ! 64-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(dp),    parameter, private :: rradix = real(radix(zero),dp) 
+     real(dp),    parameter, private :: ulp    = epsilon(zero) 
+     real(dp),    parameter, private :: eps    = ulp*half 
+     real(dp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(dp),    parameter, private :: safmax = one/safmin 
+     real(dp),    parameter, private :: smlnum = safmin/ulp 
+     real(dp),    parameter, private :: bignum = safmax*ulp 
+     real(dp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(dp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 64-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(dp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(dp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(dp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(dp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> DGBTF2: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to zero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = zero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to zero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = zero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_idamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=zero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_dswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_dscal( km, one / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_dger( km, ju-j, -one, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is zero, set info to the index of the pivot
+                 ! unless a zero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_dgbtf2
+
+     !> DGBTRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by DGBTRF.
+
+     pure subroutine stdlib_dgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-one modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_dswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_dger( lm, nrhs, -one, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, 1 )&
+                              , ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else
+              ! solve a**t*x = b.
+              do i = 1, nrhs
+                 ! solve u**t*x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t*x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_dgemv( 'TRANSPOSE', lm, nrhs, -one, b( j+1, 1 ),ldb, ab( kd+1, j )&
+                              , 1, one, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_dswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dgbtrs
+
+     !> DGEBAK: forms the right or left eigenvectors of a real general matrix
+     !> by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by DGEBAL.
+
+     pure subroutine stdlib_dgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: scale(*)
+           real(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(dp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_dscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_dscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_dgebak
+
+     !> DGGBAK: forms the right or left eigenvectors of a real generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> DGGBAL.
+
+     pure subroutine stdlib_dggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: lscale(*), rscale(*)
+           real(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max,int
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_dscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_dscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_40
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_60
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_80
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_100
+                    call stdlib_dswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_dggbak
+
+     !> DGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an n by n tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T*X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_dgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           if( nrhs==1 ) then
+              loop_10: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end do loop_10
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           else
+              loop_40: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end do loop_40
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           end if
+           ! back solve with the matrix u from the factorization.
+           if( nrhs<=2 ) then
+              j = 1
+              70 continue
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do i = n - 2, 1, -1
+                 b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                           
+              end do
+              if( j<nrhs ) then
+                 j = j + 1
+                 go to 70
+              end if
+           else
+              do j = 1, nrhs
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_dgtsv
+
+     !> DGTTRF: computes an LU factorization of a real tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_dgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: d(*), dl(*), du(*)
+           real(dp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( d( i )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_dgttrf
+
+     !> DGTTS2: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by DGTTRF.
+
+     pure subroutine stdlib_dgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ip, j
+           real(dp) :: temp
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    ip = ipiv( i )
+                    temp = b( i+1-ip+i, j ) - dl( i )*b( ip, j )
+                    b( i, j ) = b( ip, j )
+                    b( i+1, j ) = temp
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                    ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 ! solve u**t*x = b.
+                 j = 1
+                 70 continue
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t*x = b.
+                 do i = n - 1, 1, -1
+                    ip = ipiv( i )
+                    temp = b( i, j ) - dl( i )*b( i+1, j )
+                    b( i, j ) = b( ip, j )
+                    b( ip, j ) = temp
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**t*x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_dgtts2
+
+     !> DLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(dp) function stdlib_dla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( abs( ab( kd+i-j, j)), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( abs( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_dla_gbrpvgrw = rpvgrw
+     end function stdlib_dla_gbrpvgrw
+
+     !> DLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(dp) function stdlib_dla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( abs( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( abs( af( i, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_dla_gerpvgrw = rpvgrw
+     end function stdlib_dla_gerpvgrw
+
+     !> DLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_dla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: x(*), y(*)
+           real(dp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_dla_wwaddw
+
+     !> DLABAD: takes as input the values computed by DLAMCH for underflow and
+     !> overflow, and returns the square root of each of these values if the
+     !> log of LARGE is sufficiently large.  This subroutine is intended to
+     !> identify machines with a large exponent range, such as the Crays, and
+     !> redefine the underflow and overflow limits to be the square roots of
+     !> the values computed by DLAMCH.  This subroutine is needed because
+     !> DLAMCH does not compensate for poor arithmetic in the upper half of
+     !> the exponent range, as is found on a Cray.
+
+     pure subroutine stdlib_dlabad( small, large )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(inout) :: large, small
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: log10,sqrt
+           ! Executable Statements 
+           ! if it looks like we're on a cray, take the square root of
+           ! small and large to avoid overflow and underflow problems.
+           if( log10( large )>2000._dp ) then
+              small = sqrt( small )
+              large = sqrt( large )
+           end if
+           return
+     end subroutine stdlib_dlabad
+
+     !> DLACN2: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_dlacn2( n, v, x, isgn, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(dp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           integer(ilp), intent(inout) :: isave(3)
+           real(dp), intent(out) :: v(*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(dp) :: altsgn, estold, temp, xs
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,nint
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=dp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_dasum( n, x, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_idamax( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( isave( 2 ) ) = one
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_dcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_dasum( n, v, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 xs = one
+              else
+                 xs = -one
+              end if
+              if( nint( xs,KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_idamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=dp) / real( n-1,KIND=dp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_dasum( n, x, 1 ) / real( 3*n,KIND=dp) )
+           if( temp>est ) then
+              call stdlib_dcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_dlacn2
+
+     !> DLACON: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_dlacon( n, v, x, isgn, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(dp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           real(dp), intent(out) :: v(*)
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(dp) :: altsgn, estold, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,nint,sign
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=dp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_dasum( n, x, 1 )
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           j = stdlib_idamax( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( j ) = one
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_dcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_dasum( n, v, 1 )
+           do i = 1, n
+              if( nint( sign( one, x( i ) ),KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = j
+           j = stdlib_idamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( j ) ) ) .and. ( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=dp) / real( n-1,KIND=dp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_dasum( n, x, 1 ) / real( 3*n,KIND=dp) )
+           if( temp>est ) then
+              call stdlib_dcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_dlacon
+
+     !> DLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_dlacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_dlacpy
+
+
+     pure real(dp) function stdlib_dladiv2( a, b, c, d, r, t )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: a, b, c, d, r, t
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: br
+           ! Executable Statements 
+           if( r/=zero ) then
+              br = b * r
+              if( br/=zero ) then
+                 stdlib_dladiv2 = (a + br) * t
+              else
+                 stdlib_dladiv2 = a * t + (b * t) * r
+              end if
+           else
+              stdlib_dladiv2 = (a + d * (b / c)) * t
+           end if
+           return
+     end function stdlib_dladiv2
+
+     !> DLAE2:  computes the eigenvalues of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, and RT2
+     !> is the eigenvalue of smaller absolute value.
+
+     pure subroutine stdlib_dlae2( a, b, c, rt1, rt2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: a, b, c
+           real(dp), intent(out) :: rt1, rt2
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: ab, acmn, acmx, adf, df, rt, sm, tb
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+           end if
+           return
+     end subroutine stdlib_dlae2
+
+     !> DLAEBZ: contains the iteration loops which compute and use the
+     !> function N(w), which is the count of eigenvalues of a symmetric
+     !> tridiagonal matrix T less than or equal to its argument  w.  It
+     !> performs a choice of two types of loops:
+     !> IJOB=1, followed by
+     !> IJOB=2: It takes as input a list of intervals and returns a list of
+     !> sufficiently small intervals whose union contains the same
+     !> eigenvalues as the union of the original intervals.
+     !> The input intervals are (AB(j,1),AB(j,2)], j=1,...,MINP.
+     !> The output interval (AB(j,1),AB(j,2)] will contain
+     !> eigenvalues NAB(j,1)+1,...,NAB(j,2), where 1 <= j <= MOUT.
+     !> IJOB=3: It performs a binary search in each input interval
+     !> (AB(j,1),AB(j,2)] for a point  w(j)  such that
+     !> N(w(j))=NVAL(j), and uses  C(j)  as the starting point of
+     !> the search.  If such a w(j) is found, then on output
+     !> AB(j,1)=AB(j,2)=w.  If no such w(j) is found, then on output
+     !> (AB(j,1),AB(j,2)] will be a small interval containing the
+     !> point where N(w) jumps through NVAL(j), unless that point
+     !> lies outside the initial interval.
+     !> Note that the intervals are in all cases half-open intervals,
+     !> i.e., of the form  (a,b] , which includes  b  but not  a .
+     !> To avoid underflow, the matrix should be scaled so that its largest
+     !> element is no greater than  overflow**(1/2) * underflow**(1/4)
+     !> in absolute value.  To assure the most accurate computation
+     !> of small eigenvalues, the matrix should be scaled to be
+     !> not much smaller than that, either.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966
+     !> Note: the arguments are, in general, *not* checked for unreasonable
+     !> values.
+
+     pure subroutine stdlib_dlaebz( ijob, nitmax, n, mmax, minp, nbmin, abstol,reltol, pivmin, d, &
+               e, e2, nval, ab, c, mout,nab, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, minp, mmax, n, nbmin, nitmax
+           integer(ilp), intent(out) :: info, mout
+           real(dp), intent(in) :: abstol, pivmin, reltol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(inout) :: nab(mmax,*), nval(*)
+           real(dp), intent(inout) :: ab(mmax,*), c(*)
+           real(dp), intent(in) :: d(*), e(*), e2(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itmp1, itmp2, j, ji, jit, jp, kf, kfnew, kl, klnew
+           real(dp) :: tmp1, tmp2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! check for errors
+           info = 0
+           if( ijob<1 .or. ijob>3 ) then
+              info = -1
+              return
+           end if
+           ! initialize nab
+           if( ijob==1 ) then
+              ! compute the number of eigenvalues in the initial intervals.
+              mout = 0
+              do ji = 1, minp
+                 do jp = 1, 2
+                    tmp1 = d( 1 ) - ab( ji, jp )
+                    if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                    nab( ji, jp ) = 0
+                    if( tmp1<=zero )nab( ji, jp ) = 1
+                    do j = 2, n
+                       tmp1 = d( j ) - e2( j-1 ) / tmp1 - ab( ji, jp )
+                       if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                       if( tmp1<=zero )nab( ji, jp ) = nab( ji, jp ) + 1
+                    end do
+                 end do
+                 mout = mout + nab( ji, 2 ) - nab( ji, 1 )
+              end do
+              return
+           end if
+           ! initialize for loop
+           ! kf and kl have the following meaning:
+              ! intervals 1,...,kf-1 have converged.
+              ! intervals kf,...,kl  still need to be refined.
+           kf = 1
+           kl = minp
+           ! if ijob=2, initialize c.
+           ! if ijob=3, use the user-supplied starting point.
+           if( ijob==2 ) then
+              do ji = 1, minp
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+           end if
+           ! iteration loop
+           loop_130: do jit = 1, nitmax
+              ! loop over intervals
+              if( kl-kf+1>=nbmin .and. nbmin>0 ) then
+                 ! begin of parallel version of the loop
+                 do ji = kf, kl
+                    ! compute n(c), the number of eigenvalues less than c
+                    work( ji ) = d( 1 ) - c( ji )
+                    iwork( ji ) = 0
+                    if( work( ji )<=pivmin ) then
+                       iwork( ji ) = 1
+                       work( ji ) = min( work( ji ), -pivmin )
+                    end if
+                    do j = 2, n
+                       work( ji ) = d( j ) - e2( j-1 ) / work( ji ) - c( ji )
+                       if( work( ji )<=pivmin ) then
+                          iwork( ji ) = iwork( ji ) + 1
+                          work( ji ) = min( work( ji ), -pivmin )
+                       end if
+                    end do
+                 end do
+                 if( ijob<=2 ) then
+                    ! ijob=2: choose all intervals containing eigenvalues.
+                    klnew = kl
+                    loop_70: do ji = kf, kl
+                       ! insure that n(w) is monotone
+                       iwork( ji ) = min( nab( ji, 2 ),max( nab( ji, 1 ), iwork( ji ) ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( iwork( ji )==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = c( ji )
+                       else if( iwork( ji )==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = c( ji )
+                       else
+                          klnew = klnew + 1
+                          if( klnew<=mmax ) then
+                             ! eigenvalue in both intervals -- add upper to
+                             ! queue.
+                             ab( klnew, 2 ) = ab( ji, 2 )
+                             nab( klnew, 2 ) = nab( ji, 2 )
+                             ab( klnew, 1 ) = c( ji )
+                             nab( klnew, 1 ) = iwork( ji )
+                             ab( ji, 2 ) = c( ji )
+                             nab( ji, 2 ) = iwork( ji )
+                          else
+                             info = mmax + 1
+                          end if
+                       end if
+                    end do loop_70
+                    if( info/=0 )return
+                    kl = klnew
+                 else
+                    ! ijob=3: binary search.  keep only the interval containing
+                            ! w   s.t. n(w) = nval
+                    do ji = kf, kl
+                       if( iwork( ji )<=nval( ji ) ) then
+                          ab( ji, 1 ) = c( ji )
+                          nab( ji, 1 ) = iwork( ji )
+                       end if
+                       if( iwork( ji )>=nval( ji ) ) then
+                          ab( ji, 2 ) = c( ji )
+                          nab( ji, 2 ) = iwork( ji )
+                       end if
+                    end do
+                 end if
+              else
+                 ! end of parallel version of the loop
+                 ! begin of serial version of the loop
+                 klnew = kl
+                 loop_100: do ji = kf, kl
+                    ! compute n(w), the number of eigenvalues less than w
+                    tmp1 = c( ji )
+                    tmp2 = d( 1 ) - tmp1
+                    itmp1 = 0
+                    if( tmp2<=pivmin ) then
+                       itmp1 = 1
+                       tmp2 = min( tmp2, -pivmin )
+                    end if
+                    do j = 2, n
+                       tmp2 = d( j ) - e2( j-1 ) / tmp2 - tmp1
+                       if( tmp2<=pivmin ) then
+                          itmp1 = itmp1 + 1
+                          tmp2 = min( tmp2, -pivmin )
+                       end if
+                    end do
+                    if( ijob<=2 ) then
+                       ! ijob=2: choose all intervals containing eigenvalues.
+                       ! insure that n(w) is monotone
+                       itmp1 = min( nab( ji, 2 ),max( nab( ji, 1 ), itmp1 ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( itmp1==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = tmp1
+                       else if( itmp1==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = tmp1
+                       else if( klnew<mmax ) then
+                          ! eigenvalue in both intervals -- add upper to queue.
+                          klnew = klnew + 1
+                          ab( klnew, 2 ) = ab( ji, 2 )
+                          nab( klnew, 2 ) = nab( ji, 2 )
+                          ab( klnew, 1 ) = tmp1
+                          nab( klnew, 1 ) = itmp1
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       else
+                          info = mmax + 1
+                          return
+                       end if
+                    else
+                       ! ijob=3: binary search.  keep only the interval
+                               ! containing  w  s.t. n(w) = nval
+                       if( itmp1<=nval( ji ) ) then
+                          ab( ji, 1 ) = tmp1
+                          nab( ji, 1 ) = itmp1
+                       end if
+                       if( itmp1>=nval( ji ) ) then
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       end if
+                    end if
+                 end do loop_100
+                 kl = klnew
+              end if
+              ! check for convergence
+              kfnew = kf
+              loop_110: do ji = kf, kl
+                 tmp1 = abs( ab( ji, 2 )-ab( ji, 1 ) )
+                 tmp2 = max( abs( ab( ji, 2 ) ), abs( ab( ji, 1 ) ) )
+                 if( tmp1<max( abstol, pivmin, reltol*tmp2 ) .or.nab( ji, 1 )>=nab( ji, 2 ) ) &
+                           then
+                    ! converged -- swap with position kfnew,
+                                 ! then increment kfnew
+                    if( ji>kfnew ) then
+                       tmp1 = ab( ji, 1 )
+                       tmp2 = ab( ji, 2 )
+                       itmp1 = nab( ji, 1 )
+                       itmp2 = nab( ji, 2 )
+                       ab( ji, 1 ) = ab( kfnew, 1 )
+                       ab( ji, 2 ) = ab( kfnew, 2 )
+                       nab( ji, 1 ) = nab( kfnew, 1 )
+                       nab( ji, 2 ) = nab( kfnew, 2 )
+                       ab( kfnew, 1 ) = tmp1
+                       ab( kfnew, 2 ) = tmp2
+                       nab( kfnew, 1 ) = itmp1
+                       nab( kfnew, 2 ) = itmp2
+                       if( ijob==3 ) then
+                          itmp1 = nval( ji )
+                          nval( ji ) = nval( kfnew )
+                          nval( kfnew ) = itmp1
+                       end if
+                    end if
+                    kfnew = kfnew + 1
+                 end if
+              end do loop_110
+              kf = kfnew
+              ! choose midpoints
+              do ji = kf, kl
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+              ! if no more intervals to refine, quit.
+              if( kf>kl )go to 140
+           end do loop_130
+           ! converged
+           140 continue
+           info = max( kl+1-kf, 0 )
+           mout = kl
+           return
+     end subroutine stdlib_dlaebz
+
+     !> This subroutine computes the I-th eigenvalue of a symmetric rank-one
+     !> modification of a 2-by-2 diagonal matrix
+     !> diag( D )  +  RHO * Z * transpose(Z) .
+     !> The diagonal elements in the array D are assumed to satisfy
+     !> D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_dlaed5( i, d, z, delta, rho, dlam )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(dp), intent(out) :: dlam
+           real(dp), intent(in) :: rho
+           ! Array Arguments 
+           real(dp), intent(in) :: d(2), z(2)
+           real(dp), intent(out) :: delta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: b, c, del, tau, temp, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           if( i==1 ) then
+              w = one + two*rho*( z( 2 )*z( 2 )-z( 1 )*z( 1 ) ) / del
+              if( w>zero ) then
+                 b = del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*del
+                 ! b > zero, always
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 dlam = d( 1 ) + tau
+                 delta( 1 ) = -z( 1 ) / tau
+                 delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*del
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 dlam = d( 2 ) + tau
+                 delta( 1 ) = -z( 1 ) / ( del+tau )
+                 delta( 2 ) = -z( 2 ) / tau
+              end if
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           else
+           ! now i=2
+              b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*del
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              dlam = d( 2 ) + tau
+              delta( 1 ) = -z( 1 ) / ( del+tau )
+              delta( 2 ) = -z( 2 ) / tau
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_dlaed5
+
+     !> DLAEDA: computes the Z vector corresponding to the merge step in the
+     !> CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+     !> problem.
+
+     pure subroutine stdlib_dlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum,&
+                q, qptr, z, ztemp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, n, tlvls
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+           real(dp), intent(in) :: givnum(2,*), q(*)
+           real(dp), intent(out) :: z(*), ztemp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bsiz1, bsiz2, curr, i, k, mid, psiz1, psiz2, ptr, zptr1
+           ! Intrinsic Functions 
+           intrinsic :: real,int,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAEDA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine location of first number in second half.
+           mid = n / 2 + 1
+           ! gather last/first rows of appropriate eigenblocks into center of z
+           ptr = 1
+           ! determine location of lowest level subproblem in the full storage
+           ! scheme
+           curr = ptr + curpbm*2**curlvl + 2**( curlvl-1 ) - 1
+           ! determine size of these matrices.  we add half to the value of
+           ! the sqrt in case the machine underestimates one of these square
+           ! roots.
+           bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=dp) ),KIND=ilp)
+           bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=dp) ),KIND=ilp)
+                     
+           do k = 1, mid - bsiz1 - 1
+              z( k ) = zero
+           end do
+           call stdlib_dcopy( bsiz1, q( qptr( curr )+bsiz1-1 ), bsiz1,z( mid-bsiz1 ), 1 )
+           call stdlib_dcopy( bsiz2, q( qptr( curr+1 ) ), bsiz2, z( mid ), 1 )
+           do k = mid + bsiz2, n
+              z( k ) = zero
+           end do
+           ! loop through remaining levels 1 -> curlvl applying the givens
+           ! rotations and permutation and then multiplying the center matrices
+           ! against the current z.
+           ptr = 2**tlvls + 1
+           loop_70: do k = 1, curlvl - 1
+              curr = ptr + curpbm*2**( curlvl-k ) + 2**( curlvl-k-1 ) - 1
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              zptr1 = mid - psiz1
+             ! apply givens at curr and curr+1
+              do i = givptr( curr ), givptr( curr+1 ) - 1
+                 call stdlib_drot( 1, z( zptr1+givcol( 1, i )-1 ), 1,z( zptr1+givcol( 2, i )-1 ), &
+                           1, givnum( 1, i ),givnum( 2, i ) )
+              end do
+              do i = givptr( curr+1 ), givptr( curr+2 ) - 1
+                 call stdlib_drot( 1, z( mid-1+givcol( 1, i ) ), 1,z( mid-1+givcol( 2, i ) ), 1, &
+                           givnum( 1, i ),givnum( 2, i ) )
+              end do
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              do i = 0, psiz1 - 1
+                 ztemp( i+1 ) = z( zptr1+perm( prmptr( curr )+i )-1 )
+              end do
+              do i = 0, psiz2 - 1
+                 ztemp( psiz1+i+1 ) = z( mid+perm( prmptr( curr+1 )+i )-1 )
+              end do
+              ! multiply blocks at curr and curr+1
+              ! determine size of these matrices.  we add half to the value of
+              ! the sqrt in case the machine underestimates one of these
+              ! square roots.
+              bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=dp) ),KIND=ilp)
+                        
+              bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=dp) ),KIND=ilp)
+                        
+              if( bsiz1>0 ) then
+                 call stdlib_dgemv( 'T', bsiz1, bsiz1, one, q( qptr( curr ) ),bsiz1, ztemp( 1 ), &
+                           1, zero, z( zptr1 ), 1 )
+              end if
+              call stdlib_dcopy( psiz1-bsiz1, ztemp( bsiz1+1 ), 1, z( zptr1+bsiz1 ),1 )
+              if( bsiz2>0 ) then
+                 call stdlib_dgemv( 'T', bsiz2, bsiz2, one, q( qptr( curr+1 ) ),bsiz2, ztemp( &
+                           psiz1+1 ), 1, zero, z( mid ), 1 )
+              end if
+              call stdlib_dcopy( psiz2-bsiz2, ztemp( psiz1+bsiz2+1 ), 1,z( mid+bsiz2 ), 1 )
+                        
+              ptr = ptr + 2**( tlvls-k )
+           end do loop_70
+           return
+     end subroutine stdlib_dlaeda
+
+     !> DLAEV2: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  SN1 ] [  A   B  ] [ CS1 -SN1 ]  =  [ RT1  0  ]
+     !> [-SN1  CS1 ] [  B   C  ] [ SN1  CS1 ]     [  0  RT2 ].
+
+     pure subroutine stdlib_dlaev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: a, b, c
+           real(dp), intent(out) :: cs1, rt1, rt2, sn1
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: sgn1, sgn2
+           real(dp) :: ab, acmn, acmx, acs, adf, cs, ct, df, rt, sm, tb, tn
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              sgn1 = -1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              sgn1 = 1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+              sgn1 = 1
+           end if
+           ! compute the eigenvector
+           if( df>=zero ) then
+              cs = df + rt
+              sgn2 = 1
+           else
+              cs = df - rt
+              sgn2 = -1
+           end if
+           acs = abs( cs )
+           if( acs>ab ) then
+              ct = -tb / cs
+              sn1 = one / sqrt( one+ct*ct )
+              cs1 = ct*sn1
+           else
+              if( ab==zero ) then
+                 cs1 = one
+                 sn1 = zero
+              else
+                 tn = -cs / tb
+                 cs1 = one / sqrt( one+tn*tn )
+                 sn1 = tn*cs1
+              end if
+           end if
+           if( sgn1==sgn2 ) then
+              tn = cs1
+              cs1 = -sn1
+              sn1 = tn
+           end if
+           return
+     end subroutine stdlib_dlaev2
+
+     !> DLAG2: computes the eigenvalues of a 2 x 2 generalized eigenvalue
+     !> problem  A - w B, with scaling as necessary to avoid over-/underflow.
+     !> The scaling factor "s" results in a modified eigenvalue equation
+     !> s A - w B
+     !> where  s  is a non-negative scaling factor chosen so that  w,  w B,
+     !> and  s A  do not overflow and, if possible, do not underflow, either.
+
+     pure subroutine stdlib_dlag2( a, lda, b, ldb, safmin, scale1, scale2, wr1,wr2, wi )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(dp), intent(in) :: safmin
+           real(dp), intent(out) :: scale1, scale2, wi, wr1, wr2
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: fuzzy1 = one+1.0e-5_dp
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: a11, a12, a21, a22, abi22, anorm, as11, as12, as22, ascale, b11, b12, b22, &
+           binv11, binv22, bmin, bnorm, bscale, bsize, c1, c2, c3, c4, c5, diff, discr, pp, qq, r,&
+            rtmax, rtmin, s1, s2, safmax, shift, ss, sum, wabs, wbig, wdet, wscale, wsize, &
+                      wsmall
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           rtmin = sqrt( safmin )
+           rtmax = one / rtmin
+           safmax = one / safmin
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a11 = ascale*a( 1, 1 )
+           a21 = ascale*a( 2, 1 )
+           a12 = ascale*a( 1, 2 )
+           a22 = ascale*a( 2, 2 )
+           ! perturb b if necessary to insure non-singularity
+           b11 = b( 1, 1 )
+           b12 = b( 1, 2 )
+           b22 = b( 2, 2 )
+           bmin = rtmin*max( abs( b11 ), abs( b12 ), abs( b22 ), rtmin )
+           if( abs( b11 )<bmin )b11 = sign( bmin, b11 )
+           if( abs( b22 )<bmin )b22 = sign( bmin, b22 )
+           ! scale b
+           bnorm = max( abs( b11 ), abs( b12 )+abs( b22 ), safmin )
+           bsize = max( abs( b11 ), abs( b22 ) )
+           bscale = one / bsize
+           b11 = b11*bscale
+           b12 = b12*bscale
+           b22 = b22*bscale
+           ! compute larger eigenvalue by method described by c. van loan
+           ! ( as is a shifted by -shift*b )
+           binv11 = one / b11
+           binv22 = one / b22
+           s1 = a11*binv11
+           s2 = a22*binv22
+           if( abs( s1 )<=abs( s2 ) ) then
+              as12 = a12 - s1*b12
+              as22 = a22 - s1*b22
+              ss = a21*( binv11*binv22 )
+              abi22 = as22*binv22 - ss*b12
+              pp = half*abi22
+              shift = s1
+           else
+              as12 = a12 - s2*b12
+              as11 = a11 - s2*b11
+              ss = a21*( binv11*binv22 )
+              abi22 = -ss*b12
+              pp = half*( as11*binv11+abi22 )
+              shift = s2
+           end if
+           qq = ss*as12
+           if( abs( pp*rtmin )>=one ) then
+              discr = ( rtmin*pp )**2 + qq*safmin
+              r = sqrt( abs( discr ) )*rtmax
+           else
+              if( pp**2+abs( qq )<=safmin ) then
+                 discr = ( rtmax*pp )**2 + qq*safmax
+                 r = sqrt( abs( discr ) )*rtmin
+              else
+                 discr = pp**2 + qq
+                 r = sqrt( abs( discr ) )
+              end if
+           end if
+           ! note: the test of r in the following if is to cover the case when
+                 ! discr is small and negative and is flushed to zero during
+                 ! the calculation of r.  on machines which have a consistent
+                 ! flush-to-zero threshold and handle numbers above that
+                 ! threshold correctly, it would not be necessary.
+           if( discr>=zero .or. r==zero ) then
+              sum = pp + sign( r, pp )
+              diff = pp - sign( r, pp )
+              wbig = shift + sum
+              ! compute smaller eigenvalue
+              wsmall = shift + diff
+              if( half*abs( wbig )>max( abs( wsmall ), safmin ) ) then
+                 wdet = ( a11*a22-a12*a21 )*( binv11*binv22 )
+                 wsmall = wdet / wbig
+              end if
+              ! choose (real) eigenvalue closest to 2,2 element of a*b**(-1)
+              ! for wr1.
+              if( pp>abi22 ) then
+                 wr1 = min( wbig, wsmall )
+                 wr2 = max( wbig, wsmall )
+              else
+                 wr1 = max( wbig, wsmall )
+                 wr2 = min( wbig, wsmall )
+              end if
+              wi = zero
+           else
+              ! complex eigenvalues
+              wr1 = shift + pp
+              wr2 = wr1
+              wi = r
+           end if
+           ! further scaling to avoid underflow and overflow in computing
+           ! scale1 and overflow in computing w*b.
+           ! this scale factor (wscale) is bounded from above using c1 and c2,
+           ! and from below using c3 and c4.
+              ! c1 implements the condition  s a  must never overflow.
+              ! c2 implements the condition  w b  must never overflow.
+              ! c3, with c2,
+                 ! implement the condition that s a - w b must never overflow.
+              ! c4 implements the condition  s    should not underflow.
+              ! c5 implements the condition  max(s,|w|) should be at least 2.
+           c1 = bsize*( safmin*max( one, ascale ) )
+           c2 = safmin*max( one, bnorm )
+           c3 = bsize*safmin
+           if( ascale<=one .and. bsize<=one ) then
+              c4 = min( one, ( ascale / safmin )*bsize )
+           else
+              c4 = one
+           end if
+           if( ascale<=one .or. bsize<=one ) then
+              c5 = min( one, ascale*bsize )
+           else
+              c5 = one
+           end if
+           ! scale first eigenvalue
+           wabs = abs( wr1 ) + abs( wi )
+           wsize = max( safmin, c1, fuzzy1*( wabs*c2+c3 ),min( c4, half*max( wabs, c5 ) ) )
+                     
+           if( wsize/=one ) then
+              wscale = one / wsize
+              if( wsize>one ) then
+                 scale1 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+              else
+                 scale1 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+              end if
+              wr1 = wr1*wscale
+              if( wi/=zero ) then
+                 wi = wi*wscale
+                 wr2 = wr1
+                 scale2 = scale1
+              end if
+           else
+              scale1 = ascale*bsize
+              scale2 = scale1
+           end if
+           ! scale second eigenvalue (if real)
+           if( wi==zero ) then
+              wsize = max( safmin, c1, fuzzy1*( abs( wr2 )*c2+c3 ),min( c4, half*max( abs( wr2 ), &
+                        c5 ) ) )
+              if( wsize/=one ) then
+                 wscale = one / wsize
+                 if( wsize>one ) then
+                    scale2 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+                 else
+                    scale2 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+                 end if
+                 wr2 = wr2*wscale
+              else
+                 scale2 = ascale*bsize
+              end if
+           end if
+           return
+     end subroutine stdlib_dlag2
+
+     !> DLAG2S: converts a DOUBLE PRECISION matrix, SA, to a SINGLE
+     !> PRECISION matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> DLAG2S checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_dlag2s( m, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           real(sp), intent(out) :: sa(ldsa,*)
+           real(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: rmax
+           ! Executable Statements 
+           rmax = stdlib_slamch( 'O' )
+           do j = 1, n
+              do i = 1, m
+                 if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) ) then
+                    info = 1
+                    go to 30
+                 end if
+                 sa( i, j ) = a( i, j )
+              end do
+           end do
+           info = 0
+           30 continue
+           return
+     end subroutine stdlib_dlag2s
+
+     !> DLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_dlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b + a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b - a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dlagtm
+
+     !> This routine is not for general use.  It exists solely to avoid
+     !> over-optimization in DISNAN.
+     !> DLAISNAN: checks for NaNs by comparing its two arguments for
+     !> inequality.  NaN is the only floating-point value where NaN != NaN
+     !> returns .TRUE.  To check for NaNs, pass the same variable as both
+     !> arguments.
+     !> A compiler must assume that the two arguments are
+     !> not the same variable, and the test will not be optimized away.
+     !> Interprocedural or whole-program optimization may delete this
+     !> test.  The ISNAN functions will be replaced by the correct
+     !> Fortran 03 intrinsic once the intrinsic is widely available.
+
+     pure logical(lk) function stdlib_dlaisnan( din1, din2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: din1, din2
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_dlaisnan = (din1/=din2)
+           return
+     end function stdlib_dlaisnan
+
+     !> DLAMCH: determines double precision machine parameters.
+
+     pure real(dp) function stdlib_dlamch( cmach )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: cmach
+       ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: rnd, eps, sfmin, small, rmach
+           ! Intrinsic Functions 
+           intrinsic :: digits,epsilon,huge,maxexponent,minexponent,radix,tiny
+           ! Executable Statements 
+           ! assume rounding, not chopping. always.
+           rnd = one
+           if( one==rnd ) then
+              eps = epsilon(zero) * 0.5
+           else
+              eps = epsilon(zero)
+           end if
+           if( stdlib_lsame( cmach, 'E' ) ) then
+              rmach = eps
+           else if( stdlib_lsame( cmach, 'S' ) ) then
+              sfmin = tiny(zero)
+              small = one / huge(zero)
+              if( small>=sfmin ) then
+                 ! use small plus a bit, to avoid the possibility of rounding
+                 ! causing overflow when computing  1/sfmin.
+                 sfmin = small*( one+eps )
+              end if
+              rmach = sfmin
+           else if( stdlib_lsame( cmach, 'B' ) ) then
+              rmach = radix(zero)
+           else if( stdlib_lsame( cmach, 'P' ) ) then
+              rmach = eps * radix(zero)
+           else if( stdlib_lsame( cmach, 'N' ) ) then
+              rmach = digits(zero)
+           else if( stdlib_lsame( cmach, 'R' ) ) then
+              rmach = rnd
+           else if( stdlib_lsame( cmach, 'M' ) ) then
+              rmach = minexponent(zero)
+           else if( stdlib_lsame( cmach, 'U' ) ) then
+              rmach = tiny(zero)
+           else if( stdlib_lsame( cmach, 'L' ) ) then
+              rmach = maxexponent(zero)
+           else if( stdlib_lsame( cmach, 'O' ) ) then
+              rmach = huge(zero)
+           else
+              rmach = zero
+           end if
+           stdlib_dlamch = rmach
+           return
+     end function stdlib_dlamch
+
+
+     pure real(dp) function stdlib_dlamc3( a, b )
+        ! -- lapack auxiliary routine --
+           ! univ. of tennessee, univ. of california berkeley and nag ltd..
+           ! Scalar Arguments 
+           real(dp), intent(in) :: a, b
+       ! =====================================================================
+           ! Executable Statements 
+           stdlib_dlamc3 = a + b
+           return
+     end function stdlib_dlamc3
+
+     !> DLAMRG: will create a permutation list which will merge the elements
+     !> of A (which is composed of two independently sorted sets) into a
+     !> single set which is sorted in ascending order.
+
+     pure subroutine stdlib_dlamrg( n1, n2, a, dtrd1, dtrd2, index )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dtrd1, dtrd2, n1, n2
+           ! Array Arguments 
+           integer(ilp), intent(out) :: index(*)
+           real(dp), intent(in) :: a(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ind1, ind2, n1sv, n2sv
+           ! Executable Statements 
+           n1sv = n1
+           n2sv = n2
+           if( dtrd1>0 ) then
+              ind1 = 1
+           else
+              ind1 = n1
+           end if
+           if( dtrd2>0 ) then
+              ind2 = 1 + n1
+           else
+              ind2 = n1 + n2
+           end if
+           i = 1
+           ! while ( (n1sv > 0)
+           10 continue
+           if( n1sv>0 .and. n2sv>0 ) then
+              if( a( ind1 )<=a( ind2 ) ) then
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + dtrd1
+                 n1sv = n1sv - 1
+              else
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + dtrd2
+                 n2sv = n2sv - 1
+              end if
+              go to 10
+           end if
+           ! end while
+           if( n1sv==0 ) then
+              do n1sv = 1, n2sv
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + dtrd2
+              end do
+           else
+           ! n2sv == 0
+              do n2sv = 1, n1sv
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + dtrd1
+              end do
+           end if
+           return
+     end subroutine stdlib_dlamrg
+
+     !> DLAORHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> DLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine DLAORHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, DLAORHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without DLAORHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_dlaorhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAORHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_dlamch('S')
+              ! construct the subdiagonal elements of l
+              if( abs( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_dscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_dlaorhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_dtrsm( 'R', 'U', 'N', 'N', m-n1, n1, one, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_dtrsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_dgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_dlaorhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_dlaorhr_col_getrfnp2
+
+     !> DLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_dlapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           real(dp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_dlapmr
+
+     !> DLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_dlapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, in, j
+           real(dp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_dlapmt
+
+     !> DLAPY3: returns sqrt(x**2+y**2+z**2), taking care not to cause
+     !> unnecessary overflow and unnecessary underflow.
+
+     pure real(dp) function stdlib_dlapy3( x, y, z )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: x, y, z
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: w, xabs, yabs, zabs, hugeval
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           hugeval = stdlib_dlamch( 'OVERFLOW' )
+           xabs = abs( x )
+           yabs = abs( y )
+           zabs = abs( z )
+           w = max( xabs, yabs, zabs )
+           if( w==zero .or. w>hugeval ) then
+           ! w can be zero for max(0,nan,0)
+           ! adding all three entries together will make sure
+           ! nan will not disappear.
+              stdlib_dlapy3 =  xabs + yabs + zabs
+           else
+              stdlib_dlapy3 = w*sqrt( ( xabs / w )**2+( yabs / w )**2+( zabs / w )**2 )
+           end if
+           return
+     end function stdlib_dlapy3
+
+     !> DLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_dlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_dlaqgb
+
+     !> DLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_dlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_dlaqge
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, DLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+     !> scaling to avoid overflows and most underflows. It
+     !> is assumed that either
+     !> 1) sr1 = sr2 and si1 = -si2
+     !> or
+     !> 2) si1 = si2 = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_dlaqr1( n, h, ldh, sr1, si1, sr2, si2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: si1, si2, sr1, sr2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           real(dp), intent(in) :: h(ldh,*)
+           real(dp), intent(out) :: v(*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(dp) :: h21s, h31s, s
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) - si1*( &
+                           si2 / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 )
+              end if
+           else
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) ) +abs( h( 3, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+                 v( 3 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) -si1*( si2 / s ) + h( 1, 2 )&
+                           *h21s + h( 1, 3 )*h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 ) +h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-sr1-sr2 ) +h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_dlaqr1
+
+     !> DLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_dlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_dlaqsb
+
+     !> DLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_dlaqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_dlaqsp
+
+     !> DLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_dlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_dlaqsy
+
+     !> DLAR2V: applies a vector of real plane rotations from both sides to
+     !> a sequence of 2-by-2 real symmetric matrices, defined by the elements
+     !> of the vectors x, y and z. For i = 1,2,...,n
+     !> ( x(i)  z(i) ) := (  c(i)  s(i) ) ( x(i)  z(i) ) ( c(i) -s(i) )
+     !> ( z(i)  y(i) )    ( -s(i)  c(i) ) ( z(i)  y(i) ) ( s(i)  c(i) )
+
+     pure subroutine stdlib_dlar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*), s(*)
+           real(dp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(dp) :: ci, si, t1, t2, t3, t4, t5, t6, xi, yi, zi
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( ix )
+              zi = z( ix )
+              ci = c( ic )
+              si = s( ic )
+              t1 = si*zi
+              t2 = ci*zi
+              t3 = t2 - si*xi
+              t4 = t2 + si*yi
+              t5 = ci*xi + t1
+              t6 = ci*yi - t1
+              x( ix ) = ci*t5 + si*t4
+              y( ix ) = ci*t6 - si*t3
+              z( ix ) = ci*t4 - si*t5
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_dlar2v
+
+     !> DLARF: applies a real elementary reflector H to a real m by n matrix
+     !> C, from either the left or the right. H is represented in the form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+
+     pure subroutine stdlib_dlarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           real(dp), intent(in) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(in) :: v(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=zero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-zero row in v.
+              do while( lastv>0 .and. v( i )==zero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-zero column in c(1:lastv,:).
+                 lastc = stdlib_iladlc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-zero row in c(:,1:lastv).
+                 lastc = stdlib_iladlr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_dp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**t * v(1:lastv,1)
+                 call stdlib_dgemv( 'TRANSPOSE', lastv, lastc, one, c, ldc, v, incv,zero, work, 1 &
+                           )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**t
+                 call stdlib_dger( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_dgemv( 'NO TRANSPOSE', lastc, lastv, one, c, ldc,v, incv, zero, work,&
+                            1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**t
+                 call stdlib_dger( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_dlarf
+
+     !> DLARFB: applies a real block reflector H or its transpose H**T to a
+     !> real m by n matrix C, from either the left or the right.
+
+     pure subroutine stdlib_dlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_dcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2
+                       call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c( k+1, 1 ),&
+                                  ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v( k+1, 1 )&
+                                 , ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_dcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( k+1, 1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_dcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( m-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1
+                       call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v, ldv, &
+                                 work, ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v( m-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_dcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( n-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c, ldc, &
+                                 v, ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v( n-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_dcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2**t
+                       call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c( k+1, 1 ), &
+                                 ldc, v( 1, k+1 ), ldv, one,work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2**t * w**t
+                       call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v( 1, k+1 ), &
+                                 ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_dcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c( 1, k+1 ),&
+                                  ldc, v( 1, k+1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( 1, k+1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_dcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v( 1, m-k+&
+                              1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1**t
+                       call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c, ldc, v, ldv,&
+                                  one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1**t * w**t
+                       call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v, ldv, work, &
+                                 ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h'  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_dcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v( 1, n-k+&
+                              1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**t
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dlarfb
+
+     !> DLARFB_GETT: applies a real Householder block reflector H from the
+     !> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_dlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(in) :: t(ldt,*)
+           real(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_dcopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**t) * w2 = (a1**t) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_dtrmm( 'L', 'L', 'T', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**t) * b2 = w2 + (b1**t) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_dgemm( 'T', 'N', k, n-k, m, one, b, ldb,b( 1, k+1 ), ldb, one, work, &
+                           ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_dtrmm( 'L', 'U', 'N', 'N', k, n-k, one, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_dgemm( 'N', 'N', m, n-k, k, -one, b, ldb,work, ldwork, one, b( 1, k+&
+                           1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_dtrmm( 'L', 'L', 'N', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_dcopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = zero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**t) * w1 = (a1**t) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_dtrmm( 'L', 'L', 'T', 'U', k, k, one, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_dtrmm( 'L', 'U', 'N', 'N', k, k, one, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', m, k, -one, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_dtrmm( 'L', 'L', 'N', 'U', k, k, one, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_dlarfb_gett
+
+     !> DLARFT: forms the triangular factor T of a real block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+
+     pure subroutine stdlib_dlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(dp), intent(out) :: t(ldt,*)
+           real(dp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( i, prevlastv )
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( i , j )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**t * v(i:j,i)
+                       call stdlib_dgemv( 'TRANSPOSE', j-i, i-1, -tau( i ),v( i+1, 1 ), ldv, v( i+&
+                                 1, i ), 1, one,t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**t
+                       call stdlib_dgemv( 'NO TRANSPOSE', i-1, j-i, -tau( i ),v( 1, i+1 ), ldv, v(&
+                                  i, i+1 ), ldv, one,t( 1, i ), 1 )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_dtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                 end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( n-k+i , j )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**t * v(j:n-k+i,i)
+                          call stdlib_dgemv( 'TRANSPOSE', n-k+i-j, k-i, -tau( i ),v( j, i+1 ), &
+                                    ldv, v( j, i ), 1, one,t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**t
+                          call stdlib_dgemv( 'NO TRANSPOSE', k-i, n-k+i-j,-tau( i ), v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,one, t( i+1, i ), 1 )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_dtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dlarft
+
+     !> DLARFX: applies a real elementary reflector H to a real m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_dlarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           real(dp), intent(in) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(in) :: v(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           real(dp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, v6, &
+                     v7, v8, v9
+           ! Executable Statements 
+           if( tau==zero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_dlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_dlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+           410 continue
+           return
+     end subroutine stdlib_dlarfx
+
+     !> DLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n symmetric matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_dlarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           real(dp), intent(in) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(in) :: v(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: alpha
+           ! Executable Statements 
+           if( tau==zero )return
+           ! form  w:= c * v
+           call stdlib_dsymv( uplo, n, one, c, ldc, v, incv, zero, work, 1 )
+           alpha = -half*tau*stdlib_ddot( n, work, 1, v, incv )
+           call stdlib_daxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_dsyr2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_dlarfy
+
+     !> DLARGV: generates a vector of real plane rotations, determined by
+     !> elements of the real vectors x and y. For i = 1,2,...,n
+     !> (  c(i)  s(i) ) ( x(i) ) = ( a(i) )
+     !> ( -s(i)  c(i) ) ( y(i) ) = (   0  )
+
+     pure subroutine stdlib_dlargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*)
+           real(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(dp) :: f, g, t, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_10: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              if( g==zero ) then
+                 c( ic ) = one
+              else if( f==zero ) then
+                 c( ic ) = zero
+                 y( iy ) = one
+                 x( ix ) = g
+              else if( abs( f )>abs( g ) ) then
+                 t = g / f
+                 tt = sqrt( one+t*t )
+                 c( ic ) = one / tt
+                 y( iy ) = t*c( ic )
+                 x( ix ) = f*tt
+              else
+                 t = f / g
+                 tt = sqrt( one+t*t )
+                 y( iy ) = one / tt
+                 c( ic ) = t*y( iy )
+                 x( ix ) = g*tt
+              end if
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_10
+           return
+     end subroutine stdlib_dlargv
+
+     !> Compute the splitting points with threshold SPLTOL.
+     !> DLARRA: sets any "small" off-diagonal elements to zero.
+
+     pure subroutine stdlib_dlarra( n, d, e, e2, spltol, tnrm,nsplit, isplit, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, nsplit
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: spltol, tnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isplit(*)
+           real(dp), intent(in) :: d(*)
+           real(dp), intent(inout) :: e(*), e2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: eabs, tmp1
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           if(spltol<zero) then
+              ! criterion based on absolute off-diagonal value
+              tmp1 = abs(spltol)* tnrm
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= tmp1) then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           else
+              ! criterion that guarantees relative accuracy
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= spltol * sqrt(abs(d(i)))*sqrt(abs(d(i+1))) )then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           endif
+           isplit( nsplit ) = n
+           return
+     end subroutine stdlib_dlarra
+
+     !> Find the number of eigenvalues of the symmetric tridiagonal matrix T
+     !> that are in the interval (VL,VU] if JOBT = 'T', and of L D L^T
+     !> if JOBT = 'L'.
+
+     pure subroutine stdlib_dlarrc( jobt, n, vl, vu, d, e, pivmin,eigcnt, lcnt, rcnt, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobt
+           integer(ilp), intent(out) :: eigcnt, info, lcnt, rcnt
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: pivmin, vl, vu
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: matt
+           real(dp) :: lpivot, rpivot, sl, su, tmp, tmp2
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           lcnt = 0
+           rcnt = 0
+           eigcnt = 0
+           matt = stdlib_lsame( jobt, 'T' )
+           if (matt) then
+              ! sturm sequence count on t
+              lpivot = d( 1 ) - vl
+              rpivot = d( 1 ) - vu
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+              do i = 1, n-1
+                 tmp = e(i)**2
+                 lpivot = ( d( i+1 )-vl ) - tmp/lpivot
+                 rpivot = ( d( i+1 )-vu ) - tmp/rpivot
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+              end do
+           else
+              ! sturm sequence count on l d l^t
+              sl = -vl
+              su = -vu
+              do i = 1, n - 1
+                 lpivot = d( i ) + sl
+                 rpivot = d( i ) + su
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+                 tmp = e(i) * d(i) * e(i)
+                 tmp2 = tmp / lpivot
+                 if( tmp2==zero ) then
+                    sl =  tmp - vl
+                 else
+                    sl = sl*tmp2 - vl
+                 end if
+                 tmp2 = tmp / rpivot
+                 if( tmp2==zero ) then
+                    su =  tmp - vu
+                 else
+                    su = su*tmp2 - vu
+                 end if
+              end do
+              lpivot = d( n ) + sl
+              rpivot = d( n ) + su
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+           endif
+           eigcnt = rcnt - lcnt
+           return
+     end subroutine stdlib_dlarrc
+
+     !> DLARRD: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from DSTEMR.
+     !> The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_dlarrd( range, order, n, vl, vu, il, iu, gers,reltol, d, e, e2, &
+               pivmin, nsplit, isplit,m, w, werr, wl, wu, iblock, indexw,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n, nsplit
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: pivmin, reltol, vl, vu
+           real(dp), intent(out) :: wl, wu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), indexw(*), iwork(*)
+           integer(ilp), intent(in) :: isplit(*)
+           real(dp), intent(in) :: d(*), e(*), e2(*), gers(*)
+           real(dp), intent(out) :: w(*), werr(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: fudge = two
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: valrng = 2
+           integer(ilp), parameter :: indrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: i, ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iout, &
+                     irange, itmax, itmp1, itmp2, iw, iwoff, j, jblk, jdisc, je, jee, nb, nwl, nwu
+           real(dp) :: atoli, eps, gl, gu, rtoli, tmp1, tmp2, tnorm, uflow, wkill, wlu, &
+                     wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           else
+              irange = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( .not.(stdlib_lsame(order,'B').or.stdlib_lsame(order,'E')) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==valrng ) then
+              if( vl>=vu )info = -5
+           else if( irange==indrng .and.( il<1 .or. il>max( 1, n ) ) ) then
+              info = -6
+           else if( irange==indrng .and.( iu<min( n, il ) .or. iu>n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 ) return
+           ! simplification:
+           if( irange==indrng .and. il==1 .and. iu==n ) irange = 1
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           uflow = stdlib_dlamch( 'U' )
+           ! special case when n=1
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+              endif
+              return
+           end if
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           nb = stdlib_ilaenv( 1, 'DSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 ) nb = 0
+           ! find global spectral radius
+           gl = d(1)
+           gu = d(1)
+           do i = 1,n
+              gl =  min( gl, gers( 2*i - 1))
+              gu = max( gu, gers(2*i) )
+           end do
+           ! compute global gerschgorin bounds and spectral diameter
+           tnorm = max( abs( gl ), abs( gu ) )
+           gl = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           gu = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           ! [jan/28/2009] remove the line below since spdiam variable not use
+           ! spdiam = gu - gl
+           ! input arguments for stdlib_dlaebz:
+           ! the relative tolerance.  an interval (a,b] lies within
+           ! "relative tolerance" if  b-a < reltol*max(|a|,|b|),
+           rtoli = reltol
+           ! set the absolute tolerance for interval convergence to zero to force
+           ! interval convergence based on relative size of the interval.
+           ! this is dangerous because intervals might not converge when reltol is
+           ! small. but at least a very small number should be selected so that for
+           ! strongly graded matrices, the code can get relatively accurate
+           ! eigenvalues.
+           atoli = fudge*two*uflow + fudge*two*pivmin
+           if( irange==indrng ) then
+              ! range='i': compute an interval containing eigenvalues
+              ! il through iu. the initial interval [gl,gu] from the global
+              ! gerschgorin bounds gl and gu is refined by stdlib_dlaebz.
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_dlaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin,d, e, e2, iwork( 5 )&
+                        , work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iinfo /= 0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! on exit, output intervals may not be ordered by ascending negcount
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              ! on exit, the interval [wl, wlu] contains a value with negcount nwl,
+              ! and [wul, wu] contains a value with negcount nwu.
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           elseif( irange==valrng ) then
+              wl = vl
+              wu = vu
+           elseif( irange==allrng ) then
+              wl = gl
+              wu = gu
+           endif
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jblk = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jblk )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! 1x1 block
+                 if( wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==allrng .or.( wl<d( ibegin )-pivmin.and. wu>= d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                 end if
+              ! disabled 2x2 case because of a failure on the following matrix
+              ! range = 'i', il = iu = 4
+                ! original tridiagonal, d = [
+                 ! -0.150102010615740e+00_dp
+                 ! -0.849897989384260e+00_dp
+                 ! -0.128208148052635e-15_dp
+                  ! 0.128257718286320e-15_dp
+                ! ];
+                ! e = [
+                 ! -0.357171383266986e+00_dp
+                 ! -0.180411241501588e-15_dp
+                 ! -0.175152352710251e-15_dp
+                ! ];
+               ! else if( in==2 ) then
+      ! *           2x2 block
+                  ! disc = sqrt( (half*(d(ibegin)-d(iend)))**2 + e(ibegin)**2 )
+                  ! tmp1 = half*(d(ibegin)+d(iend))
+                  ! l1 = tmp1 - disc
+                  ! if( wl>= l1-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l1-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l1-pivmin .and. wu>=
+           ! $          l1-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l1
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 1
+                  ! endif
+                  ! l2 = tmp1 + disc
+                  ! if( wl>= l2-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l2-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l2-pivmin .and. wu>=
+           ! $          l2-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l2
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 2
+                  ! endif
+              else
+                 ! general case - block of size in >= 2
+                 ! compute local gerschgorin interval and use it as the initial
+                 ! interval for stdlib_dlaebz
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend
+                    gl =  min( gl, gers( 2*j - 1))
+                    gu = max( gu, gers(2*j) )
+                 end do
+                 ! [jan/28/2009]
+                 ! change spdiam by tnorm in lines 2 and 3 thereafter
+                 ! line 1: remove computation of spdiam (not useful anymore)
+                 ! spdiam = gu - gl
+                 ! gl = gl - fudge*spdiam*eps*in - fudge*pivmin
+                 ! gu = gu + fudge*spdiam*eps*in + fudge*pivmin
+                 gl = gl - fudge*tnorm*eps*in - fudge*pivmin
+                 gu = gu + fudge*tnorm*eps*in + fudge*pivmin
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       ! the local block contains none of the wanted eigenvalues
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    ! refine search interval if possible, only range (wl,wu] matters
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! find negcount of initial interval boundaries gl and gu
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_dlaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 ),&
+                            iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_dlaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( m+&
+                            1 ), iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 ! copy eigenvalues into w and iblock
+                 ! use -jblk for block number for unconverged eigenvalues.
+                 ! loop over the number of output intervals from stdlib_dlaebz
+                 do j = 1, iout
+                    ! eigenvalue approximation is middle point of interval
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! semi length of error interval
+                    tmp2 = half*abs( work( j+n )-work( j+in+n ) )
+                    if( j>iout-iinfo ) then
+                       ! flag non-convergence.
+                       ncnvrg = .true.
+                       ib = -jblk
+                    else
+                       ib = jblk
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       werr( je ) = tmp2
+                       indexw( je ) = je - iwoff
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==indrng ) then
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 ) then
+                 im = 0
+                 do je = 1, m
+                    ! remove some of the smallest eigenvalues from the left so that
+                    ! at the end idiscl =0. move all eigenvalues up to the left.
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscu>0 ) then
+                 ! remove some of the largest eigenvalues from the right so that
+                 ! at the end idiscu =0. move all eigenvalues up to the left.
+                 im=m+1
+                 do je = m, 1, -1
+                    if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im - 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 jee = 0
+                 do je = im, m
+                    jee = jee + 1
+                    w( jee ) = w( je )
+                    werr( jee ) = werr( je )
+                    indexw( jee ) = indexw( je )
+                    iblock( jee ) = iblock( je )
+                 end do
+                 m = m-im+1
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic. (if n(w) is
+                 ! monotone non-decreasing, this should never happen.)
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by marking the corresponding iblock = 0
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>=wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 ! now erase all eigenvalues with iblock set to zero
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           if(( irange==allrng .and. m/=n ).or.( irange==indrng .and. m/=iu-il+1 ) ) then
+              toofew = .true.
+           end if
+           ! if order='b', do nothing the eigenvalues are already sorted by
+              ! block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( stdlib_lsame(order,'E') .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    tmp2 = werr( ie )
+                    itmp1 = iblock( ie )
+                    itmp2 = indexw( ie )
+                    w( ie ) = w( je )
+                    werr( ie ) = werr( je )
+                    iblock( ie ) = iblock( je )
+                    indexw( ie ) = indexw( je )
+                    w( je ) = tmp1
+                    werr( je ) = tmp2
+                    iblock( je ) = itmp1
+                    indexw( je ) = itmp2
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_dlarrd
+
+     !> Given the initial eigenvalue approximations of T, DLARRJ:
+     !> does  bisection to refine the eigenvalues of T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses in WERR. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_dlarrj( n, d, e2, ifirst, ilast,rtol, offset, w, werr, work, iwork,&
+               pivmin, spdiam, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: pivmin, rtol, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: d(*), e2(*)
+           real(dp), intent(inout) :: w(*), werr(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: cnt, i, i1, i2, ii, iter, j, k, next, nint, olnint, p, prev, &
+                     savi1
+           real(dp) :: dplus, fac, left, mid, right, s, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           i2 = ilast
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           loop_75: do i = i1, i2
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              mid = w(ii)
+              right = w( ii ) + werr( ii )
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              ! the following test prevents the test of converged intervals
+              if( width<rtol*tmp ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<i2)) i1 = i + 1
+                 if((prev>=i1).and.(i<=i2)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 ! make sure that [left,right] contains the desired eigenvalue
+                 ! do while( cnt(left)>i-1 )
+                 fac = one
+                 20 continue
+                 cnt = 0
+                 s = left
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt>i-1 ) then
+                    left = left - werr( ii )*fac
+                    fac = two*fac
+                    go to 20
+                 end if
+                 ! do while( cnt(right)<i )
+                 fac = one
+                 50 continue
+                 cnt = 0
+                 s = right
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt<i ) then
+                    right = right + werr( ii )*fac
+                    fac = two*fac
+                    go to 50
+                 end if
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = cnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           savi1 = i1
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do p = 1, olnint
+              k = 2*i
+              ii = i - offset
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              if( ( width<rtol*tmp ) .or.(iter==maxitr) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              cnt = 0
+              s = mid
+              dplus = d( 1 ) - s
+              if( dplus<zero ) cnt = cnt + 1
+              do j = 2, n
+                 dplus = d( j ) - s - e2( j-1 )/dplus
+                 if( dplus<zero ) cnt = cnt + 1
+              end do
+              if( cnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = savi1, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           return
+     end subroutine stdlib_dlarrj
+
+     !> DLARRK: computes one eigenvalue of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from DSTEMR.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_dlarrk( n, iw, gl, gu,d, e2, pivmin, reltol, w, werr, info)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: iw, n
+           real(dp), intent(in) :: pivmin, reltol, gl, gu
+           real(dp), intent(out) :: w, werr
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), e2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: fudge = two
+           
+           ! Local Scalars 
+           integer(ilp) :: i, it, itmax, negcnt
+           real(dp) :: atoli, eps, left, mid, right, rtoli, tmp1, tmp2, tnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           tnorm = max( abs( gl ), abs( gu ) )
+           rtoli = reltol
+           atoli = fudge*two*pivmin
+           itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           info = -1
+           left = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           right = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           it = 0
+           10 continue
+           ! check if interval converged or maximum number of iterations reached
+           tmp1 = abs( right - left )
+           tmp2 = max( abs(right), abs(left) )
+           if( tmp1<max( atoli, pivmin, rtoli*tmp2 ) ) then
+              info = 0
+              goto 30
+           endif
+           if(it>itmax)goto 30
+           ! count number of negative pivots for mid-point
+           it = it + 1
+           mid = half * (left + right)
+           negcnt = 0
+           tmp1 = d( 1 ) - mid
+           if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+           if( tmp1<=zero )negcnt = negcnt + 1
+           do i = 2, n
+              tmp1 = d( i ) - e2( i-1 ) / tmp1 - mid
+              if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+              if( tmp1<=zero )negcnt = negcnt + 1
+           end do
+           if(negcnt>=iw) then
+              right = mid
+           else
+              left = mid
+           endif
+           goto 10
+           30 continue
+           ! converged or maximum number of iterations reached
+           w = half * (left + right)
+           werr = half * abs( right - left )
+           return
+     end subroutine stdlib_dlarrk
+
+     !> Perform tests to decide whether the symmetric tridiagonal matrix T
+     !> warrants expensive computations which guarantee high relative accuracy
+     !> in the eigenvalues.
+
+     pure subroutine stdlib_dlarrr( n, d, e, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           real(dp), intent(inout) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: relcond = 0.999_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: yesrel
+           real(dp) :: eps, safmin, smlnum, rmin, tmp, tmp2, offdig, offdig2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! as a default, do not go for relative-accuracy preserving computations.
+           info = 1
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           rmin = sqrt( smlnum )
+           ! tests for relative accuracy
+           ! test for scaled diagonal dominance
+           ! scale the diagonal entries to one and check whether the sum of the
+           ! off-diagonals is less than one
+           ! the sdd relative error bounds have a 1/(1- 2*x) factor in them,
+           ! x = max(offdig + offdig2), so when x is close to 1/2, no relative
+           ! accuracy is promised.  in the notation of the code fragment below,
+           ! 1/(1 - (offdig + offdig2)) is the condition number.
+           ! we don't think it is worth going into "sdd mode" unless the relative
+           ! condition number is reasonable, not 1/macheps.
+           ! the threshold should be compatible with other thresholds used in the
+           ! code. we set  offdig + offdig2 <= .999_dp =: relcond, it corresponds
+           ! to losing at most 3 decimal digits: 1 / (1 - (offdig + offdig2)) <= 1000
+           ! instead of the current offdig + offdig2 < 1
+           yesrel = .true.
+           offdig = zero
+           tmp = sqrt(abs(d(1)))
+           if (tmp<rmin) yesrel = .false.
+           if(.not.yesrel) goto 11
+           do i = 2, n
+              tmp2 = sqrt(abs(d(i)))
+              if (tmp2<rmin) yesrel = .false.
+              if(.not.yesrel) goto 11
+              offdig2 = abs(e(i-1))/(tmp*tmp2)
+              if(offdig+offdig2>=relcond) yesrel = .false.
+              if(.not.yesrel) goto 11
+              tmp = tmp2
+              offdig = offdig2
+           end do
+           11 continue
+           if( yesrel ) then
+              info = 0
+              return
+           else
+           endif
+           ! *** more to be implemented ***
+           ! test if the lower bidiagonal matrix l from t = l d l^t
+           ! (zero shift facto) is well conditioned
+           ! test if the upper bidiagonal matrix u from t = u d u^t
+           ! (zero shift facto) is well conditioned.
+           ! in this case, the matrix needs to be flipped and, at the end
+           ! of the eigenvector computation, the flip needs to be applied
+           ! to the computed eigenvectors (and the support)
+           return
+     end subroutine stdlib_dlarrr
+
+     !> !
+     !>
+     !> DLARTG: generates a plane rotation so that
+     !> [  C  S  ]  .  [ F ]  =  [ R ]
+     !> [ -S  C  ]     [ G ]     [ 0 ]
+     !> where C**2 + S**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> R = sign(F) * sqrt(F**2 + G**2)
+     !> C = F / R
+     !> S = G / R
+     !> Hence C >= 0. The algorithm used to compute these quantities
+     !> incorporates scaling to avoid overflow or underflow in computing the
+     !> square root of the sum of squares.
+     !> This version is discontinuous in R at F = 0 but it returns the same
+     !> C and S as ZLARTG for complex inputs (F,0) and (G,0).
+     !> This is a more accurate version of the BLAS1 routine DROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0 and (G .ne. 0), then C=0 and S=sign(1,G) without doing any
+     !> floating point operations (saves work in DBDSQR when
+     !> there are zeros on the diagonal).
+     !> If F exceeds G in magnitude, C will be positive.
+     !> Below, wp=>dp stands for double precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_dlartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(dp), intent(out) :: c, r, s
+        real(dp), intent(in) :: f, g
+        ! Local Scalars 
+        real(dp) :: d, f1, fs, g1, gs, p, u, uu
+        ! Intrinsic Functions 
+        intrinsic :: abs,sign,sqrt
+        ! Executable Statements 
+        f1 = abs( f )
+        g1 = abs( g )
+        if( g == zero ) then
+           c = one
+           s = zero
+           r = f
+        else if( f == zero ) then
+           c = zero
+           s = sign( one, g )
+           r = g1
+     else if( f1 > rtmin .and. f1 < rtmax .and.            g1 > rtmin .and. g1 < rtmax ) &
+               then
+           d = sqrt( f*f + g*g )
+           p = one / d
+           c = f1*p
+           s = g*sign( p, f )
+           r = sign( d, f )
+        else
+           u = min( safmax, max( safmin, f1, g1 ) )
+           uu = one / u
+           fs = f*uu
+           gs = g*uu
+           d = sqrt( fs*fs + gs*gs )
+           p = one / d
+           c = abs( fs )*p
+           s = gs*sign( p, f )
+           r = sign( d, f )*u
+        end if
+        return
+     end subroutine stdlib_dlartg
+
+     !> DLARTGP: generates a plane rotation so that
+     !> [  CS  SN  ]  .  [ F ]  =  [ R ]   where CS**2 + SN**2 = 1.
+     !> [ -SN  CS  ]     [ G ]     [ 0 ]
+     !> This is a slower, more accurate version of the Level 1 BLAS routine DROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then CS=(+/-)1 and SN=0.
+     !> If F=0 and (G .ne. 0), then CS=0 and SN=(+/-)1.
+     !> The sign is chosen so that R >= 0.
+
+     pure subroutine stdlib_dlartgp( f, g, cs, sn, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(out) :: cs, r, sn
+           real(dp), intent(in) :: f, g
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i
+           real(dp) :: eps, f1, g1, safmin, safmn2, safmx2, scale
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,sign,sqrt
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Executable Statements 
+           ! if( first ) then
+              safmin = stdlib_dlamch( 'S' )
+              eps = stdlib_dlamch( 'E' )
+              safmn2 = stdlib_dlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_dlamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+              ! first = .false.
+           ! end if
+           if( g==zero ) then
+              cs = sign( one, f )
+              sn = zero
+              r = abs( f )
+           else if( f==zero ) then
+              cs = zero
+              sn = sign( one, g )
+              r = abs( g )
+           else
+              f1 = f
+              g1 = g
+              scale = max( abs( f1 ), abs( g1 ) )
+              if( scale>=safmx2 ) then
+                 count = 0
+                 10 continue
+                 count = count + 1
+                 f1 = f1*safmn2
+                 g1 = g1*safmn2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale>=safmx2 .and. count < 20 )go to 10
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmx2
+                 end do
+              else if( scale<=safmn2 ) then
+                 count = 0
+                 30 continue
+                 count = count + 1
+                 f1 = f1*safmx2
+                 g1 = g1*safmx2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale<=safmn2 )go to 30
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmn2
+                 end do
+              else
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+              end if
+              if( r<zero ) then
+                 cs = -cs
+                 sn = -sn
+                 r = -r
+              end if
+           end if
+           return
+     end subroutine stdlib_dlartgp
+
+     !> DLARTGS: generates a plane rotation designed to introduce a bulge in
+     !> Golub-Reinsch-style implicit QR iteration for the bidiagonal SVD
+     !> problem. X and Y are the top-row entries, and SIGMA is the shift.
+     !> The computed CS and SN define a plane rotation satisfying
+     !> [  CS  SN  ]  .  [ X^2 - SIGMA ]  =  [ R ],
+     !> [ -SN  CS  ]     [    X * Y    ]     [ 0 ]
+     !> with R nonnegative.  If X^2 - SIGMA and X * Y are 0, then the
+     !> rotation is by PI/2.
+
+     pure subroutine stdlib_dlartgs( x, y, sigma, cs, sn )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(out) :: cs, sn
+           real(dp), intent(in) :: sigma, x, y
+        ! ===================================================================
+           
+           ! Local Scalars 
+           real(dp) :: r, s, thresh, w, z
+           thresh = stdlib_dlamch('E')
+           ! compute the first column of b**t*b - sigma^2*i, up to a scale
+           ! factor.
+           if( (sigma == zero .and. abs(x) < thresh) .or.(abs(x) == sigma .and. y == zero) ) &
+                     then
+              z = zero
+              w = zero
+           else if( sigma == zero ) then
+              if( x >= zero ) then
+                 z = x
+                 w = y
+              else
+                 z = -x
+                 w = -y
+              end if
+           else if( abs(x) < thresh ) then
+              z = -sigma*sigma
+              w = zero
+           else
+              if( x >= zero ) then
+                 s = one
+              else
+                 s = negone
+              end if
+              z = s * (abs(x)-sigma) * (s+sigma/x)
+              w = s * y
+           end if
+           ! generate the rotation.
+           ! call stdlib_dlartgp( z, w, cs, sn, r ) might seem more natural;
+           ! reordering the arguments ensures that if z = 0 then the rotation
+           ! is by pi/2.
+           call stdlib_dlartgp( w, z, sn, cs, r )
+           return
+           ! end stdlib_dlartgs
+     end subroutine stdlib_dlartgs
+
+     !> DLARTV: applies a vector of real plane rotations to elements of the
+     !> real vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (  c(i)  s(i) ) ( x(i) )
+     !> ( y(i) )    ( -s(i)  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_dlartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*), s(*)
+           real(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(dp) :: xi, yi
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - s( ic )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_dlartv
+
+     !> DLARUV: returns a vector of n random real numbers from a uniform (0,1)
+     !> distribution (n <= 128).
+     !> This is an auxiliary routine called by DLARNV and ZLARNV.
+
+     pure subroutine stdlib_dlaruv( iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(dp), intent(out) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           integer(ilp), parameter :: ipw2 = 4096
+           real(dp), parameter :: r = one/ipw2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, i3, i4, it1, it2, it3, it4, j
+           ! Local Arrays 
+           integer(ilp) :: mm(lv,4)
+           ! Intrinsic Functions 
+           intrinsic :: real,min,mod
+           ! Data Statements 
+           mm(1,1:4)=[494,322,2508,2549]
+           mm(2,1:4)=[2637,789,3754,1145]
+           mm(3,1:4)=[255,1440,1766,2253]
+           mm(4,1:4)=[2008,752,3572,305]
+           mm(5,1:4)=[1253,2859,2893,3301]
+           mm(6,1:4)=[3344,123,307,1065]
+           mm(7,1:4)=[4084,1848,1297,3133]
+           mm(8,1:4)=[1739,643,3966,2913]
+           mm(9,1:4)=[3143,2405,758,3285]
+           mm(10,1:4)=[3468,2638,2598,1241]
+           mm(11,1:4)=[688,2344,3406,1197]
+           mm(12,1:4)=[1657,46,2922,3729]
+           mm(13,1:4)=[1238,3814,1038,2501]
+           mm(14,1:4)=[3166,913,2934,1673]
+           mm(15,1:4)=[1292,3649,2091,541]
+           mm(16,1:4)=[3422,339,2451,2753]
+           mm(17,1:4)=[1270,3808,1580,949]
+           mm(18,1:4)=[2016,822,1958,2361]
+           mm(19,1:4)=[154,2832,2055,1165]
+           mm(20,1:4)=[2862,3078,1507,4081]
+           mm(21,1:4)=[697,3633,1078,2725]
+           mm(22,1:4)=[1706,2970,3273,3305]
+           mm(23,1:4)=[491,637,17,3069]
+           mm(24,1:4)=[931,2249,854,3617]
+           mm(25,1:4)=[1444,2081,2916,3733]
+           mm(26,1:4)=[444,4019,3971,409]
+           mm(27,1:4)=[3577,1478,2889,2157]
+           mm(28,1:4)=[3944,242,3831,1361]
+           mm(29,1:4)=[2184,481,2621,3973]
+           mm(30,1:4)=[1661,2075,1541,1865]
+           mm(31,1:4)=[3482,4058,893,2525]
+           mm(32,1:4)=[657,622,736,1409]
+           mm(33,1:4)=[3023,3376,3992,3445]
+           mm(34,1:4)=[3618,812,787,3577]
+           mm(35,1:4)=[1267,234,2125,77]
+           mm(36,1:4)=[1828,641,2364,3761]
+           mm(37,1:4)=[164,4005,2460,2149]
+           mm(38,1:4)=[3798,1122,257,1449]
+           mm(39,1:4)=[3087,3135,1574,3005]
+           mm(40,1:4)=[2400,2640,3912,225]
+           mm(41,1:4)=[2870,2302,1216,85]
+           mm(42,1:4)=[3876,40,3248,3673]
+           mm(43,1:4)=[1905,1832,3401,3117]
+           mm(44,1:4)=[1593,2247,2124,3089]
+           mm(45,1:4)=[1797,2034,2762,1349]
+           mm(46,1:4)=[1234,2637,149,2057]
+           mm(47,1:4)=[3460,1287,2245,413]
+           mm(48,1:4)=[328,1691,166,65]
+           mm(49,1:4)=[2861,496,466,1845]
+           mm(50,1:4)=[1950,1597,4018,697]
+           mm(51,1:4)=[617,2394,1399,3085]
+           mm(52,1:4)=[2070,2584,190,3441]
+           mm(53,1:4)=[3331,1843,2879,1573]
+           mm(54,1:4)=[769,336,153,3689]
+           mm(55,1:4)=[1558,1472,2320,2941]
+           mm(56,1:4)=[2412,2407,18,929]
+           mm(57,1:4)=[2800,433,712,533]
+           mm(58,1:4)=[189,2096,2159,2841]
+           mm(59,1:4)=[287,1761,2318,4077]
+           mm(60,1:4)=[2045,2810,2091,721]
+           mm(61,1:4)=[1227,566,3443,2821]
+           mm(62,1:4)=[2838,442,1510,2249]
+           mm(63,1:4)=[209,41,449,2397]
+           mm(64,1:4)=[2770,1238,1956,2817]
+           mm(65,1:4)=[3654,1086,2201,245]
+           mm(66,1:4)=[3993,603,3137,1913]
+           mm(67,1:4)=[192,840,3399,1997]
+           mm(68,1:4)=[2253,3168,1321,3121]
+           mm(69,1:4)=[3491,1499,2271,997]
+           mm(70,1:4)=[2889,1084,3667,1833]
+           mm(71,1:4)=[2857,3438,2703,2877]
+           mm(72,1:4)=[2094,2408,629,1633]
+           mm(73,1:4)=[1818,1589,2365,981]
+           mm(74,1:4)=[688,2391,2431,2009]
+           mm(75,1:4)=[1407,288,1113,941]
+           mm(76,1:4)=[634,26,3922,2449]
+           mm(77,1:4)=[3231,512,2554,197]
+           mm(78,1:4)=[815,1456,184,2441]
+           mm(79,1:4)=[3524,171,2099,285]
+           mm(80,1:4)=[1914,1677,3228,1473]
+           mm(81,1:4)=[516,2657,4012,2741]
+           mm(82,1:4)=[164,2270,1921,3129]
+           mm(83,1:4)=[303,2587,3452,909]
+           mm(84,1:4)=[2144,2961,3901,2801]
+           mm(85,1:4)=[3480,1970,572,421]
+           mm(86,1:4)=[119,1817,3309,4073]
+           mm(87,1:4)=[3357,676,3171,2813]
+           mm(88,1:4)=[837,1410,817,2337]
+           mm(89,1:4)=[2826,3723,3039,1429]
+           mm(90,1:4)=[2332,2803,1696,1177]
+           mm(91,1:4)=[2089,3185,1256,1901]
+           mm(92,1:4)=[3780,184,3715,81]
+           mm(93,1:4)=[1700,663,2077,1669]
+           mm(94,1:4)=[3712,499,3019,2633]
+           mm(95,1:4)=[150,3784,1497,2269]
+           mm(96,1:4)=[2000,1631,1101,129]
+           mm(97,1:4)=[3375,1925,717,1141]
+           mm(98,1:4)=[1621,3912,51,249]
+           mm(99,1:4)=[3090,1398,981,3917]
+           mm(100,1:4)=[3765,1349,1978,2481]
+           mm(101,1:4)=[1149,1441,1813,3941]
+           mm(102,1:4)=[3146,2224,3881,2217]
+           mm(103,1:4)=[33,2411,76,2749]
+           mm(104,1:4)=[3082,1907,3846,3041]
+           mm(105,1:4)=[2741,3192,3694,1877]
+           mm(106,1:4)=[359,2786,1682,345]
+           mm(107,1:4)=[3316,382,124,2861]
+           mm(108,1:4)=[1749,37,1660,1809]
+           mm(109,1:4)=[185,759,3997,3141]
+           mm(110,1:4)=[2784,2948,479,2825]
+           mm(111,1:4)=[2202,1862,1141,157]
+           mm(112,1:4)=[2199,3802,886,2881]
+           mm(113,1:4)=[1364,2423,3514,3637]
+           mm(114,1:4)=[1244,2051,1301,1465]
+           mm(115,1:4)=[2020,2295,3604,2829]
+           mm(116,1:4)=[3160,1332,1888,2161]
+           mm(117,1:4)=[2785,1832,1836,3365]
+           mm(118,1:4)=[2772,2405,1990,361]
+           mm(119,1:4)=[1217,3638,2058,2685]
+           mm(120,1:4)=[1822,3661,692,3745]
+           mm(121,1:4)=[1245,327,1194,2325]
+           mm(122,1:4)=[2252,3660,20,3609]
+           mm(123,1:4)=[3904,716,3285,3821]
+           mm(124,1:4)=[2774,1842,2046,3537]
+           mm(125,1:4)=[997,3987,2107,517]
+           mm(126,1:4)=[2573,1368,3508,3017]
+           mm(127,1:4)=[1148,1848,3525,2141]
+           mm(128,1:4)=[545,2366,3801,1537]
+           ! Executable Statements 
+           i1 = iseed( 1 )
+           i2 = iseed( 2 )
+           i3 = iseed( 3 )
+           i4 = iseed( 4 )
+           loop_10: do i = 1, min( n, lv )
+           20 continue
+              ! multiply the seed by i-th power of the multiplier modulo 2**48
+              it4 = i4*mm( i, 4 )
+              it3 = it4 / ipw2
+              it4 = it4 - ipw2*it3
+              it3 = it3 + i3*mm( i, 4 ) + i4*mm( i, 3 )
+              it2 = it3 / ipw2
+              it3 = it3 - ipw2*it2
+              it2 = it2 + i2*mm( i, 4 ) + i3*mm( i, 3 ) + i4*mm( i, 2 )
+              it1 = it2 / ipw2
+              it2 = it2 - ipw2*it1
+              it1 = it1 + i1*mm( i, 4 ) + i2*mm( i, 3 ) + i3*mm( i, 2 ) +i4*mm( i, 1 )
+              it1 = mod( it1, ipw2 )
+              ! convert 48-bit integer to a realnumber in the interval (0,1,KIND=dp)
+              x( i ) = r*( real( it1,KIND=dp)+r*( real( it2,KIND=dp)+r*( real( it3,KIND=dp)+&
+                        r*real( it4,KIND=dp) ) ) )
+              if (x( i )==1.0_dp) then
+                 ! if a real number has n bits of precision, and the first
+                 ! n bits of the 48-bit integer above happen to be all 1 (which
+                 ! will occur about once every 2**n calls), then x( i ) will
+                 ! be rounded to exactly one.
+                 ! since x( i ) is not supposed to return exactly 0.0_dp or 1.0_dp,
+                 ! the statistically correct thing to do in this situation is
+                 ! simply to iterate again.
+                 ! n.b. the case x( i ) = 0.0_dp should not be possible.
+                 i1 = i1 + 2
+                 i2 = i2 + 2
+                 i3 = i3 + 2
+                 i4 = i4 + 2
+                 goto 20
+              end if
+           end do loop_10
+           ! return final value of seed
+           iseed( 1 ) = it1
+           iseed( 2 ) = it2
+           iseed( 3 ) = it3
+           iseed( 4 ) = it4
+           return
+     end subroutine stdlib_dlaruv
+
+     !> DLARZ: applies a real elementary reflector H to a real M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> H is a product of k elementary reflectors as returned by DTZRZF.
+
+     pure subroutine stdlib_dlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           real(dp), intent(in) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(in) :: v(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=zero ) then
+                 ! w( 1:n ) = c( 1, 1:n )
+                 call stdlib_dcopy( n, c, ldc, work, 1 )
+                 ! w( 1:n ) = w( 1:n ) + c( m-l+1:m, 1:n )**t * v( 1:l )
+                 call stdlib_dgemv( 'TRANSPOSE', l, n, one, c( m-l+1, 1 ), ldc, v,incv, one, work,&
+                            1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_daxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**t
+                 call stdlib_dger( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=zero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_dcopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_dgemv( 'NO TRANSPOSE', m, l, one, c( 1, n-l+1 ), ldc,v, incv, one, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_daxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**t
+                 call stdlib_dger( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_dlarz
+
+     !> DLARZB: applies a real block reflector H or its transpose H**T to
+     !> a real distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_dlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           real(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**t * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**t
+              do j = 1, k
+                 call stdlib_dcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**t * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, l, one,c( m-l+1, 1 ), &
+                        ldc, v, ldv, one, work, ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_dtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**t
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**t * w( 1:n, 1:k )**t
+              if( l>0 )call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -one, v, ldv,work, &
+                        ldwork, one, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**t
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_dcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, one,c( 1, n-l+1 ),&
+                         ldc, v, ldv, one, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * t  or  w( 1:m, 1:k ) * t**t
+              call stdlib_dtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * v( 1:k, 1:l )
+              if( l>0 )call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -one,work, &
+                        ldwork, v, ldv, one, c( 1, n-l+1 ), ldc )
+           end if
+           return
+     end subroutine stdlib_dlarzb
+
+     !> DLARZT: forms the triangular factor T of a real block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_dlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(dp), intent(out) :: t(ldt,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==zero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = zero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**t
+                    call stdlib_dgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, zero,t( i+1, i ), 1 )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_dtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_dlarzt
+
+     !> DLAS2:  computes the singular values of the 2-by-2 matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, SSMIN is the smaller singular value and SSMAX is the
+     !> larger singular value.
+
+     pure subroutine stdlib_dlas2( f, g, h, ssmin, ssmax )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: f, g, h
+           real(dp), intent(out) :: ssmax, ssmin
+        ! ====================================================================
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: as, at, au, c, fa, fhmn, fhmx, ga, ha
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           fa = abs( f )
+           ga = abs( g )
+           ha = abs( h )
+           fhmn = min( fa, ha )
+           fhmx = max( fa, ha )
+           if( fhmn==zero ) then
+              ssmin = zero
+              if( fhmx==zero ) then
+                 ssmax = ga
+              else
+                 ssmax = max( fhmx, ga )*sqrt( one+( min( fhmx, ga ) / max( fhmx, ga ) )**2 )
+                           
+              end if
+           else
+              if( ga<fhmx ) then
+                 as = one + fhmn / fhmx
+                 at = ( fhmx-fhmn ) / fhmx
+                 au = ( ga / fhmx )**2
+                 c = two / ( sqrt( as*as+au )+sqrt( at*at+au ) )
+                 ssmin = fhmn*c
+                 ssmax = fhmx / c
+              else
+                 au = fhmx / ga
+                 if( au==zero ) then
+                    ! avoid possible harmful underflow if exponent range
+                    ! asymmetric (true ssmin may not underflow even if
+                    ! au underflows)
+                    ssmin = ( fhmn*fhmx ) / ga
+                    ssmax = ga
+                 else
+                    as = one + fhmn / fhmx
+                    at = ( fhmx-fhmn ) / fhmx
+                    c = one / ( sqrt( one+( as*au )**2 )+sqrt( one+( at*au )**2 ) )
+                    ssmin = ( fhmn*c )*au
+                    ssmin = ssmin + ssmin
+                    ssmax = ga / ( c+c )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dlas2
+
+     !> This subroutine computes the square root of the I-th eigenvalue
+     !> of a positive symmetric rank-one modification of a 2-by-2 diagonal
+     !> matrix
+     !> diag( D ) * diag( D ) +  RHO * Z * transpose(Z) .
+     !> The diagonal entries in the array D are assumed to satisfy
+     !> 0 <= D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_dlasd5( i, d, z, delta, rho, dsigma, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(dp), intent(out) :: dsigma
+           real(dp), intent(in) :: rho
+           ! Array Arguments 
+           real(dp), intent(in) :: d(2), z(2)
+           real(dp), intent(out) :: delta(2), work(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: b, c, del, delsq, tau, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           delsq = del*( d( 2 )+d( 1 ) )
+           if( i==1 ) then
+              w = one + four*rho*( z( 2 )*z( 2 ) / ( d( 1 )+three*d( 2 ) )-z( 1 )*z( 1 ) / ( &
+                        three*d( 1 )+d( 2 ) ) ) / del
+              if( w>zero ) then
+                 b = delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*delsq
+                 ! b > zero, always
+                 ! the following tau is dsigma * dsigma - d( 1 ) * d( 1 )
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 ! the following tau is dsigma - d( 1 )
+                 tau = tau / ( d( 1 )+sqrt( d( 1 )*d( 1 )+tau ) )
+                 dsigma = d( 1 ) + tau
+                 delta( 1 ) = -tau
+                 delta( 2 ) = del - tau
+                 work( 1 ) = two*d( 1 ) + tau
+                 work( 2 ) = ( d( 1 )+tau ) + d( 2 )
+                 ! delta( 1 ) = -z( 1 ) / tau
+                 ! delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*delsq
+                 ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 ! the following tau is dsigma - d( 2 )
+                 tau = tau / ( d( 2 )+sqrt( abs( d( 2 )*d( 2 )+tau ) ) )
+                 dsigma = d( 2 ) + tau
+                 delta( 1 ) = -( del+tau )
+                 delta( 2 ) = -tau
+                 work( 1 ) = d( 1 ) + tau + d( 2 )
+                 work( 2 ) = two*d( 2 ) + tau
+                 ! delta( 1 ) = -z( 1 ) / ( del+tau )
+                 ! delta( 2 ) = -z( 2 ) / tau
+              end if
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           else
+              ! now i=2
+              b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*delsq
+              ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              ! the following tau is dsigma - d( 2 )
+              tau = tau / ( d( 2 )+sqrt( d( 2 )*d( 2 )+tau ) )
+              dsigma = d( 2 ) + tau
+              delta( 1 ) = -( del+tau )
+              delta( 2 ) = -tau
+              work( 1 ) = d( 1 ) + tau + d( 2 )
+              work( 2 ) = two*d( 2 ) + tau
+              ! delta( 1 ) = -z( 1 ) / ( del+tau )
+              ! delta( 2 ) = -z( 2 ) / tau
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_dlasd5
+
+     !> DLASDT: creates a tree of subproblems for bidiagonal divide and
+     !> conquer.
+
+     pure subroutine stdlib_dlasdt( n, lvl, nd, inode, ndiml, ndimr, msub )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: lvl, nd
+           integer(ilp), intent(in) :: msub, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: inode(*), ndiml(*), ndimr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, ir, llst, maxn, ncrnt, nlvl
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: real,int,log,max
+           ! Executable Statements 
+           ! find the number of levels on the tree.
+           maxn = max( 1, n )
+           temp = log( real( maxn,KIND=dp) / real( msub+1,KIND=dp) ) / log( two )
+           lvl = int( temp,KIND=ilp) + 1
+           i = n / 2
+           inode( 1 ) = i + 1
+           ndiml( 1 ) = i
+           ndimr( 1 ) = n - i - 1
+           il = 0
+           ir = 1
+           llst = 1
+           do nlvl = 1, lvl - 1
+              ! constructing the tree at (nlvl+1)-st level. the number of
+              ! nodes created on this level is llst * 2.
+              do i = 0, llst - 1
+                 il = il + 2
+                 ir = ir + 2
+                 ncrnt = llst + i
+                 ndiml( il ) = ndiml( ncrnt ) / 2
+                 ndimr( il ) = ndiml( ncrnt ) - ndiml( il ) - 1
+                 inode( il ) = inode( ncrnt ) - ndimr( il ) - 1
+                 ndiml( ir ) = ndimr( ncrnt ) / 2
+                 ndimr( ir ) = ndimr( ncrnt ) - ndiml( ir ) - 1
+                 inode( ir ) = inode( ncrnt ) + ndiml( ir ) + 1
+              end do
+              llst = llst*2
+           end do
+           nd = llst*2 - 1
+           return
+     end subroutine stdlib_dlasdt
+
+     !> DLASET: initializes an m-by-n matrix A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_dlaset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(dp), intent(out) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the strictly upper triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the strictly lower triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else
+              ! set the leading m-by-n submatrix to alpha.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           end if
+           ! set the first min(m,n) diagonal elements to beta.
+           do i = 1, min( m, n )
+              a( i, i ) = beta
+           end do
+           return
+     end subroutine stdlib_dlaset
+
+     !> DLASQ4: computes an approximation TAU to the smallest eigenvalue
+     !> using values of d from the previous transform.
+
+     pure subroutine stdlib_dlasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn,dn1, dn2, tau, &
+               ttype, g )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, n0in, pp
+           integer(ilp), intent(out) :: ttype
+           real(dp), intent(in) :: dmin, dmin1, dmin2, dn, dn1, dn2
+           real(dp), intent(inout) :: g
+           real(dp), intent(out) :: tau
+           ! Array Arguments 
+           real(dp), intent(in) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: cnst1 = 0.5630_dp
+           real(dp), parameter :: cnst2 = 1.010_dp
+           real(dp), parameter :: cnst3 = 1.050_dp
+           real(dp), parameter :: qurtr = 0.250_dp
+           real(dp), parameter :: third = 0.3330_dp
+           real(dp), parameter :: hundrd = 100.0_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i4, nn, np
+           real(dp) :: a2, b1, b2, gam, gap1, gap2, s
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! a negative dmin forces the shift to take that absolute value
+           ! ttype records the type of shift.
+           if( dmin<=zero ) then
+              tau = -dmin
+              ttype = -1
+              return
+           end if
+           nn = 4*n0 + pp
+           if( n0in==n0 ) then
+              ! no eigenvalues deflated.
+              if( dmin==dn .or. dmin==dn1 ) then
+                 b1 = sqrt( z( nn-3 ) )*sqrt( z( nn-5 ) )
+                 b2 = sqrt( z( nn-7 ) )*sqrt( z( nn-9 ) )
+                 a2 = z( nn-7 ) + z( nn-5 )
+                 ! cases 2 and 3.
+                 if( dmin==dn .and. dmin1==dn1 ) then
+                    gap2 = dmin2 - a2 - dmin2*qurtr
+                    if( gap2>zero .and. gap2>b2 ) then
+                       gap1 = a2 - dn - ( b2 / gap2 )*b2
+                    else
+                       gap1 = a2 - dn - ( b1+b2 )
+                    end if
+                    if( gap1>zero .and. gap1>b1 ) then
+                       s = max( dn-( b1 / gap1 )*b1, half*dmin )
+                       ttype = -2
+                    else
+                       s = zero
+                       if( dn>b1 )s = dn - b1
+                       if( a2>( b1+b2 ) )s = min( s, a2-( b1+b2 ) )
+                       s = max( s, third*dmin )
+                       ttype = -3
+                    end if
+                 else
+                    ! case 4.
+                    ttype = -4
+                    s = qurtr*dmin
+                    if( dmin==dn ) then
+                       gam = dn
+                       a2 = zero
+                       if( z( nn-5 ) > z( nn-7 ) )return
+                       b2 = z( nn-5 ) / z( nn-7 )
+                       np = nn - 9
+                    else
+                       np = nn - 2*pp
+                       gam = dn1
+                       if( z( np-4 ) > z( np-2 ) )return
+                       a2 = z( np-4 ) / z( np-2 )
+                       if( z( nn-9 ) > z( nn-11 ) )return
+                       b2 = z( nn-9 ) / z( nn-11 )
+                       np = nn - 13
+                    end if
+                    ! approximate contribution to norm squared from i < nn-1.
+                    a2 = a2 + b2
+                    do i4 = np, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 20
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 20
+                    end do
+                    20 continue
+                    a2 = cnst3*a2
+                    ! rayleigh quotient residual bound.
+                    if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+                 end if
+              else if( dmin==dn2 ) then
+                 ! case 5.
+                 ttype = -5
+                 s = qurtr*dmin
+                 ! compute contribution to norm squared from i > nn-2.
+                 np = nn - 2*pp
+                 b1 = z( np-2 )
+                 b2 = z( np-6 )
+                 gam = dn2
+                 if( z( np-8 )>b2 .or. z( np-4 )>b1 )return
+                 a2 = ( z( np-8 ) / b2 )*( one+z( np-4 ) / b1 )
+                 ! approximate contribution to norm squared from i < nn-2.
+                 if( n0-i0>2 ) then
+                    b2 = z( nn-13 ) / z( nn-15 )
+                    a2 = a2 + b2
+                    do i4 = nn - 17, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 40
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 40
+                    end do
+                    40 continue
+                    a2 = cnst3*a2
+                 end if
+                 if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+              else
+                 ! case 6, no information to guide us.
+                 if( ttype==-6 ) then
+                    g = g + third*( one-g )
+                 else if( ttype==-18 ) then
+                    g = qurtr*third
+                 else
+                    g = qurtr
+                 end if
+                 s = g*dmin
+                 ttype = -6
+              end if
+           else if( n0in==( n0+1 ) ) then
+              ! one eigenvalue just deflated. use dmin1, dn1 for dmin and dn.
+              if( dmin1==dn1 .and. dmin2==dn2 ) then
+                 ! cases 7 and 8.
+                 ttype = -7
+                 s = third*dmin1
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 60
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    a2 = b1
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*max( b1, a2 )<b2 )go to 60
+                 end do
+                 60 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin1 / ( one+b2**2 )
+                 gap2 = half*dmin2 - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                    ttype = -8
+                 end if
+              else
+                 ! case 9.
+                 s = qurtr*dmin1
+                 if( dmin1==dn1 )s = half*dmin1
+                 ttype = -9
+              end if
+           else if( n0in==( n0+2 ) ) then
+              ! two eigenvalues deflated. use dmin2, dn2 for dmin and dn.
+              ! cases 10 and 11.
+              if( dmin2==dn2 .and. two*z( nn-5 )<z( nn-7 ) ) then
+                 ttype = -10
+                 s = third*dmin2
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 80
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*b1<b2 )go to 80
+                 end do
+                 80 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin2 / ( one+b2**2 )
+                 gap2 = z( nn-7 ) + z( nn-9 ) -sqrt( z( nn-11 ) )*sqrt( z( nn-9 ) ) - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                 end if
+              else
+                 s = qurtr*dmin2
+                 ttype = -11
+              end if
+           else if( n0in>( n0+2 ) ) then
+              ! case 12, more than two eigenvalues deflated. no information.
+              s = zero
+              ttype = -12
+           end if
+           tau = s
+           return
+     end subroutine stdlib_dlasq4
+
+     !> DLASQ5: computes one dqds transform in ping-pong form, one
+     !> version for IEEE machines another for non IEEE machines.
+
+     pure subroutine stdlib_dlasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2,dn, dnm1, dnm2, &
+               ieee, eps )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(dp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           real(dp), intent(inout) :: tau
+           real(dp), intent(in) :: sigma, eps
+           ! Array Arguments 
+           real(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(dp) :: d, emin, temp, dthresh
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           dthresh = eps*(sigma+tau)
+           if( tau<dthresh*half ) tau = zero
+           if( tau/=zero ) then
+           j4 = 4*i0 + pp - 3
+           emin = z( j4+4 )
+           d = z( j4 ) - tau
+           dmin = d
+           dmin1 = -z( j4 )
+           if( ieee ) then
+              ! code for ieee arithmetic.
+              if( pp==0 ) then
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-2 ) = d + z( j4-1 )
+                    temp = z( j4+1 ) / z( j4-2 )
+                    d = d*temp - tau
+                    dmin = min( dmin, d )
+                    z( j4 ) = z( j4-1 )*temp
+                    emin = min( z( j4 ), emin )
+                 end do
+              else
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-3 ) = d + z( j4 )
+                    temp = z( j4+2 ) / z( j4-3 )
+                    d = d*temp - tau
+                    dmin = min( dmin, d )
+                    z( j4-1 ) = z( j4 )*temp
+                    emin = min( z( j4-1 ), emin )
+                 end do
+              end if
+              ! unroll last two steps.
+              dnm2 = d
+              dmin2 = dmin
+              j4 = 4*( n0-2 ) - pp
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm2 + z( j4p2 )
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+              dmin = min( dmin, dnm1 )
+              dmin1 = dmin
+              j4 = j4 + 4
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm1 + z( j4p2 )
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+              dmin = min( dmin, dn )
+           else
+              ! code for non ieee arithmetic.
+              if( pp==0 ) then
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-2 ) = d + z( j4-1 )
+                    if( d<zero ) then
+                       return
+                    else
+                       z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                       d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                    end if
+                    dmin = min( dmin, d )
+                    emin = min( emin, z( j4 ) )
+                 end do
+              else
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-3 ) = d + z( j4 )
+                    if( d<zero ) then
+                       return
+                    else
+                       z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                       d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                    end if
+                    dmin = min( dmin, d )
+                    emin = min( emin, z( j4-1 ) )
+                 end do
+              end if
+              ! unroll last two steps.
+              dnm2 = d
+              dmin2 = dmin
+              j4 = 4*( n0-2 ) - pp
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm2 + z( j4p2 )
+              if( dnm2<zero ) then
+                 return
+              else
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+              end if
+              dmin = min( dmin, dnm1 )
+              dmin1 = dmin
+              j4 = j4 + 4
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm1 + z( j4p2 )
+              if( dnm1<zero ) then
+                 return
+              else
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+              end if
+              dmin = min( dmin, dn )
+           end if
+           else
+           ! this is the version that sets d's to zero if they are small enough
+              j4 = 4*i0 + pp - 3
+              emin = z( j4+4 )
+              d = z( j4 ) - tau
+              dmin = d
+              dmin1 = -z( j4 )
+              if( ieee ) then
+           ! code for ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       temp = z( j4+1 ) / z( j4-2 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4 ) = z( j4-1 )*temp
+                       emin = min( z( j4 ), emin )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       temp = z( j4+2 ) / z( j4-3 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4-1 ) = z( j4 )*temp
+                       emin = min( z( j4-1 ), emin )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dn )
+              else
+           ! code for non ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                          d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                       end if
+                       if( d<dthresh) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4 ) )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                          d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                       end if
+                       if( d<dthresh) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4-1 ) )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 if( dnm2<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 if( dnm1<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dn )
+              end if
+           end if
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_dlasq5
+
+     !> DLASQ6: computes one dqd (shift equal to zero) transform in
+     !> ping-pong form, with protection against underflow and overflow.
+
+     pure subroutine stdlib_dlasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn,dnm1, dnm2 )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(dp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           ! Array Arguments 
+           real(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(dp) :: d, emin, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           j4 = 4*i0 + pp - 3
+           emin = z( j4+4 )
+           d = z( j4 )
+           dmin = d
+           if( pp==0 ) then
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-2 ) = d + z( j4-1 )
+                 if( z( j4-2 )==zero ) then
+                    z( j4 ) = zero
+                    d = z( j4+1 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+1 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4+1 ) ) &
+                           then
+                    temp = z( j4+1 ) / z( j4-2 )
+                    z( j4 ) = z( j4-1 )*temp
+                    d = d*temp
+                 else
+                    z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                    d = z( j4+1 )*( d / z( j4-2 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4 ) )
+              end do
+           else
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-3 ) = d + z( j4 )
+                 if( z( j4-3 )==zero ) then
+                    z( j4-1 ) = zero
+                    d = z( j4+2 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+2 )<z( j4-3 ) .and.safmin*z( j4-3 )<z( j4+2 ) ) &
+                           then
+                    temp = z( j4+2 ) / z( j4-3 )
+                    z( j4-1 ) = z( j4 )*temp
+                    d = d*temp
+                 else
+                    z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                    d = z( j4+2 )*( d / z( j4-3 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4-1 ) )
+              end do
+           end if
+           ! unroll last two steps.
+           dnm2 = d
+           dmin2 = dmin
+           j4 = 4*( n0-2 ) - pp
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm2 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dnm1 = z( j4p2+2 )
+              dmin = dnm1
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dnm1 = dnm2*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dnm1 )
+           dmin1 = dmin
+           j4 = j4 + 4
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm1 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dn = z( j4p2+2 )
+              dmin = dn
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dn = dnm1*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dn )
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_dlasq6
+
+     !> DLASR: applies a sequence of plane rotations to a real matrix A,
+     !> from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_dlasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: c(*), s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(dp) :: ctemp, stemp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dlasr
+
+     !> Sort the numbers in D in increasing order (if ID = 'I') or
+     !> in decreasing order (if ID = 'D' ).
+     !> Use Quick Sort, reverting to Insertion sort on arrays of
+     !> size <= 20. Dimension of STACK limits N to about 2**32.
+
+     pure subroutine stdlib_dlasrt( id, n, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: id
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: select = 20
+           
+           ! Local Scalars 
+           integer(ilp) :: dir, endd, i, j, start, stkpnt
+           real(dp) :: d1, d2, d3, dmnmx, tmp
+           ! Local Arrays 
+           integer(ilp) :: stack(2,32)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           dir = -1
+           if( stdlib_lsame( id, 'D' ) ) then
+              dir = 0
+           else if( stdlib_lsame( id, 'I' ) ) then
+              dir = 1
+           end if
+           if( dir==-1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           stkpnt = 1
+           stack( 1, 1 ) = 1
+           stack( 2, 1 ) = n
+           10 continue
+           start = stack( 1, stkpnt )
+           endd = stack( 2, stkpnt )
+           stkpnt = stkpnt - 1
+           if( endd-start<=select .and. endd-start>0 ) then
+              ! do insertion sort on d( start:endd )
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 loop_30: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )>d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_30
+                       end if
+                    end do
+                 end do loop_30
+              else
+                 ! sort into increasing order
+                 loop_50: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )<d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_50
+                       end if
+                    end do
+                 end do loop_50
+              end if
+           else if( endd-start>select ) then
+              ! partition d( start:endd ) and stack parts, largest one first
+              ! choose partition entry as median of 3
+              d1 = d( start )
+              d2 = d( endd )
+              i = ( start+endd ) / 2
+              d3 = d( i )
+              if( d1<d2 ) then
+                 if( d3<d1 ) then
+                    dmnmx = d1
+                 else if( d3<d2 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d2
+                 end if
+              else
+                 if( d3<d2 ) then
+                    dmnmx = d2
+                 else if( d3<d1 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d1
+                 end if
+              end if
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 i = start - 1
+                 j = endd + 1
+                 60 continue
+                 70 continue
+                 j = j - 1
+                 if( d( j )<dmnmx )go to 70
+                 80 continue
+                 i = i + 1
+                 if( d( i )>dmnmx )go to 80
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 60
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              else
+                 ! sort into increasing order
+                 i = start - 1
+                 j = endd + 1
+                 90 continue
+                 100 continue
+                 j = j - 1
+                 if( d( j )>dmnmx )go to 100
+                 110 continue
+                 i = i + 1
+                 if( d( i )<dmnmx )go to 110
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 90
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              end if
+           end if
+           if( stkpnt>0 )go to 10
+           return
+     end subroutine stdlib_dlasrt
+
+     !> !
+     !>
+     !> DLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_dlassq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(dp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        real(dp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(dp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_dlassq
+
+     !> DLASV2: computes the singular value decomposition of a 2-by-2
+     !> triangular matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, abs(SSMAX) is the larger singular value, abs(SSMIN) is the
+     !> smaller singular value, and (CSL,SNL) and (CSR,SNR) are the left and
+     !> right singular vectors for abs(SSMAX), giving the decomposition
+     !> [ CSL  SNL ] [  F   G  ] [ CSR -SNR ]  =  [ SSMAX   0   ]
+     !> [-SNL  CSL ] [  0   H  ] [ SNR  CSR ]     [  0    SSMIN ].
+
+     pure subroutine stdlib_dlasv2( f, g, h, ssmin, ssmax, snr, csr, snl, csl )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(out) :: csl, csr, snl, snr, ssmax, ssmin
+           real(dp), intent(in) :: f, g, h
+       ! =====================================================================
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: gasmal, swap
+           integer(ilp) :: pmax
+           real(dp) :: a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m, mm, r, s, slt, srt, t, temp, &
+                     tsign, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ft = f
+           fa = abs( ft )
+           ht = h
+           ha = abs( h )
+           ! pmax points to the maximum absolute element of matrix
+             ! pmax = 1 if f largest in absolute values
+             ! pmax = 2 if g largest in absolute values
+             ! pmax = 3 if h largest in absolute values
+           pmax = 1
+           swap = ( ha>fa )
+           if( swap ) then
+              pmax = 3
+              temp = ft
+              ft = ht
+              ht = temp
+              temp = fa
+              fa = ha
+              ha = temp
+              ! now fa .ge. ha
+           end if
+           gt = g
+           ga = abs( gt )
+           if( ga==zero ) then
+              ! diagonal matrix
+              ssmin = ha
+              ssmax = fa
+              clt = one
+              crt = one
+              slt = zero
+              srt = zero
+           else
+              gasmal = .true.
+              if( ga>fa ) then
+                 pmax = 2
+                 if( ( fa / ga )<stdlib_dlamch( 'EPS' ) ) then
+                    ! case of very large ga
+                    gasmal = .false.
+                    ssmax = ga
+                    if( ha>one ) then
+                       ssmin = fa / ( ga / ha )
+                    else
+                       ssmin = ( fa / ga )*ha
+                    end if
+                    clt = one
+                    slt = ht / gt
+                    srt = one
+                    crt = ft / gt
+                 end if
+              end if
+              if( gasmal ) then
+                 ! normal case
+                 d = fa - ha
+                 if( d==fa ) then
+                    ! copes with infinite f or h
+                    l = one
+                 else
+                    l = d / fa
+                 end if
+                 ! note that 0 .le. l .le. 1
+                 m = gt / ft
+                 ! note that abs(m) .le. 1/macheps
+                 t = two - l
+                 ! note that t .ge. 1
+                 mm = m*m
+                 tt = t*t
+                 s = sqrt( tt+mm )
+                 ! note that 1 .le. s .le. 1 + 1/macheps
+                 if( l==zero ) then
+                    r = abs( m )
+                 else
+                    r = sqrt( l*l+mm )
+                 end if
+                 ! note that 0 .le. r .le. 1 + 1/macheps
+                 a = half*( s+r )
+                 ! note that 1 .le. a .le. 1 + abs(m)
+                 ssmin = ha / a
+                 ssmax = fa*a
+                 if( mm==zero ) then
+                    ! note that m is very tiny
+                    if( l==zero ) then
+                       t = sign( two, ft )*sign( one, gt )
+                    else
+                       t = gt / sign( d, ft ) + m / t
+                    end if
+                 else
+                    t = ( m / ( s+t )+m / ( r+l ) )*( one+a )
+                 end if
+                 l = sqrt( t*t+four )
+                 crt = two / l
+                 srt = t / l
+                 clt = ( crt+srt*m ) / a
+                 slt = ( ht / ft )*srt / a
+              end if
+           end if
+           if( swap ) then
+              csl = srt
+              snl = crt
+              csr = slt
+              snr = clt
+           else
+              csl = clt
+              snl = slt
+              csr = crt
+              snr = srt
+           end if
+           ! correct signs of ssmax and ssmin
+           if( pmax==1 )tsign = sign( one, csr )*sign( one, csl )*sign( one, f )
+           if( pmax==2 )tsign = sign( one, snr )*sign( one, csl )*sign( one, g )
+           if( pmax==3 )tsign = sign( one, snr )*sign( one, snl )*sign( one, h )
+           ssmax = sign( ssmax, tsign )
+           ssmin = sign( ssmin, tsign*sign( one, f )*sign( one, h ) )
+           return
+     end subroutine stdlib_dlasv2
+
+     !> DLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_dlaswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           real(dp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_dlaswp
+
+     !> DLASY2: solves for the N1 by N2 matrix X, 1 <= N1,N2 <= 2, in
+     !> op(TL)*X + ISGN*X*op(TR) = SCALE*B,
+     !> where TL is N1 by N1, TR is N2 by N2, B is N1 by N2, and ISGN = 1 or
+     !> -1.  op(T) = T or T**T, where T**T denotes the transpose of T.
+
+     pure subroutine stdlib_dlasy2( ltranl, ltranr, isgn, n1, n2, tl, ldtl, tr,ldtr, b, ldb, &
+               scale, x, ldx, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltranl, ltranr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, ldb, ldtl, ldtr, ldx, n1, n2
+           real(dp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(dp), intent(in) :: b(ldb,*), tl(ldtl,*), tr(ldtr,*)
+           real(dp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bswap, xswap
+           integer(ilp) :: i, ip, ipiv, ipsv, j, jp, jpsv, k
+           real(dp) :: bet, eps, gam, l21, sgn, smin, smlnum, tau1, temp, u11, u12, u22, &
+                     xmax
+           ! Local Arrays 
+           logical(lk) :: bswpiv(4), xswpiv(4)
+           integer(ilp) :: jpiv(4), locl21(4), locu12(4), locu22(4)
+           real(dp) :: btmp(4), t16(4,4), tmp(4), x2(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Data Statements 
+           locu12 = [3,4,1,2]
+           locl21 = [2,1,4,3]
+           locu22 = [4,3,2,1]
+           xswpiv = [.false.,.false.,.true.,.true.]
+           bswpiv = [.false.,.true.,.false.,.true.]
+           ! Executable Statements 
+           ! do not check the input parameters for errors
+           info = 0
+           ! quick return if possible
+           if( n1==0 .or. n2==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           sgn = isgn
+           k = n1 + n1 + n2 - 2
+           go to ( 10, 20, 30, 50 )k
+           ! 1 by 1: tl11*x + sgn*x*tr11 = b11
+           10 continue
+           tau1 = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           bet = abs( tau1 )
+           if( bet<=smlnum ) then
+              tau1 = smlnum
+              bet = smlnum
+              info = 1
+           end if
+           scale = one
+           gam = abs( b( 1, 1 ) )
+           if( smlnum*gam>bet )scale = one / gam
+           x( 1, 1 ) = ( b( 1, 1 )*scale ) / tau1
+           xnorm = abs( x( 1, 1 ) )
+           return
+           ! 1 by 2:
+           ! tl11*[x11 x12] + isgn*[x11 x12]*op[tr11 tr12]  = [b11 b12]
+                                             ! [tr21 tr22]
+                                             20 continue
+           smin = max( eps*max( abs( tl( 1, 1 ) ), abs( tr( 1, 1 ) ),abs( tr( 1, 2 ) ), abs( tr( &
+                     2, 1 ) ), abs( tr( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           if( ltranr ) then
+              tmp( 2 ) = sgn*tr( 2, 1 )
+              tmp( 3 ) = sgn*tr( 1, 2 )
+           else
+              tmp( 2 ) = sgn*tr( 1, 2 )
+              tmp( 3 ) = sgn*tr( 2, 1 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 1, 2 )
+           go to 40
+           ! 2 by 1:
+                ! op[tl11 tl12]*[x11] + isgn* [x11]*tr11  = [b11]
+                  ! [tl21 tl22] [x21]         [x21]         [b21]
+                  30 continue
+           smin = max( eps*max( abs( tr( 1, 1 ) ), abs( tl( 1, 1 ) ),abs( tl( 1, 2 ) ), abs( tl( &
+                     2, 1 ) ), abs( tl( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           if( ltranl ) then
+              tmp( 2 ) = tl( 1, 2 )
+              tmp( 3 ) = tl( 2, 1 )
+           else
+              tmp( 2 ) = tl( 2, 1 )
+              tmp( 3 ) = tl( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           40 continue
+           ! solve 2 by 2 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           ipiv = stdlib_idamax( 4, tmp, 1 )
+           u11 = tmp( ipiv )
+           if( abs( u11 )<=smin ) then
+              info = 1
+              u11 = smin
+           end if
+           u12 = tmp( locu12( ipiv ) )
+           l21 = tmp( locl21( ipiv ) ) / u11
+           u22 = tmp( locu22( ipiv ) ) - u12*l21
+           xswap = xswpiv( ipiv )
+           bswap = bswpiv( ipiv )
+           if( abs( u22 )<=smin ) then
+              info = 1
+              u22 = smin
+           end if
+           if( bswap ) then
+              temp = btmp( 2 )
+              btmp( 2 ) = btmp( 1 ) - l21*temp
+              btmp( 1 ) = temp
+           else
+              btmp( 2 ) = btmp( 2 ) - l21*btmp( 1 )
+           end if
+           scale = one
+           if( ( two*smlnum )*abs( btmp( 2 ) )>abs( u22 ) .or.( two*smlnum )*abs( btmp( 1 ) )>abs(&
+                      u11 ) ) then
+              scale = half / max( abs( btmp( 1 ) ), abs( btmp( 2 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+           end if
+           x2( 2 ) = btmp( 2 ) / u22
+           x2( 1 ) = btmp( 1 ) / u11 - ( u12 / u11 )*x2( 2 )
+           if( xswap ) then
+              temp = x2( 2 )
+              x2( 2 ) = x2( 1 )
+              x2( 1 ) = temp
+           end if
+           x( 1, 1 ) = x2( 1 )
+           if( n1==1 ) then
+              x( 1, 2 ) = x2( 2 )
+              xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+           else
+              x( 2, 1 ) = x2( 2 )
+              xnorm = max( abs( x( 1, 1 ) ), abs( x( 2, 1 ) ) )
+           end if
+           return
+           ! 2 by 2:
+           ! op[tl11 tl12]*[x11 x12] +isgn* [x11 x12]*op[tr11 tr12] = [b11 b12]
+             ! [tl21 tl22] [x21 x22]        [x21 x22]   [tr21 tr22]   [b21 b22]
+           ! solve equivalent 4 by 4 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           50 continue
+           smin = max( abs( tr( 1, 1 ) ), abs( tr( 1, 2 ) ),abs( tr( 2, 1 ) ), abs( tr( 2, 2 ) ) )
+                     
+           smin = max( smin, abs( tl( 1, 1 ) ), abs( tl( 1, 2 ) ),abs( tl( 2, 1 ) ), abs( tl( 2, &
+                     2 ) ) )
+           smin = max( eps*smin, smlnum )
+           btmp( 1 ) = zero
+           call stdlib_dcopy( 16, btmp, 0, t16, 1 )
+           t16( 1, 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           t16( 2, 2 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           t16( 3, 3 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           t16( 4, 4 ) = tl( 2, 2 ) + sgn*tr( 2, 2 )
+           if( ltranl ) then
+              t16( 1, 2 ) = tl( 2, 1 )
+              t16( 2, 1 ) = tl( 1, 2 )
+              t16( 3, 4 ) = tl( 2, 1 )
+              t16( 4, 3 ) = tl( 1, 2 )
+           else
+              t16( 1, 2 ) = tl( 1, 2 )
+              t16( 2, 1 ) = tl( 2, 1 )
+              t16( 3, 4 ) = tl( 1, 2 )
+              t16( 4, 3 ) = tl( 2, 1 )
+           end if
+           if( ltranr ) then
+              t16( 1, 3 ) = sgn*tr( 1, 2 )
+              t16( 2, 4 ) = sgn*tr( 1, 2 )
+              t16( 3, 1 ) = sgn*tr( 2, 1 )
+              t16( 4, 2 ) = sgn*tr( 2, 1 )
+           else
+              t16( 1, 3 ) = sgn*tr( 2, 1 )
+              t16( 2, 4 ) = sgn*tr( 2, 1 )
+              t16( 3, 1 ) = sgn*tr( 1, 2 )
+              t16( 4, 2 ) = sgn*tr( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           btmp( 3 ) = b( 1, 2 )
+           btmp( 4 ) = b( 2, 2 )
+           ! perform elimination
+           loop_100: do i = 1, 3
+              xmax = zero
+              do ip = i, 4
+                 do jp = i, 4
+                    if( abs( t16( ip, jp ) )>=xmax ) then
+                       xmax = abs( t16( ip, jp ) )
+                       ipsv = ip
+                       jpsv = jp
+                    end if
+                 end do
+              end do
+              if( ipsv/=i ) then
+                 call stdlib_dswap( 4, t16( ipsv, 1 ), 4, t16( i, 1 ), 4 )
+                 temp = btmp( i )
+                 btmp( i ) = btmp( ipsv )
+                 btmp( ipsv ) = temp
+              end if
+              if( jpsv/=i )call stdlib_dswap( 4, t16( 1, jpsv ), 1, t16( 1, i ), 1 )
+              jpiv( i ) = jpsv
+              if( abs( t16( i, i ) )<smin ) then
+                 info = 1
+                 t16( i, i ) = smin
+              end if
+              do j = i + 1, 4
+                 t16( j, i ) = t16( j, i ) / t16( i, i )
+                 btmp( j ) = btmp( j ) - t16( j, i )*btmp( i )
+                 do k = i + 1, 4
+                    t16( j, k ) = t16( j, k ) - t16( j, i )*t16( i, k )
+                 end do
+              end do
+           end do loop_100
+           if( abs( t16( 4, 4 ) )<smin ) then
+              info = 1
+              t16( 4, 4 ) = smin
+           end if
+           scale = one
+           if( ( eight*smlnum )*abs( btmp( 1 ) )>abs( t16( 1, 1 ) ) .or.( eight*smlnum )*abs( &
+           btmp( 2 ) )>abs( t16( 2, 2 ) ) .or.( eight*smlnum )*abs( btmp( 3 ) )>abs( t16( 3, 3 ) )&
+                      .or.( eight*smlnum )*abs( btmp( 4 ) )>abs( t16( 4, 4 ) ) ) then
+              scale = ( one / eight ) / max( abs( btmp( 1 ) ),abs( btmp( 2 ) ), abs( btmp( 3 ) ), &
+                        abs( btmp( 4 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+              btmp( 3 ) = btmp( 3 )*scale
+              btmp( 4 ) = btmp( 4 )*scale
+           end if
+           do i = 1, 4
+              k = 5 - i
+              temp = one / t16( k, k )
+              tmp( k ) = btmp( k )*temp
+              do j = k + 1, 4
+                 tmp( k ) = tmp( k ) - ( temp*t16( k, j ) )*tmp( j )
+              end do
+           end do
+           do i = 1, 3
+              if( jpiv( 4-i )/=4-i ) then
+                 temp = tmp( 4-i )
+                 tmp( 4-i ) = tmp( jpiv( 4-i ) )
+                 tmp( jpiv( 4-i ) ) = temp
+              end if
+           end do
+           x( 1, 1 ) = tmp( 1 )
+           x( 2, 1 ) = tmp( 2 )
+           x( 1, 2 ) = tmp( 3 )
+           x( 2, 2 ) = tmp( 4 )
+           xnorm = max( abs( tmp( 1 ) )+abs( tmp( 3 ) ),abs( tmp( 2 ) )+abs( tmp( 4 ) ) )
+           return
+     end subroutine stdlib_dlasy2
+
+     !> DLASYF: computes a partial factorization of a real symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF is an auxiliary routine called by DSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_dlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(dp) :: absakk, alpha, colmax, d11, d21, d22, r1, rowmax, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_dcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ), lda,w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_dcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_dcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( &
+                              imax, kw+1 ), ldw, one,w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_idamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = abs( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_idamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_dcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_dcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_dcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_dswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_dswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_dcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = one / a( k, k )
+                    call stdlib_dscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k, -one,a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw, one,a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_dswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_dcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ), lda,w( k, 1 ), ldw, &
+                        one, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_dcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_dcopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, one, w( k, k+1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_idamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = abs( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_idamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_dcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_dcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_dcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_dswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_dswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_dcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = one / a( k, k )
+                       call stdlib_dscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_dswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_dlasyf
+
+     !> DLASYF_RK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF_RK is an auxiliary routine called by DSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_dlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, dtemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_dcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_dcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_dcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_dcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_idamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_idamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = abs( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_dcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_dcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_dcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_dcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_dswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_dswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_dcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_dcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_dswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_dswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_dcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_dscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = zero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, one, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_dcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_dcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_dcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_dcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_idamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_idamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = abs( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_dcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_dcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_dcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_dcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_dswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_dswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_dcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_dcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_dswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_dswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_dcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_dscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = zero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ),ldw, one, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_dlasyf_rk
+
+     !> DLASYF_ROOK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF_ROOK is an auxiliary routine called by DSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_dlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, dtemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_dcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_dcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_dcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_dcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_dgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_idamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_idamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = abs( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_dcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_dcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_dcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_dcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_dswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_dswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_dcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_dcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_dswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_dswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_dcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_dscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,one, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_dswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_dswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_dcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_dcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_dcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_dcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_dgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_idamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_idamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = abs( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_dcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_dcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_dcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_dcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_dswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_dswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_dcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_dcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_dswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_dswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_dcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_dscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_dgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_dswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_dswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_dlasyf_rook
+
+     !> DLAT2S: converts a DOUBLE PRECISION triangular matrix, SA, to a SINGLE
+     !> PRECISION triangular matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> DLAS2S checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_dlat2s( uplo, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, n
+           ! Array Arguments 
+           real(sp), intent(out) :: sa(ldsa,*)
+           real(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: rmax
+           logical(lk) :: upper
+           ! Executable Statements 
+           rmax = stdlib_slamch( 'O' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) )then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) )then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           50 continue
+           return
+     end subroutine stdlib_dlat2s
+
+     !> DLATBS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine DTBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_dlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(dp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_dasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_dasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_dtbsv can be used.
+           j = stdlib_idamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_dtbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_dscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_dscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_daxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_idamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_daxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_idamax( n-j, x( j+1 ), 1 )
+                       xmax = abs( x( i ) )
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_ddot to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          sumj = stdlib_ddot( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>0 )sumj = stdlib_ddot( jlen, ab( 2, j ), 1, x( j+1 ), 1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_dscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_dlatbs
+
+     !> DLATPS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, x and b are n-element vectors, and s is a scaling
+     !> factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> DTPSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_dlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(dp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_dasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_dasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_dtpsv can be used.
+           j = stdlib_idamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_dtpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_dscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_dscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_daxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_idamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_daxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_idamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_ddot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_ddot( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_ddot( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             sumj = sumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_dscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_dlatps
+
+     !> DLATRS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T *x = s*b
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, x and b are
+     !> n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine DTRSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_dlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(dp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_dasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_dasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_dtrsv can be used.
+           j = stdlib_idamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_dtrsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_dscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_dscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_daxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_idamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_daxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_idamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_dscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_ddot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_ddot( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_ddot( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_dscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_dlatrs
+
+     !> DLAUU2: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dlauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**t.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_ddot( n-i+1, a( i, i ), lda, a( i, i ), lda )
+                    call stdlib_dgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, aii, a( 1, i ), 1 )
+                 else
+                    call stdlib_dscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**t * l.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_ddot( n-i+1, a( i, i ), 1, a( i, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, aii, a( i, 1 ), lda )
+                 else
+                    call stdlib_dscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dlauu2
+
+     !> DLAUUM: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dlauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_dlauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**t.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, ib, one, a( &
+                              i, i ), lda, a( 1, i ),lda )
+                    call stdlib_dlauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', i-1, ib,n-i-ib+1, one, a( &
+                                 1, i+ib ), lda,a( i, i+ib ), lda, one, a( 1, i ), lda )
+                       call stdlib_dsyrk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**t * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_dtrmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', ib,i-1, one, a( &
+                              i, i ), lda, a( i, 1 ), lda )
+                    call stdlib_dlauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', ib, i-1,n-i-ib+1, one, a( &
+                                 i+ib, i ), lda,a( i+ib, 1 ), lda, one, a( i, 1 ), lda )
+                       call stdlib_dsyrk( 'LOWER', 'TRANSPOSE', ib, n-i-ib+1, one,a( i+ib, i ), &
+                                 lda, one, a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dlauum
+
+     !> DORBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_dorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: alphasq = 0.01_dp
+           real(dp), parameter :: realone = 1.0_dp
+           real(dp), parameter :: realzero = 0.0_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_dlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_dlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_dgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_dgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_dgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_dgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_dlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_dlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is zero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == zero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = zero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_dgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_dgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_dgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_dgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_dlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_dlassq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to zero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = zero
+              end do
+              do i = 1, m2
+                 x2(i) = zero
+              end do
+           end if
+           return
+     end subroutine stdlib_dorbdb6
+
+     !> DORG2L: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF.
+
+     pure subroutine stdlib_dorg2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( m-n+j, j ) = one
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = one
+              call stdlib_dlarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_dscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = one - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to zero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_dorg2l
+
+     !> DORG2R: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF.
+
+     pure subroutine stdlib_dorg2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = one
+                 call stdlib_dlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_dscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = one - tau( i )
+              ! set a(1:i-1,i) to zero
+              do l = 1, i - 1
+                 a( l, i ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_dorg2r
+
+     !> DORGL2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGELQF.
+
+     pure subroutine stdlib_dorgl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = zero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = one
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the right
+              if( i<n ) then
+                 if( i<m ) then
+                    a( i, i ) = one
+                    call stdlib_dlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,tau( i ), a( i+1, i ), &
+                              lda, work )
+                 end if
+                 call stdlib_dscal( n-i, -tau( i ), a( i, i+1 ), lda )
+              end if
+              a( i, i ) = one - tau( i )
+              ! set a(i,1:i-1) to zero
+              do l = 1, i - 1
+                 a( i, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_dorgl2
+
+     !> DORGLQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGELQF.
+
+     pure subroutine stdlib_dorglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DORGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to zero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_dorgl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_dlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**t to a(i+ib:m,i:n) from the right
+                    call stdlib_dlarfb( 'RIGHT', 'TRANSPOSE', 'FORWARD', 'ROWWISE',m-i-ib+1, n-i+&
+                    1, ib, a( i, i ), lda, work,ldwork, a( i+ib, i ), lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**t to columns i:n of current block
+                 call stdlib_dorgl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to zero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dorglq
+
+     !> DORGQL: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF.
+
+     pure subroutine stdlib_dorgql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to zero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_dorg2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_dlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_dlarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_dorg2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to zero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dorgql
+
+     !> DORGQR: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF.
+
+     pure subroutine stdlib_dorgqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DORGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to zero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_dorg2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_dlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_dlarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_dorg2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to zero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dorgqr
+
+     !> DORGR2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGERQF.
+
+     pure subroutine stdlib_dorgr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = zero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = one
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the right
+              a( ii, n-m+ii ) = one
+              call stdlib_dlarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda, tau( i ),a, lda, work )
+                        
+              call stdlib_dscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = one - tau( i )
+              ! set a(m-k+i,n-k+i+1:n) to zero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_dorgr2
+
+     !> DORGRQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGERQF.
+
+     pure subroutine stdlib_dorgrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to zero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_dorgr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_dlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_dlarfb( 'RIGHT', 'TRANSPOSE', 'BACKWARD', 'ROWWISE',ii-1, n-k+i+&
+                              ib-1, ib, a( ii, 1 ), lda, work,ldwork, a, lda, work( ib+1 ), ldwork )
+                 end if
+                 ! apply h**t to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_dorgr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to zero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dorgrq
+
+     !> DORGTSQR_ROW: generates an M-by-N real matrix Q_out with
+     !> orthonormal columns from the output of DLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by DLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of DLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which DLATSQR generates the output blocks.
+
+     pure subroutine stdlib_dorgtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: t(ldt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           real(dp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_dlaset('U', m, n, zero, one, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_dlarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_slarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_dlarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_dlarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_dorgtsqr_row
+
+
+     pure subroutine stdlib_dorm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: q(ldq,*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = real( lwkopt,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_dtrmm.
+           if( n1==0 ) then
+              call stdlib_dtrmm( side, 'UPPER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           else if( n2==0 ) then
+              call stdlib_dtrmm( side, 'LOWER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_dlacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_dtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, one, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q, ldq, c(&
+                               1, i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_dlacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_dtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, one, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,one, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_dlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**t.
+                    call stdlib_dlacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_dtrmm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',n2, len, one, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**t.
+                    call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q, ldq, c( 1,&
+                               i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q12**t.
+                    call stdlib_dlacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_dtrmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n1, len, one, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**t.
+                    call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,one, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_dlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_dlacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, one, &
+                              q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,one, c( i, 1 ),&
+                               ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_dlacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, one, &
+                              q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,one, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_dlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**t.
+                    call stdlib_dlacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',len, n1, one, q( &
+                              1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**t.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n1, n2,one, c( i, 1 ), &
+                              ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q21**t.
+                    call stdlib_dlacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_dtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',len, n2, one, q( &
+                              n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**t.
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n2, n1,one, c( i, n2+1 ),&
+                               ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_dlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = real( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_dorm22
+
+     !> DORM2L: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T * C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dorm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = one
+              call stdlib_dlarf( side, mi, ni, a( 1, i ), 1, tau( i ), c, ldc,work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_dorm2l
+
+     !> DORM2R: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dorm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_dlarf( side, mi, ni, a( i, i ), 1, tau( i ), c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_dorm2r
+
+     !> DORML2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dorml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_dlarf( side, mi, ni, a( i, i ), lda, tau( i ),c( ic, jc ), ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_dorml2
+
+     !> DORMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DORMLQ', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_dorml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_dlarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_dlarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormlq
+
+     !> DORMQL: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_dorm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_dlarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_dlarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormql
+
+     !> DORMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_dorm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_dlarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_dlarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormqr
+
+     !> DORMR2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = one
+              call stdlib_dlarf( side, mi, ni, a( i, 1 ), lda, tau( i ), c, ldc,work )
+              a( i, nq-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_dormr2
+
+     !> DORMR3: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DTZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), tau(*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**t is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**t is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**t
+              call stdlib_dlarz( side, mi, ni, l, a( i, ja ), lda, tau( i ),c( ic, jc ), ldc, &
+                        work )
+           end do
+           return
+     end subroutine stdlib_dormr3
+
+     !> DORMRQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_dormr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_dlarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_dlarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormrq
+
+     !> DORMRZ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DTZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_dormrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_dormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_dlarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_dlarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormrz
+
+     !> DPBEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_dpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ab( j, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = ab( j, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_dpbequ
+
+     !> DPBSTF: computes a split Cholesky factorization of a real
+     !> symmetric positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with DSBGST.
+     !> The factorization has the form  A = S**T*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_dpbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_dscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_dsyr( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_dscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_dsyr( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_dscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_dsyr( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_dscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_dsyr( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_dpbstf
+
+     !> DPBTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**T is the transpose of U, and
+     !> L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dpbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_dscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_dsyr( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_dscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_dsyr( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_dpbtf2
+
+     !> DPBTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPBTRF.
+
+     pure subroutine stdlib_dpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              do j = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**t.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**t *x = b, overwriting b with x.
+                 call stdlib_dtbsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_dpbtrs
+
+     !> DPOEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_dpoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_dpoequ
+
+     !> DPOEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from DPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_dpoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_dlamch( 'B' )
+           tmp = -0.5e+0_dp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_dpoequb
+
+     !> DPOTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPOTRF.
+
+     pure subroutine stdlib_dpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a*x = b where a = l*l**t.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+           end if
+           return
+     end subroutine stdlib_dpotrs
+
+     !> DPPEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_dppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ap( 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_dppequ
+
+     !> DPPTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_dpptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_dtpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', j-1, ap,ap( jc ), &
+                           1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj ) - stdlib_ddot( j-1, ap( jc ), 1, ap( jc ), 1 )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_dscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_dspr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_dpptrf
+
+     !> DPPTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**T*U or A = L*L**T computed by DPPTRF.
+
+     pure subroutine stdlib_dpptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t * u.
+              do i = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_dtpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_dtpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**t.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_dtpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**t *x = y, overwriting b with x.
+                 call stdlib_dtpsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_dpptrs
+
+     !> DPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**T or A = U**T*D*U computed by
+     !> DPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dptcon( n, d, e, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), e(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(dp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+           ! solve m(l) * x = e.
+           work( 1 ) = one
+           do i = 2, n
+              work( i ) = one + work( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**t * x = b.
+           work( n ) = work( n ) / d( n )
+           do i = n - 1, 1, -1
+              work( i ) = work( i ) / d( i ) + work( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_idamax( n, work, 1 )
+           ainvnm = abs( work( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_dptcon
+
+     !> DPTTRF: computes the L*D*L**T factorization of a real symmetric
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**T*D*U.
+
+     pure subroutine stdlib_dpttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(dp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+           end do
+           loop_20: do i = i4 + 1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ! solve for e(i) and d(i+1).
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+              if( d( i+1 )<=zero ) then
+                 info = i + 1
+                 go to 30
+              end if
+              ! solve for e(i+1) and d(i+2).
+              ei = e( i+1 )
+              e( i+1 ) = ei / d( i+1 )
+              d( i+2 ) = d( i+2 ) - e( i+1 )*ei
+              if( d( i+2 )<=zero ) then
+                 info = i + 2
+                 go to 30
+              end if
+              ! solve for e(i+2) and d(i+3).
+              ei = e( i+2 )
+              e( i+2 ) = ei / d( i+2 )
+              d( i+3 ) = d( i+3 ) - e( i+2 )*ei
+              if( d( i+3 )<=zero ) then
+                 info = i + 3
+                 go to 30
+              end if
+              ! solve for e(i+3) and d(i+4).
+              ei = e( i+3 )
+              e( i+3 ) = ei / d( i+3 )
+              d( i+4 ) = d( i+4 ) - e( i+3 )*ei
+           end do loop_20
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           30 continue
+           return
+     end subroutine stdlib_dpttrf
+
+     !> DPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by DPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_dptts2( n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_dscal( nrhs, 1._dp / d( 1 ), b, ldb )
+              return
+           end if
+           ! solve a * x = b using the factorization a = l*d*l**t,
+           ! overwriting each right hand side vector with its solution.
+           do j = 1, nrhs
+                 ! solve l * x = b.
+              do i = 2, n
+                 b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+              end do
+                 ! solve d * l**t * x = b.
+              b( n, j ) = b( n, j ) / d( n )
+              do i = n - 1, 1, -1
+                 b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+              end do
+           end do
+           return
+     end subroutine stdlib_dptts2
+
+     !> DRSCL: multiplies an n-element real vector x by the real scalar 1/a.
+     !> This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_drscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(dp), intent(in) :: sa
+           ! Array Arguments 
+           real(dp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(dp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_dscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_drscl
+
+     !> DSBGST: reduces a real symmetric-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**T*S by DPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**T*A*X, where
+     !> X = S**(-1)*Q and Q is an orthogonal matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_dsbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(in) :: bb(ldbb,*)
+           real(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(dp) :: bii, ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_dlaset( 'FULL', n, n, zero, one, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_dpbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**t*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**t*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the cosines and sines of the rotations are stored in the array
+           ! work. the cosines of the 1st set of rotations are stored in
+           ! elements n+2:n+m-kb-1 and the sines of the 1st set in elements
+           ! 2:m-kb-1; the cosines of the 2nd set are stored in elements
+           ! n+m-kb+1:2*n and the sines of the second set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*ab( k-i+ka1, i ) -bb(&
+                        k-i+kb1, i )*ab( j-i+ka1, i ) +ab( ka1, i )*bb( j-i+kb1, i )*bb( k-i+kb1, &
+                                  i )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( k-i+kb1, i )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_dscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_dger( n-m, kbt, -one, x( m+1, i ), 1,bb( kb1-kbt, i ), &
+                              1, x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_dlartg( ab( k+1, i-k+ka ), ra1,work( n+i-k+ka-m ), work( i-k+&
+                                 ka-m ),ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_dlargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,work( &
+                           n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2-m ),work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_drot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, work( n+j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_dlargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,work( n+j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2 ),work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_drot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*ab( i-k+1, k ) -bb( i-k+1, &
+                                 k )*ab( i-j+1, j ) +ab( 1, i )*bb( i-j+1, j )*bb( i-k+1, k )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-k+1, k )*ab( i-j+1, j )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_dscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_dger( n-m, kbt, -one, x( m+1, i ), 1,bb( kbt+1, i-kbt )&
+                              , ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_dlartg( ab( ka1-k, i ), ra1, work( n+i-k+ka-m ),work( i-k+ka-m &
+                                 ), ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( ka1, i-k )
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_dlargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           work( n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2-m ), work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_drot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,work( n+j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_dlargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,work( n+j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2 ), work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_drot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i1, i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*ab( i-k+ka1, k ) -bb(&
+                        i-k+kb1, k )*ab( i-j+ka1, j ) +ab( ka1, i )*bb( i-j+kb1, j )*bb( i-k+kb1, &
+                                  k )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-k+kb1, k )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_dscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_dger( nx, kbt, -one, x( 1, i ), 1, bb( kb, i+1 ),ldbb-&
+                              1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_dlartg( ab( k+1, i ), ra1, work( n+i+k-ka ),work( i+k-ka ), ra &
+                                 )
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( 1, i+k )
+                       ab( 1, i+k ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_dlargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,work( &
+                           n+j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 work( n+j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_drot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_dlargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca,&
+                                 work( n+m-kb+j1 ), work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_drot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i1, i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*ab( k-i+1, i ) -bb( k-i+1, &
+                                 i )*ab( j-i+1, i ) +ab( 1, i )*bb( j-i+1, i )*bb( k-i+1, i )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( k-i+1, i )*ab( j-i+1, i )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_dscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_dger( nx, kbt, -one, x( 1, i ), 1, bb( 2, i ), 1,x( 1, &
+                              i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_dlartg( ab( ka1-k, i+k-ka ), ra1,work( n+i+k-ka ), work( i+k-&
+                                 ka ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_dlargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,work( n+&
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_drot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_dlargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_dlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_dlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_drot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_dlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_dsbgst
+
+     !> DSBTRD: reduces a real symmetric band matrix A to symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_dsbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           real(dp), intent(out) :: d(*), e(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_dlaset( 'FULL', n, n, zero, one, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the cosines and sines of the plane rotations are stored in the
+           ! arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_dlargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_dlartv or stdlib_drot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_dlartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_drot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_dlartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_drot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_dlar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_dlartv or stdlib_drot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_dlartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_drot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_drot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_drot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_drot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( kd, i+1 )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( kd1, i )
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_dlargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_dlartv or stdlib_drot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_dlartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_drot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_dlartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_drot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_dlar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_dlartv or stdlib_drot is used
+                       if( nr>0 ) then
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_dlartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_drot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_drot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_drot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_drot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( 2, i )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_dsbtrd
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> DSFRK: performs one of the symmetric rank--k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n symmetric
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_dsfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_dsyrk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = zero
+              end do
+              return
+           end if
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_dsyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_dsyrk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_dgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_dsyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_dsyrk( 'U', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_dgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_dsyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_dsyrk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_dgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, beta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_dsyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_dsyrk( 'U', 'T', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_dgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, beta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_dsyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_dsyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_dgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n1+1, 1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_dsyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_dsyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_dgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n1+1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_dsyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_dsyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_dgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_dsyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_dsyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_dgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_dsyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_dsyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_dgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_dsyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_dsyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_dgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_dsyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_dsyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_dgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_dsyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_dsyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_dgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_dsyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_dsyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_dgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_dsyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_dsyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_dgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_dsyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_dsyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_dgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_dsyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_dsyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_dgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dsfrk
+
+     !> DSPGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by DPPTRF.
+
+     pure subroutine stdlib_dspgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(dp) :: ajj, akk, bjj, bkk, ct
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    bjj = bp( jj )
+                    call stdlib_dtpsv( uplo, 'TRANSPOSE', 'NONUNIT', j, bp,ap( j1 ), 1 )
+                    call stdlib_dspmv( uplo, j-1, -one, ap, bp( j1 ), 1, one,ap( j1 ), 1 )
+                    call stdlib_dscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_ddot( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_dscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_daxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_dspr2( uplo, n-k, -one, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 )&
+                                  )
+                       call stdlib_daxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_dtpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    call stdlib_dtpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_daxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_dspr2( uplo, k-1, one, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_daxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_dscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = ap( jj )
+                    bjj = bp( jj )
+                    ap( jj ) = ajj*bjj + stdlib_ddot( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_dscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_dspmv( uplo, n-j, one, ap( j1j1 ), bp( jj+1 ), 1,one, ap( jj+1 ), &
+                              1 )
+                    call stdlib_dtpmv( uplo, 'TRANSPOSE', 'NON-UNIT', n-j+1,bp( jj ), ap( jj ), 1 &
+                              )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dspgst
+
+     !> DSPTRF: computes the factorization of a real symmetric matrix A stored
+     !> in packed format using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_dsptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, ap( kc ), 1 )
+                 colmax = abs( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_idamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_dswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / ap( kc+k-1 )
+                    call stdlib_dspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_dscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, ap( kc+1 ), 1 )
+                 colmax = abs( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_idamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_dswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = one / ap( kc )
+                       call stdlib_dspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_dscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_dsptrf
+
+     !> DSPTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSPTRF.
+
+     pure subroutine stdlib_dsptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_ddot( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_ddot( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_ddot( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_dcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, k-1, -one, ap, work, 1, zero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_ddot( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_dswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, n-k, -one, ap( kc+n-k+1 ), work, 1,zero, ap( kc+1 ), &
+                              1 )
+                    ap( kc ) = ap( kc ) - stdlib_ddot( n-k, work, 1, ap( kc+1 ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( kc+&
+                              1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_ddot( n-k, work, 1, ap( kc+1 ), 1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_ddot( n-k, ap( kc+1 ), 1,ap( kcnext+2 &
+                              ), 1 )
+                    call stdlib_dcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_dspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_ddot( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_dswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_dsptri
+
+     !> DSPTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSPTRF.
+
+     pure subroutine stdlib_dsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_dger( k-1, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_dswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_dger( k-2, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_dger( k-2, nrhs, -one, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, 1 &
+                           ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,ap( kc+k ), 1, one, b( k+&
+                           1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dger( n-k, nrhs, -one, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+1,&
+                            1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_dswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_dger( n-k-1, nrhs, -one, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, 1 )&
+                              , ldb )
+                    call stdlib_dger( n-k-1, nrhs, -one, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc+1 ), &
+                              1, one, b( k, 1 ), ldb )
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_dsptrs
+
+     !> DSTEBZ: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T.  The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_dstebz( range, order, n, vl, vu, il, iu, abstol, d, e,m, nsplit, w, &
+               iblock, isplit, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*)
+           real(dp), intent(in) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: fudge = 2.1_dp
+           real(dp), parameter :: relfac = 2.0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iorder, &
+                     iout, irange, itmax, itmp1, iw, iwoff, j, jb, jdisc, je, nb, nwl, nwu
+           real(dp) :: atoli, bnorm, gl, gu, pivmin, rtoli, safemn, tmp1, tmp2, tnorm, ulp, wkill,&
+                      wl, wlu, wu, wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = 1
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = 2
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = 3
+           else
+              irange = 0
+           end if
+           ! decode order
+           if( stdlib_lsame( order, 'B' ) ) then
+              iorder = 2
+           else if( stdlib_lsame( order, 'E' ) ) then
+              iorder = 1
+           else
+              iorder = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( iorder<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==2 ) then
+              if( vl>=vu )info = -5
+           else if( irange==3 .and. ( il<1 .or. il>max( 1, n ) ) )then
+              info = -6
+           else if( irange==3 .and. ( iu<min( n, il ) .or. iu>n ) )then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEBZ', -info )
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! simplifications:
+           if( irange==3 .and. il==1 .and. iu==n )irange = 1
+           ! get machine constants
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           safemn = stdlib_dlamch( 'S' )
+           ulp = stdlib_dlamch( 'P' )
+           rtoli = ulp*relfac
+           nb = stdlib_ilaenv( 1, 'DSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 )nb = 0
+           ! special case when n=1
+           if( n==1 ) then
+              nsplit = 1
+              isplit( 1 ) = 1
+              if( irange==2 .and. ( vl>=d( 1 ) .or. vu<d( 1 ) ) ) then
+                 m = 0
+              else
+                 w( 1 ) = d( 1 )
+                 iblock( 1 ) = 1
+                 m = 1
+              end if
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           work( n ) = zero
+           pivmin = one
+           do j = 2, n
+              tmp1 = e( j-1 )**2
+              if( abs( d( j )*d( j-1 ) )*ulp**2+safemn>tmp1 ) then
+                 isplit( nsplit ) = j - 1
+                 nsplit = nsplit + 1
+                 work( j-1 ) = zero
+              else
+                 work( j-1 ) = tmp1
+                 pivmin = max( pivmin, tmp1 )
+              end if
+           end do
+           isplit( nsplit ) = n
+           pivmin = pivmin*safemn
+           ! compute interval and atoli
+           if( irange==3 ) then
+              ! range='i': compute the interval containing eigenvalues
+                         ! il through iu.
+              ! compute gershgorin interval for entire (split) matrix
+              ! and use it as the initial interval
+              gu = d( 1 )
+              gl = d( 1 )
+              tmp1 = zero
+              do j = 1, n - 1
+                 tmp2 = sqrt( work( j ) )
+                 gu = max( gu, d( j )+tmp1+tmp2 )
+                 gl = min( gl, d( j )-tmp1-tmp2 )
+                 tmp1 = tmp2
+              end do
+              gu = max( gu, d( n )+tmp1 )
+              gl = min( gl, d( n )-tmp1 )
+              tnorm = max( abs( gl ), abs( gu ) )
+              gl = gl - fudge*tnorm*ulp*n - fudge*two*pivmin
+              gu = gu + fudge*tnorm*ulp*n + fudge*pivmin
+              ! compute iteration parameters
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_dlaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin, d, e,work, iwork( &
+                        5 ), work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           else
+              ! range='a' or 'v' -- set atoli
+              tnorm = max( abs( d( 1 ) )+abs( e( 1 ) ),abs( d( n ) )+abs( e( n-1 ) ) )
+              do j = 2, n - 1
+                 tnorm = max( tnorm, abs( d( j ) )+abs( e( j-1 ) )+abs( e( j ) ) )
+              end do
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              if( irange==2 ) then
+                 wl = vl
+                 wu = vu
+              else
+                 wl = zero
+                 wu = zero
+              end if
+           end if
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jb = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jb )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! special case -- in=1
+                 if( irange==1 .or. wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( irange==1 .or. wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==1 .or. ( wl<d( ibegin )-pivmin .and. wu>=d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    iblock( m ) = jb
+                 end if
+              else
+                 ! general case -- in > 1
+                 ! compute gershgorin interval
+                 ! and use it as the initial interval
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend - 1
+                    tmp2 = abs( e( j ) )
+                    gu = max( gu, d( j )+tmp1+tmp2 )
+                    gl = min( gl, d( j )-tmp1-tmp2 )
+                    tmp1 = tmp2
+                 end do
+                 gu = max( gu, d( iend )+tmp1 )
+                 gl = min( gl, d( iend )-tmp1 )
+                 bnorm = max( abs( gl ), abs( gu ) )
+                 gl = gl - fudge*bnorm*ulp*in - fudge*pivmin
+                 gu = gu + fudge*bnorm*ulp*in + fudge*pivmin
+                 ! compute atoli for the current submatrix
+                 if( abstol<=zero ) then
+                    atoli = ulp*max( abs( gl ), abs( gu ) )
+                 else
+                    atoli = abstol
+                 end if
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! set up initial interval
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_dlaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 &
+                           ), iblock( m+1 ), iinfo )
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_dlaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( &
+                            m+1 ), iblock( m+1 ), iinfo )
+                 ! copy eigenvalues into w and iblock
+                 ! use -jb for block number for unconverged eigenvalues.
+                 do j = 1, iout
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! flag non-convergence.
+                    if( j>iout-iinfo ) then
+                       ncnvrg = .true.
+                       ib = -jb
+                    else
+                       ib = jb
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==3 ) then
+              im = 0
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 .or. idiscu>0 ) then
+                 do je = 1, m
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic:
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by simply finding the smallest/largest
+                 ! eigenvalue(s).
+                 ! (if n(w) is monotone non-decreasing, this should never
+                     ! happen.)
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           ! if order='b', do nothing -- the eigenvalues are already sorted
+              ! by block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( iorder==1 .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    itmp1 = iblock( ie )
+                    w( ie ) = w( je )
+                    iblock( ie ) = iblock( je )
+                    w( je ) = tmp1
+                    iblock( je ) = itmp1
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_dstebz
+
+     !> DSYCONV: convert A given by TRF into L and D and vice-versa.
+     !> Get Non-diag elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_dsyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           real(dp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+            ! a is upper
+            ! convert a (a is upper)
+              ! convert value
+              if ( convert ) then
+                 i=n
+                 e(1)=zero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=zero
+                       a(i-1,i)=zero
+                       i=i-1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i-1
+                 end do
+              ! convert permutations
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0) then
+                    ip=ipiv(i)
+                    if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                    if( i < n) then
+                  do j= i+1,n
+                      temp=a(ip,j)
+                      a(ip,j)=a(i-1,j)
+                      a(i-1,j)=temp
+                  end do
+                     endif
+                     i=i-1
+                endif
+                i=i-1
+             end do
+              else
+            ! revert a (a is upper)
+              ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+              ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+            ! a is lower
+              if ( convert ) then
+            ! convert a (a is lower)
+              ! convert value
+                 i=1
+                 e(n)=zero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=zero
+                       a(i+1,i)=zero
+                       i=i+1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i+1
+                 end do
+              ! convert permutations
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                    ip=ipiv(i)
+                    if (i > 1) then
+                    do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i,j)
+                      a(i,j)=temp
+                    end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                   if (i > 1) then
+                   do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i+1,j)
+                      a(i+1,j)=temp
+                   end do
+                   endif
+                   i=i+1
+                endif
+                i=i+1
+             end do
+              else
+            ! revert a (a is lower)
+              ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+              ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dsyconv
+
+     !> If parameter WAY = 'C':
+     !> DSYCONVF: converts the factorization output format used in
+     !> DSYTRF provided on entry in parameter A into the factorization
+     !> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in DSYTRF into
+     !> the format used in DSYTRF_RK (or DSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> DSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in DSYTRF_RK
+     !> (or DSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in DSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in DSYTRF_RK
+     !> (or DSYTRF_BK) into the format used in DSYTRF.
+
+     pure subroutine stdlib_dsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_dswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_dswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_dswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_dswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_dsyconvf
+
+     !> If parameter WAY = 'C':
+     !> DSYCONVF_ROOK: converts the factorization output format used in
+     !> DSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for DSYTRF_ROOK and
+     !> DSYTRF_RK (or DSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> DSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in DSYTRF_RK
+     !> (or DSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in DSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for DSYTRF_ROOK and
+     !> DSYTRF_RK (or DSYTRF_BK) is the same and is not converted.
+
+     pure subroutine stdlib_dsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_dswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_dswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_dswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_dswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_dswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_dswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_dsyconvf_rook
+
+     !> DSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_dsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(dp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'DSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_dp / s( j )
+           end do
+           tol = one / sqrt( 2.0_dp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + s( i )*work( i )
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_dlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = abs( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( work( i ) - t*si )
+                 c0 = -(t*si)*si + 2*work( i )*si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + work( i ) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_dlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_dlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_dsyequb
+
+     !> DSYGS2: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T *A*L.
+     !> B must have been previously factorized as U**T *U or L*L**T by DPOTRF.
+
+     pure subroutine stdlib_dsygs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(dp) :: akk, bkk, ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_dscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_daxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_dsyr2( uplo, n-k, -one, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_daxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_dtrsv( uplo, 'TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), ldb, &
+                                 a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_dscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_daxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_dsyr2( uplo, n-k, -one, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1, &
+                                 k+1 ), lda )
+                       call stdlib_daxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_dtrsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_dtrmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_daxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_dsyr2( uplo, k-1, one, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                    call stdlib_daxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_dscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_dtrmv( uplo, 'TRANSPOSE', 'NON-UNIT', k-1, b, ldb,a( k, 1 ), lda )
+                              
+                    ct = half*akk
+                    call stdlib_daxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_dsyr2( uplo, k-1, one, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_daxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_dscal( k-1, bkk, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dsygs2
+
+     !> DSYGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by DPOTRF.
+
+     pure subroutine stdlib_dsygst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DSYGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_dsygs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**t)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_dsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_dtrsm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT',kb, n-k-kb+1, &
+                                    one, b( k, k ), ldb,a( k, k+kb ), lda )
+                          call stdlib_dsymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_dsyr2k( uplo, 'TRANSPOSE', n-k-kb+1, kb, -one,a( k, k+kb ), &
+                                    lda, b( k, k+kb ), ldb,one, a( k+kb, k+kb ), lda )
+                          call stdlib_dsymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_dtrsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, one,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**t)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_dsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_dtrsm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',n-k-kb+1, kb, &
+                                    one, b( k, k ), ldb,a( k+kb, k ), lda )
+                          call stdlib_dsymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_dsyr2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-one, a( k+kb, k &
+                                    ), lda, b( k+kb, k ),ldb, one, a( k+kb, k+kb ), lda )
+                          call stdlib_dsymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_dtrsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, one,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**t
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_dtrmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, one, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_dsymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_dsyr2k( uplo, 'NO TRANSPOSE', k-1, kb, one,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_dsymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_dtrmm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',k-1, kb, one, b( &
+                                 k, k ), ldb, a( 1, k ),lda )
+                       call stdlib_dsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**t*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_dtrmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, one, &
+                                 b, ldb, a( k, 1 ), lda )
+                       call stdlib_dsymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_dsyr2k( uplo, 'TRANSPOSE', k-1, kb, one,a( k, 1 ), lda, b( k, &
+                                 1 ), ldb, one, a,lda )
+                       call stdlib_dsymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_dtrmm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT', kb,k-1, one, b( &
+                                 k, k ), ldb, a( k, 1 ), lda )
+                       call stdlib_dsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dsygst
+
+     !> DSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_dsyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(dp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_dswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_dswap( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_dsyswapr
+
+     !> DSYTF2_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_dsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, dtemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_idamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_idamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_dswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_dswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_dswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_dswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_dswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_dswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_dsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_dscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_dsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = zero
+                    a( k-1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_idamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_idamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_dswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_dswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_dswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_dswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_dswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_dsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_dscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_dsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = zero
+                    a( k+1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_dsytf2_rk
+
+     !> DSYTF2_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dsytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, dtemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_idamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_idamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_dswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_dswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_dswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_dswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_dsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_dscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_dsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_idamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_idamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_dswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_dswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_dswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_dsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_dscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_dsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_dsytf2_rook
+
+     !> DSYTRF_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_dsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_dlasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_dsytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_dswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_dlasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_dsytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_dswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsytrf_rk
+
+     !> DSYTRF_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_dlasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_dsytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_dlasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_dsytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsytrf_rook
+
+     !> DSYTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> DSYTRF.
+
+     pure subroutine stdlib_dsytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_ddot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_ddot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_ddot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_dcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_ddot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_dswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_dswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_ddot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_ddot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_ddot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_dcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_ddot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_dswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_dswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_dsytri
+
+     !> DSYTRI_ROOK: computes the inverse of a real symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by DSYTRF_ROOK.
+
+     pure subroutine stdlib_dsytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_ddot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_dcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_ddot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_ddot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_dcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_ddot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_dswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_dswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_dswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_dswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_dswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_dswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_ddot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_dcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_ddot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_ddot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_dcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_dsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_ddot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_dswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_dswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_dswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_dsytri_rook
+
+     !> DSYTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF.
+
+     pure subroutine stdlib_dsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_dger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_dswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_dger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_dger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,a( 1, k+1 ), 1, one, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_dswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_dger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_dger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_dsytrs
+
+     !> DSYTRS2: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF and converted by DSYCONV.
+
+     pure subroutine stdlib_dsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_dsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_dswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_dtrsm('L','U','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_dscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_dtrsm('L','U','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_dswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_dtrsm('L','L','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_dscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_dtrsm('L','L','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_dsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_dsytrs2
+
+     !> DSYTRS_3: solves a system of linear equations A * X = B with a real
+     !> symmetric matrix A using the factorization computed
+     !> by DSYTRF_RK or DSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_dsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*), e(*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_dtrsm( 'L', 'U', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_dscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - one
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_dtrsm( 'L', 'U', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_dtrsm( 'L', 'L', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_dscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - one
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_dtrsm('L', 'L', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_dsytrs_3
+
+     !> DSYTRS_AA: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by DSYTRF_AA.
+
+     pure subroutine stdlib_dsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_dtrsm('L', 'U', 'T', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b( 2, 1 ), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_dlacpy( 'F', 1, n, a( 1, 1 ), lda+1, work( n ), 1)
+              if( n>1 ) then
+                 call stdlib_dlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                 call stdlib_dlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_dgtsv( n, nrhs, work( 1 ), work( n ), work( 2*n ), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_dtrsm( 'L', 'U', 'N', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b( 2, 1 ), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (u \ (t \ (u**t \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_dtrsm( 'L', 'L', 'N', 'U', n-1, nrhs, one, a( 2, 1 ),lda, b( 2, 1 ), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_dlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                 call stdlib_dlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                 call stdlib_dlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_dgtsv( n, nrhs, work( 1 ), work(n), work( 2*n ), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute (l**t \ b) -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_dtrsm( 'L', 'L', 'T', 'U', n-1, nrhs, one, a( 2, 1 ),lda, b( 2, 1 ), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dsytrs_aa
+
+     !> DSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a real symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF_ROOK.
+
+     pure subroutine stdlib_dsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_dger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_dswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_dger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_dger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ),&
+                               ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, &
+                           one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, one, b( &
+                              k, 1 ), ldb )
+                    call stdlib_dgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k+1 ), 1, one, &
+                              b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_dswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_dscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_dswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_dger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_dger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_dgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_dswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_dswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_dsytrs_rook
+
+     !> DTBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTBTRS or some other
+     !> means before entering this routine.  DTBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_dtbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_dcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dtbmv( uplo, trans, diag, n, kd, ab, ldab, work( n+1 ),1 )
+              call stdlib_daxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dtbsv( uplo, transt, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dtbsv( uplo, trans, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_dtbrfs
+
+     !> DTBTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_dtbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==zero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==zero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_dtbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_dtbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> DTFSM:  solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_dtfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           real(dp), intent(in) :: alpha
+           ! Array Arguments 
+           real(dp), intent(in) :: a(0:*)
+           real(dp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_dp)
+           if( alpha==zero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = zero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_dtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_dtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_dgemm( 'N', 'N', m2, n, m1, -one, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_dtrsm( 'L', 'U', 'T', diag, m2, n, one,a( m ), m, b( m1, &
+                                       0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_dtrsm( 'L', 'L', 'T', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_dtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_dgemm( 'T', 'N', m1, n, m2, -one, a( m1 ),m, b( m1, 0 ), &
+                                       ldb, alpha, b, ldb )
+                             call stdlib_dtrsm( 'L', 'L', 'T', diag, m1, n, one,a( 0 ), m, b, ldb &
+                                       )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_dgemm( 'T', 'N', m2, n, m1, -one, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, m2, n, one,a( m1 ), m, b( m1, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m1, n, m2, -one, a( 0 ), m,b( m1, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, m1, n, one,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_dtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_dtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_dgemm( 'T', 'N', m2, n, m1, -one,a( m1*m1 ), m1, b, ldb, &
+                                       alpha,b( m1, 0 ), ldb )
+                             call stdlib_dtrsm( 'L', 'L', 'N', diag, m2, n, one,a( 1 ), m1, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_dtrsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_dtrsm( 'L', 'L', 'T', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_dgemm( 'N', 'N', m1, n, m2, -one,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_dtrsm( 'L', 'U', 'N', diag, m1, n, one,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_dgemm( 'N', 'N', m2, n, m1, -one, a( 0 ), m2,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, m2, n, one,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_dgemm( 'T', 'N', m1, n, m2, -one, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, m1, n, one,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_dgemm( 'N', 'N', k, n, k, -one, a( k+1 ),m+1, b, ldb, alpha,&
+                                     b( k, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, k, n, one,a( 0 ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'T', 'N', k, n, k, -one, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, k, n, one,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_dgemm( 'T', 'N', k, n, k, -one, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, k, n, one,a( k ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', k, n, k, -one, a( 0 ), m+1,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, k, n, one,a( k+1 ), m+1, b, ldb &
+                                    )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_dgemm( 'T', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, k, n, one,a( 0 ), k, b( k, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_dgemm( 'N', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b( k, 0 ),&
+                                     ldb,alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, k, n, one,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'L', 'U', 'T', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_dgemm( 'N', 'N', k, n, k, -one, a( 0 ), k, b,ldb, alpha, b( &
+                                    k, 0 ), ldb )
+                          call stdlib_dtrsm( 'L', 'L', 'N', diag, k, n, one,a( k*k ), k, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'L', 'L', 'T', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'T', 'N', k, n, k, -one, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_dtrsm( 'L', 'U', 'N', diag, k, n, one,a( k*( k+1 ) ), k, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, n1, one,a( 0 ), n, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, n2, one,a( n ), n, b( 0, n1 )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n, &
+                                    alpha, b( 0, n1 ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, n2, one,a( n1 ), n, b( 0, n1 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, 0 ), ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, n1, one,a( n2 ), n, b( 0, 0 )&
+                                    , ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, n1, one,a( 0 ), n1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1*n1 ),&
+                                     n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, n2, one,a( 1 ), n1, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n2,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, n2, one,a( n1*n2 ), n2, b( 0,&
+                                     n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, n1, one,a( n2*n2 ), n2, b( 0,&
+                                     0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, k, one,a( 1 ), n+1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, k, one,a( 0 ), n+1, b( 0, k )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, k ),ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, k, one,a( k ), n+1, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, 0 ),ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, k, one,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_dgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, k, one,a( k ), k, b( 0, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_dgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, k, one,a( 0 ), k, b( 0, k ), &
+                                    ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_dtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_dgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_dtrsm( 'R', 'L', 'T', diag, m, k, one,a( k*k ), k, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_dtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_dgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_dtrsm( 'R', 'U', 'T', diag, m, k, one,a( ( k+1 )*k ), k, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtfsm
+
+     !> DTFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_dtfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: ap(0:*)
+           real(dp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtfttp
+
+     !> DTFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_dtfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(dp), intent(out) :: a(0:lda-1,0:*)
+           real(dp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 a( 0, 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtfttr
+
+     !> DTPRFB: applies a real "triangular-pentagonal" block reflector H or its
+     !> transpose H**T to a real matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_dtprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(dp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h**t c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a -   t (a + v**t b)  or  a = a -   t**t (a + v**t b)
+              ! b = b - v t (a + v**t b)  or  b = b - v t**t (a + v**t b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'U', 'T', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_dgemm( 'T', 'N', l, n, m-l, one, v, ldv, b, ldb,one, work, ldwork )
+                        
+              call stdlib_dgemm( 'T', 'N', k-l, n, m, one, v( 1, kp ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_dgemm( 'N', 'N', l, n, k-l, -one, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ),  ldb )
+              call stdlib_dtrmm( 'L', 'U', 'N', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h**t  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**t
+              ! b = b - (a + b v) t v**t  or  b = b - (a + b v) t**t v**t
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_dgemm( 'N', 'N', m, l, n-l, one, b, ldb,v, ldv, one, work, ldwork )
+                        
+              call stdlib_dgemm( 'N', 'N', m, k-l, n, one, b, ldb,v( 1, kp ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_dgemm( 'N', 'T', m, l, k-l, -one, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_dtrmm( 'R', 'U', 'T', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h**t c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a -   t (a + v**t b)  or  a = a -   t**t (a + v**t b)
+              ! b = b - v t (a + v**t b)  or  b = b - v t**t (a + v**t b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_dgemm( 'T', 'N', l, n, m-l, one, v( mp, kp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_dgemm( 'T', 'N', k-l, n, m, one, v, ldv,b, ldb, zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'L', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'N', m-l, n, k, -one, v( mp, 1 ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_dgemm( 'N', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b,  ldb )
+                        
+              call stdlib_dtrmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h**t  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**t
+              ! b = b - (a + b v) t v**t  or  b = b - (a + b v) t**t v**t
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_dgemm( 'N', 'N', m, l, n-l, one, b( 1, np ), ldb,v( np, kp ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_dgemm( 'N', 'N', m, k-l, n, one, b, ldb,v, ldv, zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v( np, 1 ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_dgemm( 'N', 'T', m, l, k-l, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_dtrmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**t c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w**t t w          or  h**t = i - w**t t**t w
+              ! a = a -     t (a + v b)  or  a = a -     t**t (a + v b)
+              ! b = b - v**t t (a + v b)  or  b = b - v**t t**t (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_dgemm( 'N', 'N', l, n, m-l, one, v, ldv,b, ldb,one, work, ldwork )
+                        
+              call stdlib_dgemm( 'N', 'N', k-l, n, m, one, v( kp, 1 ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_dgemm( 'T', 'N', l, n, k-l, -one, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ), ldb )
+              call stdlib_dtrmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**t  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**t t w            or  h**t = i - w**t t**t w
+              ! a = a - (a + b v**t) t      or  a = a - (a + b v**t) t**t
+              ! b = b - (a + b v**t) t v    or  b = b - (a + b v**t) t**t v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_dgemm( 'N', 'T', m, l, n-l, one, b, ldb, v, ldv,one, work, ldwork )
+                        
+              call stdlib_dgemm( 'N', 'T', m, k-l, n, one, b, ldb,v( kp, 1 ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_dgemm( 'N', 'N', m, l, k-l, -one, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_dtrmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**t c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w**t t w          or  h**t = i - w**t t**t w
+              ! a = a -     t (a + v b)  or  a = a -     t**t (a + v b)
+              ! b = b - v**t t (a + v b)  or  b = b - v**t t**t (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'U', 'N', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_dgemm( 'N', 'N', l, n, m-l, one, v( kp, mp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_dgemm( 'N', 'N', k-l, n, m, one, v, ldv, b, ldb,zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'L ', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', m-l, n, k, -one, v( 1, mp ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_dgemm( 'T', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b, ldb )
+                        
+              call stdlib_dtrmm( 'L', 'U', 'T', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**t  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**t t w            or  h**t = i - w**t t**t w
+              ! a = a - (a + b v**t) t      or  a = a - (a + b v**t) t**t
+              ! b = b - (a + b v**t) t v    or  b = b - (a + b v**t) t**t v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', 'T', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_dgemm( 'N', 'T', m, l, n-l, one, b( 1, np ), ldb,v( kp, np ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_dgemm( 'N', 'T', m, k-l, n, one, b, ldb, v, ldv,zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_dgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v( 1, np ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_dgemm( 'N', 'N', m, l, k-l , -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_dtprfb
+
+     !> DTPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTPTRS or some other
+     !> means before entering this routine.  DTPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_dtprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_dcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dtpmv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+              call stdlib_daxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dtpsv( uplo, transt, diag, n, ap, work( n+1 ), 1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dtpsv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_dtprfs
+
+     !> DTPTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_dtptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           real(dp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==zero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==zero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = one / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_dtpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_dscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = one / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_dtpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_dscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_dtptri
+
+     !> DTPTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_dtptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==zero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==zero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_dtpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_dtptrs
+
+     !> DTPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_dtpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(0:*)
+           real(dp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = ap( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtpttf
+
+     !> DTPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_dtpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(dp), intent(out) :: a(lda,*)
+           real(dp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_dtpttr
+
+     !> DTRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTRTRS or some other
+     !> means before entering this routine.  DTRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_dtrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_dcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dtrmv( uplo, trans, diag, n, a, lda, work( n+1 ), 1 )
+              call stdlib_daxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dtrsv( uplo, transt, diag, n, a, lda, work( n+1 ),1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dtrsv( uplo, trans, diag, n, a, lda, work( n+1 ),1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_dtrrfs
+
+     !> DTRTI2: computes the inverse of a real upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_dtrti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_dtrmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_dscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_dtrmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_dscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dtrti2
+
+     !> DTRTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_dtrtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DTRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_dtrti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_dtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, one, a, lda,&
+                               a( 1, j ), lda )
+                    call stdlib_dtrsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -one, a( j,&
+                               j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_dtrti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_dtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, one,&
+                                  a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_dtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_dtrti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dtrtri
+
+     !> DTRTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_dtrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           call stdlib_dtrsm( 'LEFT', uplo, trans, diag, n, nrhs, one, a, lda, b,ldb )
+           return
+     end subroutine stdlib_dtrtrs
+
+     !> DTRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_dtrttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(dp), intent(in) :: a(0:lda-1,0:*)
+           real(dp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 arf( 0 ) = a( 0, 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = a( n2+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = a( j-n1, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = a( n2+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = a( k+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = a( j-k, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = a( k+1+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtrttf
+
+     !> DTRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_dtrttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_dtrttp
+
+     !> DZSUM1: takes the sum of the absolute values of a complex
+     !> vector and returns a double precision result.
+     !> Based on DZASUM from the Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure real(dp) function stdlib_dzsum1( n, cx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           real(dp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_dzsum1 = zero
+           stemp = zero
+           if( n<=0 )return
+           if( incx==1 )go to 20
+           ! code for increment not equal to 1
+           nincx = n*incx
+           do i = 1, nincx, incx
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_dzsum1 = stemp
+           return
+           ! code for increment equal to 1
+           20 continue
+           do i = 1, n
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_dzsum1 = stemp
+           return
+     end function stdlib_dzsum1
+
+#:if WITH_QP
+     !> DLAG2Q: converts a SINGLE PRECISION matrix, SA, to a DOUBLE
+     !> PRECISION matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_dlag2q( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: sa(ldsa,*)
+           real(qp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_dlag2q
+#:endif
+     !> DBBCSD: computes the CS decomposition of an orthogonal matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See DORCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The orthogonal matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
+           ! Array Arguments 
+           real(dp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), work(*)
+           real(dp), intent(inout) :: phi(*), theta(*)
+           real(dp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(dp), parameter :: hundred = 100.0_dp
+           real(dp), parameter :: meighth = -0.125_dp
+           real(dp), parameter :: piover2 = 1.57079632679489661923132169163975144210_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
+           real(dp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lworkmin = 1
+              work(1) = lworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lworkopt = iv2tsn + q - 1
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'EPSILON' )
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_dlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_dlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_dp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_dlartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              else
+                 call stdlib_dlartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              end if
+              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
+              b11d(imin) = temp
+              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
+              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
+              b21d(imin) = temp
+              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_dlartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
+                            r )
+              else if( mu <= nu ) then
+                 call stdlib_dlartgs( b11e( imin ), b11d( imin + 1 ), mu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              else
+                 call stdlib_dlartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_dlartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
+                            r )
+              else if( nu < mu ) then
+                 call stdlib_dlartgs( b21e( imin ), b21d( imin + 1 ), nu,work(iu2cs+imin-1), work(&
+                           iu2sn+imin-1) )
+              else
+                 call stdlib_dlartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
+                           imin-1) )
+              end if
+              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
+              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
+              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
+              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_dlartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_dlartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_dlartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_dlartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 else
+                    call stdlib_dlartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 end if
+                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
+                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_dlartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( nu < mu ) then
+                    call stdlib_dlartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 else
+                    call stdlib_dlartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 end if
+                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
+                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
+                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
+                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_dlartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_dlartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_dlartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( mu <= nu ) then
+                    call stdlib_dlartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 else
+                    call stdlib_dlartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_dlartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( nu < mu ) then
+                    call stdlib_dlartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 else
+                    call stdlib_dlartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 end if
+                 work(iu2cs+i-1) = -work(iu2cs+i-1)
+                 work(iu2sn+i-1) = -work(iu2sn+i-1)
+                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
+                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
+                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
+                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_dlartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_dlartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_dlartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_dlartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_dlartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
+              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
+              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_dlasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_dlasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_dlasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_dlasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_dlasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_dlasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_dlasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_dlasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_dscal( q, negone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_dscal( q, negone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_dscal( p, negone, u1(1,imax), 1 )
+                    else
+                       call stdlib_dscal( p, negone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_dscal( m-p, negone, u2(1,imax), 1 )
+                    else
+                       call stdlib_dscal( m-p, negone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_dscal( m-q, negone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_dscal( m-q, negone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_dswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_dswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_dswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_dswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_dswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_dswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_dswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_dswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_dbbcsd
+
+     !> DDISNA: computes the reciprocal condition numbers for the eigenvectors
+     !> of a real symmetric or complex Hermitian matrix or for the left or
+     !> right singular vectors of a general m-by-n matrix. The reciprocal
+     !> condition number is the 'gap' between the corresponding eigenvalue or
+     !> singular value and the nearest other one.
+     !> The bound on the error, measured by angle in radians, in the I-th
+     !> computed vector is given by
+     !> DLAMCH( 'E' ) * ( ANORM / SEP( I ) )
+     !> where ANORM = 2-norm(A) = max( abs( D(j) ) ).  SEP(I) is not allowed
+     !> to be smaller than DLAMCH( 'E' )*ANORM in order to limit the size of
+     !> the error bound.
+     !> DDISNA may also be used to compute error bounds for eigenvectors of
+     !> the generalized symmetric definite eigenproblem.
+
+     pure subroutine stdlib_ddisna( job, m, n, d, sep, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           real(dp), intent(out) :: sep(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: decr, eigen, incr, left, right, sing
+           integer(ilp) :: i, k
+           real(dp) :: anorm, eps, newgap, oldgap, safmin, thresh
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           eigen = stdlib_lsame( job, 'E' )
+           left = stdlib_lsame( job, 'L' )
+           right = stdlib_lsame( job, 'R' )
+           sing = left .or. right
+           if( eigen ) then
+              k = m
+           else if( sing ) then
+              k = min( m, n )
+           end if
+           if( .not.eigen .and. .not.sing ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( k<0 ) then
+              info = -3
+           else
+              incr = .true.
+              decr = .true.
+              do i = 1, k - 1
+                 if( incr )incr = incr .and. d( i )<=d( i+1 )
+                 if( decr )decr = decr .and. d( i )>=d( i+1 )
+              end do
+              if( sing .and. k>0 ) then
+                 if( incr )incr = incr .and. zero<=d( 1 )
+                 if( decr )decr = decr .and. d( k )>=zero
+              end if
+              if( .not.( incr .or. decr ) )info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DDISNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! compute reciprocal condition numbers
+           if( k==1 ) then
+              sep( 1 ) = stdlib_dlamch( 'O' )
+           else
+              oldgap = abs( d( 2 )-d( 1 ) )
+              sep( 1 ) = oldgap
+              do i = 2, k - 1
+                 newgap = abs( d( i+1 )-d( i ) )
+                 sep( i ) = min( oldgap, newgap )
+                 oldgap = newgap
+              end do
+              sep( k ) = oldgap
+           end if
+           if( sing ) then
+              if( ( left .and. m>n ) .or. ( right .and. m<n ) ) then
+                 if( incr )sep( 1 ) = min( sep( 1 ), d( 1 ) )
+                 if( decr )sep( k ) = min( sep( k ), d( k ) )
+              end if
+           end if
+           ! ensure that reciprocal condition numbers are not less than
+           ! threshold, in order to limit the size of the error bound
+           eps = stdlib_dlamch( 'E' )
+           safmin = stdlib_dlamch( 'S' )
+           anorm = max( abs( d( 1 ) ), abs( d( k ) ) )
+           if( anorm==zero ) then
+              thresh = eps
+           else
+              thresh = max( eps*anorm, safmin )
+           end if
+           do i = 1, k
+              sep( i ) = max( sep( i ), thresh )
+           end do
+           return
+     end subroutine stdlib_ddisna
+
+     !> DGBBRD: reduces a real general m-by-n band matrix A to upper
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**T, or computes
+     !> Q**T*C for a given matrix C.
+
+     pure subroutine stdlib_dgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           real(dp), intent(out) :: d(*), e(*), pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mn,&
+                      mu, mu0, nr, nrt
+           real(dp) :: ra, rb, rc, rs
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**t to the unit matrix, if needed
+           if( wantq )call stdlib_dlaset( 'FULL', m, m, zero, one, q, ldq )
+           if( wantpt )call stdlib_dlaset( 'FULL', n, n, zero, one, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the sines of the plane rotations are stored in work(1:max(m,n))
+              ! and the cosines in work(max(m,n)+1:2*max(m,n)).
+              mn = max( m, n )
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_dlargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              work( mn+j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_dlartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,work( mn+j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_dlartg( ab( ku+ml-1, i ), ab( ku+ml, i ),work( mn+i+ml-1 ), &
+                                    work( i+ml-1 ),ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_drot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,work( mn+i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_drot( m, q( 1, j-1 ), 1, q( 1, j ), 1,work( mn+j ), work( j &
+                                    ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_drot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,work( mn+j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = work( mn+j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_dlargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               work( mn+j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_dlartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,work( mn+j1+kun ), work( j1+kun ),kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_dlartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),work( &
+                                    mn+i+mu-1 ), work( i+mu-1 ),ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_drot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,work( mn+i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**t
+                       do j = j1, j2, kb1
+                          call stdlib_drot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, work( &
+                                    mn+j+kun ),work( j+kun ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = work( mn+j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, storing diagonal elements in d
+              ! and off-diagonal elements in e
+              do i = 1, min( m-1, n )
+                 call stdlib_dlartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 d( i ) = ra
+                 if( i<n ) then
+                    e( i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_drot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc, rs )
+                 if( wantc )call stdlib_drot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+              if( m<=n )d( m ) = ab( 1, m )
+           else if( ku>0 ) then
+              ! a has been reduced to upper bidiagonal form
+              if( m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right, storing diagonal elements in d and off-diagonal
+                 ! elements in e
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_dlartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    d( i ) = ra
+                    if( i>1 ) then
+                       rb = -rs*ab( ku, i )
+                       e( i-1 ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_drot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, rs )
+                              
+                 end do
+              else
+                 ! copy off-diagonal elements to e and diagonal elements to d
+                 do i = 1, minmn - 1
+                    e( i ) = ab( ku, i+1 )
+                 end do
+                 do i = 1, minmn
+                    d( i ) = ab( ku+1, i )
+                 end do
+              end if
+           else
+              ! a is diagonal. set elements of e to zero and copy diagonal
+              ! elements to d.
+              do i = 1, minmn - 1
+                 e( i ) = zero
+              end do
+              do i = 1, minmn
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_dgbbrd
+
+     !> DGBCON: estimates the reciprocal of the condition number of a real
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by DGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_dgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(dp) :: ainvnm, scale, smlnum, t
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_daxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_dlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_dlatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, ldab, &
+                           work, scale, work( 2*n+1 ),info )
+                 ! multiply by inv(l**t).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_ddot( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_idamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_drscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_dgbcon
+
+     !> DGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_dgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_dgbequ
+
+     !> DGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from DGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_dgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_dlamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_dgbequb
+
+     !> DGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_dgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dgbmv( trans, n, n, kl, ku, -one, ab, ldab, x( 1, j ), 1,one, work( n+1 &
+                        ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = abs( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       work( i ) = work( i ) + abs( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + abs( ab( kk+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_dgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dgbrfs
+
+     !> DGBTRF: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           real(dp) :: temp
+           ! Local Arrays 
+           real(dp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'DGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_dgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! zero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = zero
+                 end do
+              end do
+              ! zero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = zero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to zero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = zero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to zero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = zero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_idamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=zero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_dswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_dswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_dswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_dscal( km, one / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_dger( km, jm-jj, -one, ab( kv+2, jj ), 1,ab( kv, jj+&
+                                 1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is zero, set info to the index of the pivot
+                       ! unless a zero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_dcopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_dlaswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_dlaswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, one, ab(&
+                                  kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+1, j+jb ), &
+                                    ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -one, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+kl+1-jb, j+jb ), &
+                                    ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, one, ab(&
+                                  kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,work13, ldwork, one, ab( 1+jb, j+kv ),ldab-1 )
+                                    
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -one, &
+                                    work31, ldwork, work13,ldwork, one, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_dswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_dswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_dcopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_dgbtrf
+
+     !> DGECON: estimates the reciprocal of the condition number of a general
+     !> real matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by DGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_dgecon( norm, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, scale, sl, smlnum, su
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_dlatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+                 ! multiply by inv(u).
+                 call stdlib_dlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, work( 3*n+1 ), info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_dlatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, su,&
+                            work( 3*n+1 ), info )
+                 ! multiply by inv(l**t).
+                 call stdlib_dlatrs( 'LOWER', 'TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_idamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_drscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_dgecon
+
+     !> DGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_dgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_dgeequ
+
+     !> DGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from DGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_dgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_dlamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_dgeequb
+
+     !> DGEMLQT: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by DGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_dgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           real(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_dlarfb( 'L', 'T', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_dlarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_dlarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_dlarfb( 'R', 'T', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_dgemlqt
+
+     !> DGEMQRT: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by DGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_dgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           real(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_dlarfb( 'L', 'T', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_dlarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_dlarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_dlarfb( 'R', 'T', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_dgemqrt
+
+     !> DGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by DGETC2.
+
+     pure subroutine stdlib_dgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, eps, smlnum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+            ! set constant to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_dlaswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_idamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = ( one / two ) / abs( rhs( i ) )
+              call stdlib_dscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*temp
+           end if
+           do i = n, 1, -1
+              temp = one / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_dlaswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_dgesc2
+
+     !> DGETC2: computes an LU factorization with complete pivoting of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is the Level 2 BLAS algorithm.
+
+     pure subroutine stdlib_dgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(dp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = smlnum
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin.
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_dswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_dswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = smin
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_dger( n-i, n-i, -one, a( i+1, i ), 1, a( i, i+1 ), lda,a( i+1, i+1 ), &
+                        lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = smin
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_dgetc2
+
+     !> DGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_dgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_idamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=zero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_dswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_dscal( m-j, one / a( j, j ), a( j+1, j ), 1 )
+                    else
+                      do i = 1, m-j
+                         a( j+i, j ) = a( j+i, j ) / a( j, j )
+                      end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_dger( m-j, n-j, -one, a( j+1, j ), 1, a( j, j+1 ), lda,a( j+1, j+1 ),&
+                            lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_dgetf2
+
+     !> DGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_dgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: sfmin, temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for one row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==zero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for one column case
+              ! compute machine safe minimum
+              sfmin = stdlib_dlamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_idamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=zero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_dscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_dgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_dlaswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_dtrsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_dgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_dgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_dlaswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_dgetrf2
+
+     !> DGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by DGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_dgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_dtrtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_dtrtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = zero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_dgemv( 'NO TRANSPOSE', n, n-j, -one, a( 1, j+1 ),lda, work( &
+                           j+1 ), 1, one, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = zero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           one, a( 1, j+jb ), lda,work( j+jb ), ldwork, one, a( 1, j ), lda )
+                 call stdlib_dtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,one, work( j )&
+                           , ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_dswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgetri
+
+     !> DGETRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by DGETRF.
+
+     pure subroutine stdlib_dgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_dlaswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,one, a, lda, b, &
+                        ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a**t * x = b.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_dtrsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'UNIT', n, nrhs, one,a, lda, b, &
+                        ldb )
+              ! apply row interchanges to the solution vectors.
+              call stdlib_dlaswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_dgetrs
+
+     !> DGGBAL: balances a pair of general real matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_dggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: lscale(*), rscale(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sclfac = 1.0e+1_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(dp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log10,max,min,sign
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = one
+           lscale( 1 ) = one
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_dswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_dswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_dswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_dswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 tb = b( i, j )
+                 ta = a( i, j )
+                 if( ta==zero )go to 210
+                 ta = log10( abs( ta ) ) / basl
+                 210 continue
+                 if( tb==zero )go to 220
+                 tb = log10( abs( tb ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=dp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_ddot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_ddot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_dscal( nr, beta, work( ilo ), 1 )
+           call stdlib_dscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_daxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_daxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==zero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==zero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=dp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==zero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==zero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=dp)*work( j ) + sum
+           end do
+           sum = stdlib_ddot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_ddot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_daxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_daxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_dlamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_idamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_idamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = int(lscale( i ) + sign( half, lscale( i ) ),KIND=ilp)
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_idamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_idamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = int(rscale( i ) + sign( half, rscale( i ) ),KIND=ilp)
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_dscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_dscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_dscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_dscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_dggbal
+
+     !> DGGHRD: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then DGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_dgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(dp) :: c, s, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_dlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = zero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 temp = a( jrow-1, jcol )
+                 call stdlib_dlartg( temp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = zero
+                 call stdlib_drot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_drot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_drot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c, s )
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 temp = b( jrow, jrow )
+                 call stdlib_dlartg( temp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = zero
+                 call stdlib_drot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_drot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_drot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_dgghrd
+
+     !> DGTTRS: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by DGTTRF.
+
+     pure subroutine stdlib_dgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else
+              itrans = 1
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'DGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_dgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_dgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_dgttrs
+
+     !> DISNAN: returns .TRUE. if its argument is NaN, and .FALSE.
+     !> otherwise.  To be replaced by the Fortran 2003 intrinsic in the
+     !> future.
+
+     pure logical(lk) function stdlib_disnan( din )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: din
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_disnan = stdlib_dlaisnan(din,din)
+           return
+     end function stdlib_disnan
+
+     !> DLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_dla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dla_gbamv
+
+     !> DLA_GBRCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(dp) function stdlib_dla_gbrcond( trans, n, kl, ku, ab, ldab,afb, ldafb, ipiv, cmode, c,&
+               info, work, iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, ldab, ldafb, kl, ku, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: ab(ldab,*), afb(ldafb,*), c(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j, kd, ke
+           real(dp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_dla_gbrcond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_GBRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_dla_gbrcond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                    if ( cmode == 1 ) then
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) * c( j ) )
+                       end do
+                    else if ( cmode == 0 ) then
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) )
+                       end do
+                    else
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) / c( j ) )
+                       end do
+                    end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_dgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_dgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_dgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_dgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_dla_gbrcond = ( one / ainvnm )
+           return
+     end function stdlib_dla_gbrcond
+
+     !> DLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_dla_geamv ( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n, trans
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' )) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dla_geamv
+
+     !> DLA_GERCOND: estimates the Skeel condition number of op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(dp) function stdlib_dla_gercond( trans, n, a, lda, af,ldaf, ipiv, cmode, c,info, work, &
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_dla_gercond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_GERCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_dla_gercond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if (notrans) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work(i) = work(i) * work(2*n+i)
+                 end do
+                 if (notrans) then
+                    call stdlib_dgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_dgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if (notrans) then
+                    call stdlib_dgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_dgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_dla_gercond = ( one / ainvnm )
+           return
+     end function stdlib_dla_gercond
+
+     !> DLA_LIN_BERR: computes component-wise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the component-wise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_dla_lin_berr ( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ayb(n,nrhs)
+           real(dp), intent(out) :: berr(nrhs)
+           real(dp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: tmp,safe1
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the sla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= zero) then
+                    tmp = (safe1+abs(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_dp (and if computed by sla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_dla_lin_berr
+
+     !> DLA_PORCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(dp) function stdlib_dla_porcond( uplo, n, a, lda, af, ldaf,cmode, c, info, work,iwork )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(dp), intent(out) :: work(*)
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, tmp
+           logical(lk) :: up
+           ! Array Arguments 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_dla_porcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_PORCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_dla_porcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j ,i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if (up) then
+                    call stdlib_dpotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_dpotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_dpotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_dpotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_dla_porcond = ( one / ainvnm )
+           return
+     end function stdlib_dla_porcond
+
+     !> DLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_dla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) ) then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dla_syamv
+
+     !> DLA_SYRCOND: estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(dp) function stdlib_dla_syrcond( uplo, n, a, lda, af, ldaf,ipiv, cmode, c, info, work,&
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           character :: normin
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, smlnum, tmp
+           logical(lk) :: up
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_dla_syrcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_SYRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_dla_syrcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ainvnm = zero
+           normin = 'N'
+           kase = 0
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( up ) then
+                    call stdlib_dsytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_dsytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_dsytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_dsytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_dla_syrcond = ( one / ainvnm )
+           return
+     end function stdlib_dla_syrcond
+
+     !> DLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(dp) function stdlib_dla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(dp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_dsytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k-1 ) = max( abs( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k+1 ) = max( abs( af(i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_dla_syrpvgrw = rpvgrw
+     end function stdlib_dla_syrpvgrw
+
+
+     pure subroutine stdlib_dladiv1( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(inout) :: a
+           real(dp), intent(in) :: b, c, d
+           real(dp), intent(out) :: p, q
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: r, t
+           ! Executable Statements 
+           r = d / c
+           t = one / (c + d * r)
+           p = stdlib_dladiv2(a, b, c, d, r, t)
+           a = -a
+           q = stdlib_dladiv2(b, a, c, d, r, t)
+           return
+     end subroutine stdlib_dladiv1
+
+     !> DLAED6: computes the positive or negative root (closest to the origin)
+     !> of
+     !> z(1)        z(2)        z(3)
+     !> f(x) =   rho + --------- + ---------- + ---------
+     !> d(1)-x      d(2)-x      d(3)-x
+     !> It is assumed that
+     !> if ORGATI = .true. the root is between d(2) and d(3);
+     !> otherwise it is between d(1) and d(2)
+     !> This routine will be called by DLAED4 when necessary. In most cases,
+     !> the root sought is the smallest in magnitude, though it might not be
+     !> in some extremely rare situations.
+
+     pure subroutine stdlib_dlaed6( kniter, orgati, rho, d, z, finit, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: orgati
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kniter
+           real(dp), intent(in) :: finit, rho
+           real(dp), intent(out) :: tau
+           ! Array Arguments 
+           real(dp), intent(in) :: d(3), z(3)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           
+           
+           ! Local Arrays 
+           real(dp) :: dscale(3), zscale(3)
+           ! Local Scalars 
+           logical(lk) :: scale
+           integer(ilp) :: i, iter, niter
+           real(dp) :: a, b, base, c, ddf, df, eps, erretm, eta, f, fc, sclfac, sclinv, small1, &
+                     small2, sminv1, sminv2, temp, temp1, temp2, temp3, temp4, lbd, ubd
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           if( orgati ) then
+              lbd = d(2)
+              ubd = d(3)
+           else
+              lbd = d(1)
+              ubd = d(2)
+           end if
+           if( finit < zero )then
+              lbd = zero
+           else
+              ubd = zero
+           end if
+           niter = 1
+           tau = zero
+           if( kniter==2 ) then
+              if( orgati ) then
+                 temp = ( d( 3 )-d( 2 ) ) / two
+                 c = rho + z( 1 ) / ( ( d( 1 )-d( 2 ) )-temp )
+                 a = c*( d( 2 )+d( 3 ) ) + z( 2 ) + z( 3 )
+                 b = c*d( 2 )*d( 3 ) + z( 2 )*d( 3 ) + z( 3 )*d( 2 )
+              else
+                 temp = ( d( 1 )-d( 2 ) ) / two
+                 c = rho + z( 3 ) / ( ( d( 3 )-d( 2 ) )-temp )
+                 a = c*( d( 1 )+d( 2 ) ) + z( 1 ) + z( 2 )
+                 b = c*d( 1 )*d( 2 ) + z( 1 )*d( 2 ) + z( 2 )*d( 1 )
+              end if
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 tau = b / a
+              else if( a<=zero ) then
+                 tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( tau < lbd .or. tau > ubd )tau = ( lbd+ubd )/two
+              if( d(1)==tau .or. d(2)==tau .or. d(3)==tau ) then
+                 tau = zero
+              else
+                 temp = finit + tau*z(1)/( d(1)*( d( 1 )-tau ) ) +tau*z(2)/( d(2)*( d( 2 )-tau ) )&
+                            +tau*z(3)/( d(3)*( d( 3 )-tau ) )
+                 if( temp <= zero )then
+                    lbd = tau
+                 else
+                    ubd = tau
+                 end if
+                 if( abs( finit )<=abs( temp ) )tau = zero
+              end if
+           end if
+           ! get machine parameters for possible scaling to avoid overflow
+           ! modified by sven: parameters small1, sminv1, small2,
+           ! sminv2, eps are not saved anymore between one call to the
+           ! others but recomputed at each call
+           eps = stdlib_dlamch( 'EPSILON' )
+           base = stdlib_dlamch( 'BASE' )
+           small1 = base**( int( log( stdlib_dlamch( 'SAFMIN' ) ) / log( base ) /three,KIND=ilp) )
+                     
+           sminv1 = one / small1
+           small2 = small1*small1
+           sminv2 = sminv1*sminv1
+           ! determine if scaling of inputs necessary to avoid overflow
+           ! when computing 1/temp**3
+           if( orgati ) then
+              temp = min( abs( d( 2 )-tau ), abs( d( 3 )-tau ) )
+           else
+              temp = min( abs( d( 1 )-tau ), abs( d( 2 )-tau ) )
+           end if
+           scale = .false.
+           if( temp<=small1 ) then
+              scale = .true.
+              if( temp<=small2 ) then
+              ! scale up by power of radix nearest 1/safmin**(2/3)
+                 sclfac = sminv2
+                 sclinv = small2
+              else
+              ! scale up by power of radix nearest 1/safmin**(1/3)
+                 sclfac = sminv1
+                 sclinv = small1
+              end if
+              ! scaling up safe because d, z, tau scaled elsewhere to be o(1)
+              do i = 1, 3
+                 dscale( i ) = d( i )*sclfac
+                 zscale( i ) = z( i )*sclfac
+              end do
+              tau = tau*sclfac
+              lbd = lbd*sclfac
+              ubd = ubd*sclfac
+           else
+              ! copy d and z to dscale and zscale
+              do i = 1, 3
+                 dscale( i ) = d( i )
+                 zscale( i ) = z( i )
+              end do
+           end if
+           fc = zero
+           df = zero
+           ddf = zero
+           do i = 1, 3
+              temp = one / ( dscale( i )-tau )
+              temp1 = zscale( i )*temp
+              temp2 = temp1*temp
+              temp3 = temp2*temp
+              fc = fc + temp1 / dscale( i )
+              df = df + temp2
+              ddf = ddf + temp3
+           end do
+           f = finit + tau*fc
+           if( abs( f )<=zero )go to 60
+           if( f <= zero )then
+              lbd = tau
+           else
+              ubd = tau
+           end if
+              ! iteration begins -- use gragg-thornton-warner cubic convergent
+                                  ! scheme
+           ! it is not hard to see that
+                 ! 1) iterations will go up monotonically
+                    ! if finit < 0;
+                 ! 2) iterations will go down monotonically
+                    ! if finit > 0.
+           iter = niter + 1
+           loop_50: do niter = iter, maxit
+              if( orgati ) then
+                 temp1 = dscale( 2 ) - tau
+                 temp2 = dscale( 3 ) - tau
+              else
+                 temp1 = dscale( 1 ) - tau
+                 temp2 = dscale( 2 ) - tau
+              end if
+              a = ( temp1+temp2 )*f - temp1*temp2*df
+              b = temp1*temp2*f
+              c = f - ( temp1+temp2 )*df + temp1*temp2*ddf
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 eta = b / a
+              else if( a<=zero ) then
+                 eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( f*eta>=zero ) then
+                 eta = -f / df
+              end if
+              tau = tau + eta
+              if( tau < lbd .or. tau > ubd )tau = ( lbd + ubd )/two
+              fc = zero
+              erretm = zero
+              df = zero
+              ddf = zero
+              do i = 1, 3
+                 if ( ( dscale( i )-tau )/=zero ) then
+                    temp = one / ( dscale( i )-tau )
+                    temp1 = zscale( i )*temp
+                    temp2 = temp1*temp
+                    temp3 = temp2*temp
+                    temp4 = temp1 / dscale( i )
+                    fc = fc + temp4
+                    erretm = erretm + abs( temp4 )
+                    df = df + temp2
+                    ddf = ddf + temp3
+                 else
+                    go to 60
+                 end if
+              end do
+              f = finit + tau*fc
+              erretm = eight*( abs( finit )+abs( tau )*erretm ) +abs( tau )*df
+              if( ( abs( f )<=four*eps*erretm ) .or.( (ubd-lbd)<=four*eps*abs(tau) )  )go to &
+                        60
+              if( f <= zero )then
+                 lbd = tau
+              else
+                 ubd = tau
+              end if
+           end do loop_50
+           info = 1
+           60 continue
+           ! undo scaling
+           if( scale )tau = tau*sclinv
+           return
+     end subroutine stdlib_dlaed6
+
+     !> DLAGS2: computes 2-by-2 orthogonal matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**T *A*Q = U**T *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**T*B*Q = V**T *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**T *A*Q = U**T *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**T*B*Q = V**T*( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> The rows of the transformed A and B are parallel, where
+     !> U = (  CSU  SNU ), V = (  CSV SNV ), Q = (  CSQ   SNQ )
+     !> ( -SNU  CSU )      ( -SNV CSV )      ( -SNQ   CSQ )
+     !> Z**T denotes the transpose of Z.
+
+     pure subroutine stdlib_dlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(dp), intent(in) :: a1, a2, a3, b1, b2, b3
+           real(dp), intent(out) :: csq, csu, csv, snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: a, aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, b, c, csl, csr, &
+           d, r, s1, s2, snl, snr, ua11, ua11r, ua12, ua21, ua22, ua22r, vb11, vb11r, vb12, vb21, &
+                     vb22, vb22r
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_dlasv2( a, b, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + snr*b3
+                 aua12 = abs( csl )*abs( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**t *a and v**t *b
+                 if( ( abs( ua11r )+abs( ua12 ) )/=zero ) then
+                    if( aua12 / ( abs( ua11r )+abs( ua12 ) )<=avb12 /( abs( vb11r )+abs( vb12 ) ) &
+                              ) then
+                       call stdlib_dlartg( -ua11r, ua12, csq, snq, r )
+                    else
+                       call stdlib_dlartg( -vb11r, vb12, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_dlartg( -vb11r, vb12, csq, snq, r )
+                 end if
+                 csu = csl
+                 snu = -snl
+                 csv = csr
+                 snv = -snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snl*a1
+                 ua22 = -snl*a2 + csl*a3
+                 vb21 = -snr*b1
+                 vb22 = -snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua21 )+abs( ua22 ) )/=zero ) then
+                    if( aua22 / ( abs( ua21 )+abs( ua22 ) )<=avb22 /( abs( vb21 )+abs( vb22 ) ) ) &
+                              then
+                       call stdlib_dlartg( -ua21, ua22, csq, snq, r )
+                    else
+                       call stdlib_dlartg( -vb21, vb22, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_dlartg( -vb21, vb22, csq, snq, r )
+                 end if
+                 csu = snl
+                 snu = csl
+                 csv = snr
+                 snv = csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_dlasv2( a, c, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs( b2 )
+                 ! zero (2,1) elements of u**t *a and v**t *b.
+                 if( ( abs( ua21 )+abs( ua22r ) )/=zero ) then
+                    if( aua21 / ( abs( ua21 )+abs( ua22r ) )<=avb21 /( abs( vb21 )+abs( vb22r ) ) &
+                              ) then
+                       call stdlib_dlartg( ua22r, ua21, csq, snq, r )
+                    else
+                       call stdlib_dlartg( vb22r, vb21, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_dlartg( vb22r, vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -snr
+                 csv = csl
+                 snv = -snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11 = csr*a1 + snr*a2
+                 ua12 = snr*a3
+                 vb11 = csl*b1 + snl*b2
+                 vb12 = snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs( b2 )
+                 ! zero (1,1) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua11 )+abs( ua12 ) )/=zero ) then
+                    if( aua11 / ( abs( ua11 )+abs( ua12 ) )<=avb11 /( abs( vb11 )+abs( vb12 ) ) ) &
+                              then
+                       call stdlib_dlartg( ua12, ua11, csq, snq, r )
+                    else
+                       call stdlib_dlartg( vb12, vb11, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_dlartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = csr
+                 csv = snl
+                 snv = csl
+              end if
+           end if
+           return
+     end subroutine stdlib_dlags2
+
+     !> DLAGTF: factorizes the matrix (T - lambda*I), where T is an n by n
+     !> tridiagonal matrix and lambda is a scalar, as
+     !> T - lambda*I = PLU,
+     !> where P is a permutation matrix, L is a unit lower tridiagonal matrix
+     !> with at most one non-zero sub-diagonal elements per column and U is
+     !> an upper triangular matrix with at most two non-zero super-diagonal
+     !> elements per column.
+     !> The factorization is obtained by Gaussian elimination with partial
+     !> pivoting and implicit row scaling.
+     !> The parameter LAMBDA is included in the routine so that DLAGTF may
+     !> be used, in conjunction with DLAGTS, to obtain eigenvectors of T by
+     !> inverse iteration.
+
+     pure subroutine stdlib_dlagtf( n, a, lambda, b, c, tol, d, in, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: lambda, tol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: in(*)
+           real(dp), intent(inout) :: a(*), b(*), c(*)
+           real(dp), intent(out) :: d(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(dp) :: eps, mult, piv1, piv2, scale1, scale2, temp, tl
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLAGTF', -info )
+              return
+           end if
+           if( n==0 )return
+           a( 1 ) = a( 1 ) - lambda
+           in( n ) = 0
+           if( n==1 ) then
+              if( a( 1 )==zero )in( 1 ) = 1
+              return
+           end if
+           eps = stdlib_dlamch( 'EPSILON' )
+           tl = max( tol, eps )
+           scale1 = abs( a( 1 ) ) + abs( b( 1 ) )
+           loop_10: do k = 1, n - 1
+              a( k+1 ) = a( k+1 ) - lambda
+              scale2 = abs( c( k ) ) + abs( a( k+1 ) )
+              if( k<( n-1 ) )scale2 = scale2 + abs( b( k+1 ) )
+              if( a( k )==zero ) then
+                 piv1 = zero
+              else
+                 piv1 = abs( a( k ) ) / scale1
+              end if
+              if( c( k )==zero ) then
+                 in( k ) = 0
+                 piv2 = zero
+                 scale1 = scale2
+                 if( k<( n-1 ) )d( k ) = zero
+              else
+                 piv2 = abs( c( k ) ) / scale2
+                 if( piv2<=piv1 ) then
+                    in( k ) = 0
+                    scale1 = scale2
+                    c( k ) = c( k ) / a( k )
+                    a( k+1 ) = a( k+1 ) - c( k )*b( k )
+                    if( k<( n-1 ) )d( k ) = zero
+                 else
+                    in( k ) = 1
+                    mult = a( k ) / c( k )
+                    a( k ) = c( k )
+                    temp = a( k+1 )
+                    a( k+1 ) = b( k ) - mult*temp
+                    if( k<( n-1 ) ) then
+                       d( k ) = b( k+1 )
+                       b( k+1 ) = -mult*d( k )
+                    end if
+                    b( k ) = temp
+                    c( k ) = mult
+                 end if
+              end if
+              if( ( max( piv1, piv2 )<=tl ) .and. ( in( n )==0 ) )in( n ) = k
+           end do loop_10
+           if( ( abs( a( n ) )<=scale1*tl ) .and. ( in( n )==0 ) )in( n ) = n
+           return
+     end subroutine stdlib_dlagtf
+
+     !> DLAGTS: may be used to solve one of the systems of equations
+     !> (T - lambda*I)*x = y   or   (T - lambda*I)**T*x = y,
+     !> where T is an n by n tridiagonal matrix, for x, following the
+     !> factorization of (T - lambda*I) as
+     !> (T - lambda*I) = P*L*U ,
+     !> by routine DLAGTF. The choice of equation to be solved is
+     !> controlled by the argument JOB, and in each case there is an option
+     !> to perturb zero or very small diagonal elements of U, this option
+     !> being intended for use in applications such as inverse iteration.
+
+     pure subroutine stdlib_dlagts( job, n, a, b, c, d, in, y, tol, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: job, n
+           real(dp), intent(inout) :: tol
+           ! Array Arguments 
+           integer(ilp), intent(in) :: in(*)
+           real(dp), intent(in) :: a(*), b(*), c(*), d(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(dp) :: absak, ak, bignum, eps, pert, sfmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign
+           ! Executable Statements 
+           info = 0
+           if( ( abs( job )>2 ) .or. ( job==0 ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAGTS', -info )
+              return
+           end if
+           if( n==0 )return
+           eps = stdlib_dlamch( 'EPSILON' )
+           sfmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           bignum = one / sfmin
+           if( job<0 ) then
+              if( tol<=zero ) then
+                 tol = abs( a( 1 ) )
+                 if( n>1 )tol = max( tol, abs( a( 2 ) ), abs( b( 1 ) ) )
+                 do k = 3, n
+                    tol = max( tol, abs( a( k ) ), abs( b( k-1 ) ),abs( d( k-2 ) ) )
+                 end do
+                 tol = tol*eps
+                 if( tol==zero )tol = eps
+              end if
+           end if
+           if( abs( job )==1 ) then
+              do k = 2, n
+                 if( in( k-1 )==0 ) then
+                    y( k ) = y( k ) - c( k-1 )*y( k-1 )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+              if( job==1 ) then
+                 loop_30: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_30
+              else
+                 loop_50: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    40 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 40
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 40
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_50
+              end if
+           else
+              ! come to here if  job = 2 or -2
+              if( job==2 ) then
+                 loop_60: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_60
+              else
+                 loop_80: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    70 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 70
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 70
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_80
+              end if
+              do k = n, 2, -1
+                 if( in( k-1 )==0 ) then
+                    y( k-1 ) = y( k-1 ) - c( k-1 )*y( k )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+           end if
+     end subroutine stdlib_dlagts
+
+     !> DLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then DLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L       0  ]
+     !> Lhat = [ w**T gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**T and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ alpha ]
+     !> [ gamma ]
+     !> where  alpha =  x**T*w.
+
+     pure subroutine stdlib_dlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(dp), intent(out) :: c, s, sestpr
+           real(dp), intent(in) :: gamma, sest
+           ! Array Arguments 
+           real(dp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: absalp, absest, absgam, alpha, b, cosine, eps, norma, s1, s2, sine, t, &
+                     test, tmp, zeta1, zeta2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           eps = stdlib_dlamch( 'EPSILON' )
+           alpha = stdlib_ddot( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = sqrt( s*s+c*c )
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = s2*s
+                    c = ( gamma / s2 ) / s
+                    s = sign( one, alpha ) / s
+                 else
+                    tmp = s2 / s1
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = s1*c
+                    s = ( alpha / s1 ) / c
+                    c = sign( one, gamma ) / c
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = c / ( b+sqrt( b*b+c ) )
+                 else
+                    t = sqrt( b*b+c ) - b
+                 end if
+                 sine = -zeta1 / t
+                 cosine = -zeta2 / ( one+t )
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -gamma
+                    cosine = alpha
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = sqrt( s*s+c*c )
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / c )
+                    s = -( gamma / s2 ) / c
+                    c = sign( one, alpha ) / c
+                 else
+                    tmp = s2 / s1
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = absest / s
+                    c = ( alpha / s1 ) / s
+                    s = -sign( one, gamma ) / s
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 norma = max( one+zeta1*zeta1+abs( zeta1*zeta2 ),abs( zeta1*zeta2 )+zeta2*zeta2 )
+                           
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = c / ( b+sqrt( abs( b*b-c ) ) )
+                    sine = zeta1 / ( one-t )
+                    cosine = -zeta2 / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = -c / ( b+sqrt( b*b+c ) )
+                    else
+                       t = b - sqrt( b*b+c )
+                    end if
+                    sine = -zeta1 / t
+                    cosine = -zeta2 / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_dlaic1
+
+     !> DLANEG: computes the Sturm count, the number of negative pivots
+     !> encountered while factoring tridiagonal T - sigma I = L D L^T.
+     !> This implementation works directly on the factors without forming
+     !> the tridiagonal matrix T.  The Sturm count is also the number of
+     !> eigenvalues of T less than sigma.
+     !> This routine is called from DLARRB.
+     !> The current routine does not use the PIVMIN parameter but rather
+     !> requires IEEE-754 propagation of Infinities and NaNs.  This
+     !> routine also has no input range restrictions but does require
+     !> default exception handling such that x/0 produces Inf when x is
+     !> non-zero, and Inf/Inf produces NaN.  For more information, see:
+     !> Marques, Riedy, and Voemel, "Benefits of IEEE-754 Features in
+     !> Modern Symmetric Tridiagonal Eigensolvers," SIAM Journal on
+     !> Scientific Computing, v28, n5, 2006.  DOI 10.1137/050641624
+     !> (Tech report version in LAWN 172 with the same title.)
+
+     pure integer(ilp) function stdlib_dlaneg( n, d, lld, sigma, pivmin, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, r
+           real(dp), intent(in) :: pivmin, sigma
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), lld(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blklen = 128
+           
+           ! some architectures propagate infinities and nans very slowly, so
+           ! the code computes counts in blklen chunks.  then a nan can
+           ! propagate at most blklen columns before being detected.  this is
+           ! not a general tuning parameter; it needs only to be just large
+           ! enough that the overhead is tiny in common cases.
+           
+           ! Local Scalars 
+           integer(ilp) :: bj, j, neg1, neg2, negcnt
+           real(dp) :: bsav, dminus, dplus, gamma, p, t, tmp
+           logical(lk) :: sawnan
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           negcnt = 0
+           ! i) upper part: l d l^t - sigma i = l+ d+ l+^t
+           t = -sigma
+           loop_210: do bj = 1, r-1, blklen
+              neg1 = 0
+              bsav = t
+              do j = bj, min(bj+blklen-1, r-1)
+                 dplus = d( j ) + t
+                 if( dplus<zero ) neg1 = neg1 + 1
+                 tmp = t / dplus
+                 t = tmp * lld( j ) - sigma
+              end do
+              sawnan = stdlib_disnan( t )
+           ! run a slower version of the above loop if a nan is detected.
+           ! a nan should occur only with a zero pivot after an infinite
+           ! pivot.  in that case, substituting 1 for t/dplus is the
+           ! correct limit.
+              if( sawnan ) then
+                 neg1 = 0
+                 t = bsav
+                 do j = bj, min(bj+blklen-1, r-1)
+                    dplus = d( j ) + t
+                    if( dplus<zero ) neg1 = neg1 + 1
+                    tmp = t / dplus
+                    if (stdlib_disnan(tmp)) tmp = one
+                    t = tmp * lld(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg1
+           end do loop_210
+           ! ii) lower part: l d l^t - sigma i = u- d- u-^t
+           p = d( n ) - sigma
+           do bj = n-1, r, -blklen
+              neg2 = 0
+              bsav = p
+              do j = bj, max(bj-blklen+1, r), -1
+                 dminus = lld( j ) + p
+                 if( dminus<zero ) neg2 = neg2 + 1
+                 tmp = p / dminus
+                 p = tmp * d( j ) - sigma
+              end do
+              sawnan = stdlib_disnan( p )
+           ! as above, run a slower version that substitutes 1 for inf/inf.
+              if( sawnan ) then
+                 neg2 = 0
+                 p = bsav
+                 do j = bj, max(bj-blklen+1, r), -1
+                    dminus = lld( j ) + p
+                    if( dminus<zero ) neg2 = neg2 + 1
+                    tmp = p / dminus
+                    if (stdlib_disnan(tmp)) tmp = one
+                    p = tmp * d(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg2
+           end do
+           ! iii) twist index
+             ! t was shifted by sigma initially.
+           gamma = (t + sigma) + p
+           if( gamma<zero ) negcnt = negcnt+1
+           stdlib_dlaneg = negcnt
+     end function stdlib_dlaneg
+
+     !> DLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(dp) function stdlib_dlangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(dp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_dlassq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlangb = value
+           return
+     end function stdlib_dlangb
+
+     !> DLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real matrix A.
+
+     real(dp) function stdlib_dlange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_dlassq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlange = value
+           return
+     end function stdlib_dlange
+
+     !> DLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real tridiagonal matrix A.
+
+     pure real(dp) function stdlib_dlangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_disnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_disnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_disnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_dlassq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_dlassq( n-1, dl, 1, scale, sum )
+                 call stdlib_dlassq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_dlangt = anorm
+           return
+     end function stdlib_dlangt
+
+     !> DLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(dp) function stdlib_dlanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_dlassq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlanhs = value
+           return
+     end function stdlib_dlanhs
+
+     !> DLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(dp) function stdlib_dlansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_dlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_dlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_dlassq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlansb = value
+           return
+     end function stdlib_dlansb
+
+     !> DLANSF: returns the value of the one norm, or the Frobenius norm, or
+     !> the infinity norm, or the element of largest absolute value of a
+     !> real symmetric matrix A in RFP format.
+
+     real(dp) function stdlib_dlansf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(0:*)
+           real(dp), intent(out) :: work(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(dp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_dlansf = zero
+              return
+           else if( n==1 ) then
+              stdlib_dlansf = abs( a(0) )
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='t or 't' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'T' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                 ! a is n by k
+                    do j = 0, k - 1
+                       do i = 0, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n
+                    do j = 0, n - 1
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    do j = 0, k - 1
+                       do i = 0, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n+1
+                    do j = 0, n
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              if( ifm==1 ) then
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( a( 0+j*lda ) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( a( 0+j*lda ) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                           ! i=j-1-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( a( 0 ) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_dlassq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_dlassq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k-1, a( k ), lda+1, scale, s )
+                       ! tri l at a(k,0)
+                       call stdlib_dlassq( k, a( k-1 ), lda+1, scale, s )
+                       ! tri u at a(k-1,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_dlassq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_dlassq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri l at a(0,0)
+                       call stdlib_dlassq( k-1, a( 0+lda ), lda+1, scale, s )
+                       ! tri u at a(0,1)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 2
+                          call stdlib_dlassq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_dlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_dlassq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k-1, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k)
+                       call stdlib_dlassq( k, a( 0+( k-1 )*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k-1)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_dlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_dlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_dlassq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                       call stdlib_dlassq( k-1, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_dlassq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                          ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_dlassq( k+j, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( k+1 ), lda+1, scale, s )
+                       ! tri l at a(k+1,0)
+                       call stdlib_dlassq( k, a( k ), lda+1, scale, s )
+                       ! tri u at a(k,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_dlassq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_dlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                       call stdlib_dlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 1
+                          call stdlib_dlassq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                          ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_dlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_dlassq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                          ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( 0+( k+1 )*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k+1)
+                       call stdlib_dlassq( k, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_dlassq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_dlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_dlassq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_dlassq( k, a( lda ), lda+1, scale, s )
+                       ! tri l at a(0,1)
+                       call stdlib_dlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_dlansf = value
+           return
+     end function stdlib_dlansf
+
+     !> DLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A,  supplied in packed form.
+
+     real(dp) function stdlib_dlansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_dlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_dlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( ap( k )/=zero ) then
+                    absa = abs( ap( k ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlansp = value
+           return
+     end function stdlib_dlansp
+
+     !> DLANST:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure real(dp) function stdlib_dlanst( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum = abs( d( i ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_dlassq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_dlassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_dlanst = anorm
+           return
+     end function stdlib_dlanst
+
+     !> DLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A.
+
+     real(dp) function stdlib_dlansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_dlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_dlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_dlassq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlansy = value
+           return
+     end function stdlib_dlansy
+
+     !> DLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(dp) function stdlib_dlantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_dlassq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_dlassq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_dlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_dlassq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlantb = value
+           return
+     end function stdlib_dlantb
+
+     !> DLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(dp) function stdlib_dlantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_dlassq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_dlassq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_dlassq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_dlassq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlantp = value
+           return
+     end function stdlib_dlantp
+
+     !> DLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(dp) function stdlib_dlantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_dlassq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_dlassq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_dlassq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_dlassq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_dlantr = value
+           return
+     end function stdlib_dlantr
+
+     !> DLAORHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine DLAORHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_dlaorhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAORHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DLAORHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_dlaorhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_dlaorhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dlaorhr_col_getrfnp
+
+     !> DLAPY2: returns sqrt(x**2+y**2), taking care not to cause unnecessary
+     !> overflow and unnecessary underflow.
+
+     pure real(dp) function stdlib_dlapy2( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: x, y
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: w, xabs, yabs, z, hugeval
+           logical(lk) :: x_is_nan, y_is_nan
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           x_is_nan = stdlib_disnan( x )
+           y_is_nan = stdlib_disnan( y )
+           if ( x_is_nan ) stdlib_dlapy2 = x
+           if ( y_is_nan ) stdlib_dlapy2 = y
+           hugeval = stdlib_dlamch( 'OVERFLOW' )
+           if ( .not.( x_is_nan.or.y_is_nan ) ) then
+              xabs = abs( x )
+              yabs = abs( y )
+              w = max( xabs, yabs )
+              z = min( xabs, yabs )
+              if( z==zero .or. w>hugeval ) then
+                 stdlib_dlapy2 = w
+              else
+                 stdlib_dlapy2 = w*sqrt( one+( z / w )**2 )
+              end if
+           end if
+           return
+     end function stdlib_dlapy2
+
+     !> Given a 3-by-3 matrix pencil (A,B), DLAQZ1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (A - (beta2*sr2 - i*si)*B)*B^(-1)*(beta1*A - (sr2 + i*si2)*B)*B^(-1).
+     !> It is assumed that either
+     !> 1) sr1 = sr2
+     !> or
+     !> 2) si = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QZ algorithm.
+
+     pure subroutine stdlib_dlaqz1( a, lda, b, ldb, sr1, sr2, si, beta1, beta2,v )
+           ! arguments
+           integer(ilp), intent( in ) :: lda, ldb
+           real(dp), intent( in ) :: a( lda, * ), b( ldb, * ), sr1,sr2, si, beta1, beta2
+           real(dp), intent( out ) :: v( * )
+           
+           ! local scalars
+           real(dp) :: w(2), safmin, safmax, scale1, scale2
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           ! calculate first shifted vector
+           w( 1 ) = beta1*a( 1, 1 )-sr1*b( 1, 1 )
+           w( 2 ) = beta1*a( 2, 1 )-sr1*b( 2, 1 )
+           scale1 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale1 >= safmin .and. scale1 <= safmax ) then
+              w( 1 ) = w( 1 )/scale1
+              w( 2 ) = w( 2 )/scale1
+           end if
+           ! solve linear system
+           w( 2 ) = w( 2 )/b( 2, 2 )
+           w( 1 ) = ( w( 1 )-b( 1, 2 )*w( 2 ) )/b( 1, 1 )
+           scale2 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale2 >= safmin .and. scale2 <= safmax ) then
+              w( 1 ) = w( 1 )/scale2
+              w( 2 ) = w( 2 )/scale2
+           end if
+           ! apply second shift
+           v( 1 ) = beta2*( a( 1, 1 )*w( 1 )+a( 1, 2 )*w( 2 ) )-sr2*( b( 1,1 )*w( 1 )+b( 1, 2 )*w(&
+                      2 ) )
+           v( 2 ) = beta2*( a( 2, 1 )*w( 1 )+a( 2, 2 )*w( 2 ) )-sr2*( b( 2,1 )*w( 1 )+b( 2, 2 )*w(&
+                      2 ) )
+           v( 3 ) = beta2*( a( 3, 1 )*w( 1 )+a( 3, 2 )*w( 2 ) )-sr2*( b( 3,1 )*w( 1 )+b( 3, 2 )*w(&
+                      2 ) )
+           ! account for imaginary part
+           v( 1 ) = v( 1 )+si*si*b( 1, 1 )/scale1/scale2
+           ! check for overflow
+           if( abs( v( 1 ) )>safmax .or. abs( v( 2 ) ) > safmax .or.abs( v( 3 ) )>safmax .or. &
+           stdlib_disnan( v( 1 ) ) .or.stdlib_disnan( v( 2 ) ) .or. stdlib_disnan( v( 3 ) ) ) &
+                     then
+              v( 1 ) = zero
+              v( 2 ) = zero
+              v( 3 ) = zero
+           end if
+     end subroutine stdlib_dlaqz1
+
+     !> DLAQZ2: chases a 2x2 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_dlaqz2( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           ! local variables
+           real(dp) :: h(2,3), c1, s1, c2, s2, temp
+           if( k+2 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              h = b( ihi-1:ihi, ihi-2:ihi )
+              ! make h upper triangular
+              call stdlib_dlartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_drot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              call stdlib_dlartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_drot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_dlartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              call stdlib_drot( ihi-istartm+1, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_drot( ihi-istartm+1, b( istartm, ihi-1 ), 1, b( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              b( ihi-1, ihi-2 ) = zero
+              b( ihi, ihi-2 ) = zero
+              call stdlib_drot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_drot( ihi-istartm+1, a( istartm, ihi-1 ), 1, a( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              if ( ilz ) then
+                 call stdlib_drot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+                 call stdlib_drot( nz, z( 1, ihi-1-zstart+1 ), 1, z( 1,ihi-2-zstart+1 ), 1, c2, &
+                           s2 )
+              end if
+              call stdlib_dlartg( a( ihi-1, ihi-2 ), a( ihi, ihi-2 ), c1, s1,temp )
+              a( ihi-1, ihi-2 ) = temp
+              a( ihi, ihi-2 ) = zero
+              call stdlib_drot( istopm-ihi+2, a( ihi-1, ihi-1 ), lda, a( ihi,ihi-1 ), lda, c1, s1 &
+                        )
+              call stdlib_drot( istopm-ihi+2, b( ihi-1, ihi-1 ), ldb, b( ihi,ihi-1 ), ldb, c1, s1 &
+                        )
+              if ( ilq ) then
+                 call stdlib_drot( nq, q( 1, ihi-1-qstart+1 ), 1, q( 1, ihi-qstart+1 ), 1, c1, s1 &
+                           )
+              end if
+              call stdlib_dlartg( b( ihi, ihi ), b( ihi, ihi-1 ), c1, s1, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = zero
+              call stdlib_drot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, s1 )
+                        
+              call stdlib_drot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              if ( ilz ) then
+                 call stdlib_drot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+              end if
+           else
+              ! normal operation, move bulge down
+              h = b( k+1:k+2, k:k+2 )
+              ! make h upper triangular
+              call stdlib_dlartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_drot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              ! calculate z1 and z2
+              call stdlib_dlartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_drot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_dlartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              ! apply transformations from the right
+              call stdlib_drot( k+3-istartm+1, a( istartm, k+2 ), 1, a( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_drot( k+3-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c2, s2 )
+                        
+              call stdlib_drot( k+2-istartm+1, b( istartm, k+2 ), 1, b( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_drot( k+2-istartm+1, b( istartm, k+1 ), 1, b( istartm,k ), 1, c2, s2 )
+                        
+              if ( ilz ) then
+                 call stdlib_drot( nz, z( 1, k+2-zstart+1 ), 1, z( 1, k+1-zstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_drot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c2, s2 )
+                           
+              end if
+              b( k+1, k ) = zero
+              b( k+2, k ) = zero
+              ! calculate q1 and q2
+              call stdlib_dlartg( a( k+2, k ), a( k+3, k ), c1, s1, temp )
+              a( k+2, k ) = temp
+              a( k+3, k ) = zero
+              call stdlib_dlartg( a( k+1, k ), a( k+2, k ), c2, s2, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = zero
+              ! apply transformations from the left
+              call stdlib_drot( istopm-k, a( k+2, k+1 ), lda, a( k+3, k+1 ), lda,c1, s1 )
+              call stdlib_drot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda,c2, s2 )
+              call stdlib_drot( istopm-k, b( k+2, k+1 ), ldb, b( k+3, k+1 ), ldb,c1, s1 )
+              call stdlib_drot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb,c2, s2 )
+              if ( ilq ) then
+                 call stdlib_drot( nq, q( 1, k+2-qstart+1 ), 1, q( 1, k+3-qstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_drot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c2, s2 )
+                           
+              end if
+           end if
+     end subroutine stdlib_dlaqz2
+
+     !> DLAQZ4: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_dlaqz4( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, sr, &
+               si, ss, a, lda, b, ldb, q,ldq, z, ldz, qc, ldqc, zc, ldzc, work, lwork,info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_desired, ldqc, ldzc
+           real(dp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), qc( &
+                     ldqc, * ),zc( ldzc, * ), work( * ), sr( * ), si( * ),ss( * )
+           integer(ilp), intent( out ) :: info
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(dp) :: temp, v(3), c1, s1, c2, s2, swap
+           info = 0
+           if ( nblock_desired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_desired
+              return
+           else if ( lwork < n*nblock_desired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ4', -info )
+              return
+           end if
+           ! executable statements
+           if ( nshifts < 2 ) then
+              return
+           end if
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ! shuffle shifts into pairs of real shifts and pairs
+           ! of complex conjugate shifts assuming complex
+           ! conjugate shifts are already adjacent to one
+           ! another
+           do i = 1, nshifts-2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+                 swap = ss( i )
+                 ss( i ) = ss( i+1 )
+                 ss( i+1 ) = ss( i+2 )
+                 ss( i+2 ) = swap
+              end if
+           end do
+           ! nshfts is supposed to be even, but if it is odd,
+           ! then simply reduce it by one.  the shuffle above
+           ! ensures that the dropped shift is real and that
+           ! the remaining shifts are paired.
+           ns = nshifts-mod( nshifts, 2 )
+           npos = max( nblock_desired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_dlaset( 'FULL', ns+1, ns+1, zero, one, qc, ldqc )
+           call stdlib_dlaset( 'FULL', ns, ns, zero, one, zc, ldzc )
+           do i = 1, ns, 2
+              ! introduce the shift
+              call stdlib_dlaqz1( a( ilo, ilo ), lda, b( ilo, ilo ), ldb, sr( i ),sr( i+1 ), si( &
+                        i ), ss( i ), ss( i+1 ), v )
+              temp = v( 2 )
+              call stdlib_dlartg( temp, v( 3 ), c1, s1, v( 2 ) )
+              call stdlib_dlartg( v( 1 ), v( 2 ), c2, s2, temp )
+              call stdlib_drot( ns, a( ilo+1, ilo ), lda, a( ilo+2, ilo ), lda, c1,s1 )
+              call stdlib_drot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c2,s2 )
+              call stdlib_drot( ns, b( ilo+1, ilo ), ldb, b( ilo+2, ilo ), ldb, c1,s1 )
+              call stdlib_drot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c2,s2 )
+              call stdlib_drot( ns+1, qc( 1, 2 ), 1, qc( 1, 3 ), 1, c1, s1 )
+              call stdlib_drot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c2, s2 )
+              ! chase the shift down
+              do j = 1, ns-1-i
+                 call stdlib_dlaqz2( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ilo, ilo+ns &
+                        ), lda, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ilo, ilo+ns &
+                        ), ldb, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+              call stdlib_dgemm( 'N', 'N', n, sheight, sheight, one, q( 1, ilo ),ldq, qc, ldqc, &
+                        zero, work, n )
+              call stdlib_dlacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ilo ), lda, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ilo ), ldb, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_dgemm( 'N', 'N', n, swidth, swidth, one, z( 1, ilo ), ldz,zc, ldzc, &
+                        zero, work, n )
+              call stdlib_dlacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_dlaset( 'FULL', ns+np, ns+np, zero, one, qc, ldqc )
+              call stdlib_dlaset( 'FULL', ns+np, ns+np, zero, one, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -2
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j-1, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_dlaqz2( .true., .true., k+i+j-1, istartb, istopb,ihi, a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, zero, work,sheight )
+                 call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, zero, work,sheight )
+                 call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_dgemm( 'N', 'N', n, nblock, nblock, one, q( 1, k+1 ),ldq, qc, ldqc, &
+                           zero, work, n )
+                 call stdlib_dlacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one,a( istartm, k ), lda, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one,b( istartm, k ), ldb, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_dgemm( 'N', 'N', n, nblock, nblock, one, z( 1, k ),ldz, zc, ldzc, &
+                           zero, work, n )
+                 call stdlib_dlacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_dlaset( 'FULL', ns, ns, zero, one, qc, ldqc )
+           call stdlib_dlaset( 'FULL', ns+1, ns+1, zero, one, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns, 2
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i-1, ihi-2
+                 call stdlib_dlaqz2( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_dgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_dgemm( 'N', 'N', n, ns, ns, one, q( 1, ihi-ns+1 ), ldq,qc, ldqc, zero, &
+                        work, n )
+              call stdlib_dlacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ihi-ns ), lda,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_dgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ihi-ns ), ldb,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_dlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+              call stdlib_dgemm( 'N', 'N', n, ns+1, ns+1, one, z( 1, ihi-ns ), ldz,zc, ldzc, zero,&
+                         work, n )
+              call stdlib_dlacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_dlaqz4
+
+     !> DLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_dlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(dp), intent(in) :: gaptol, lambda, pivmin
+           real(dp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(dp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(dp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           eps = stdlib_dlamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_disnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_disnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_disnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = one
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + z( i )*z( i )
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + z( i )*z( i )
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_dlar1v
+
+     !> DLARFG: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, and x is an (n-1)-element real
+     !> vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+     !> Otherwise  1 <= tau <= 2.
+
+     pure subroutine stdlib_dlarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(dp), intent(inout) :: alpha
+           real(dp), intent(out) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(dp) :: beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=1 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_dnrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_dlapy2( alpha, xnorm ), alpha )
+              safmin = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'E' )
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 rsafmn = one / safmin
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_dscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alpha = alpha*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_dnrm2( n-1, x, incx )
+                 beta = -sign( stdlib_dlapy2( alpha, xnorm ), alpha )
+              end if
+              tau = ( beta-alpha ) / beta
+              call stdlib_dscal( n-1, one / ( alpha-beta ), x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_dlarfg
+
+     !> DLARFGP: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is non-negative, and x is
+     !> an (n-1)-element real vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+
+     subroutine stdlib_dlarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(dp), intent(inout) :: alpha
+           real(dp), intent(out) :: tau
+           ! Array Arguments 
+           real(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(dp) :: beta, bignum, savealpha, smlnum, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_dnrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  [+/-1, 0; i], sign chosen so alpha >= 0
+              if( alpha>=zero ) then
+                 ! when tau.eq.zero, the vector is special-cased to be
+                 ! all zeros in the application routines.  we do not need
+                 ! to clear it.
+                 tau = zero
+              else
+                 ! however, the application routines rely on explicit
+                 ! zero checks when tau.ne.zero, and we must clear x.
+                 tau = two
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = 0
+                 end do
+                 alpha = -alpha
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_dlapy2( alpha, xnorm ), alpha )
+              smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'E' )
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 bignum = one / smlnum
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_dscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alpha = alpha*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_dnrm2( n-1, x, incx )
+                 beta = sign( stdlib_dlapy2( alpha, xnorm ), alpha )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alpha = xnorm * (xnorm/alpha)
+                 tau = alpha / beta
+                 alpha = -alpha
+              end if
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero. this explains the next if statement.
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 if( savealpha>=zero ) then
+                    tau = zero
+                 else
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = 0
+                    end do
+                    beta = -savealpha
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_dscal( n-1, one / alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_dlarfgp
+
+     !> DLARNV: returns a vector of n random real numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_dlarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(dp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(dp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, il2, iv
+           ! Local Arrays 
+           real(dp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cos,log,min,sqrt
+           ! Executable Statements 
+           do 40 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              if( idist==3 ) then
+                 il2 = 2*il
+              else
+                 il2 = il
+              end if
+              ! call stdlib_dlaruv to generate il2 numbers from a uniform (0,1)
+              ! distribution (il2 <= lv)
+              call stdlib_dlaruv( iseed, il2, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = u( i )
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = two*u( i ) - one
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*cos( twopi*u( 2*i ) )
+                 end do
+              end if
+              40 continue
+           return
+     end subroutine stdlib_dlarnv
+
+     !> Given the relatively robust representation(RRR) L D L^T, DLARRB:
+     !> does "limited" bisection to refine the eigenvalues of L D L^T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses and their gaps are input in WERR
+     !> and WGAP, respectively. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_dlarrb( n, d, lld, ifirst, ilast, rtol1,rtol2, offset, w, wgap, werr, &
+               work, iwork,pivmin, spdiam, twist, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset, twist
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: pivmin, rtol1, rtol2, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: d(*), lld(*)
+           real(dp), intent(inout) :: w(*), werr(*), wgap(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ii, ip, iter, k, negcnt, next, nint, olnint, prev, r
+           real(dp) :: back, cvrgd, gap, left, lgap, mid, mnwdth, rgap, right, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           mnwdth = two * pivmin
+           r = twist
+           if((r<1).or.(r>n)) r = n
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           rgap = wgap( i1-offset )
+           loop_75: do i = i1, ilast
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              right = w( ii ) + werr( ii )
+              lgap = rgap
+              rgap = wgap( ii )
+              gap = min( lgap, rgap )
+              ! make sure that [left,right] contains the desired eigenvalue
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - left
+              ! do while( negcnt(left)>i-1 )
+              back = werr( ii )
+              20 continue
+              negcnt = stdlib_dlaneg( n, d, lld, left, pivmin, r )
+              if( negcnt>i-1 ) then
+                 left = left - back
+                 back = two*back
+                 go to 20
+              end if
+              ! do while( negcnt(right)<i )
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - right
+              back = werr( ii )
+              50 continue
+              negcnt = stdlib_dlaneg( n, d, lld, right, pivmin, r )
+               if( negcnt<i ) then
+                  right = right + back
+                  back = two*back
+                  go to 50
+               end if
+              width = half*abs( left - right )
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( width<=cvrgd .or. width<=mnwdth ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<ilast)) i1 = i + 1
+                 if((prev>=i1).and.(i<=ilast)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = negcnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do ip = 1, olnint
+              k = 2*i
+              ii = i - offset
+              rgap = wgap( ii )
+              lgap = rgap
+              if(ii>1) lgap = wgap( ii-1 )
+              gap = min( lgap, rgap )
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( ( width<=cvrgd ) .or. ( width<=mnwdth ).or.( iter==maxitr ) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              negcnt = stdlib_dlaneg( n, d, lld, mid, pivmin, r )
+              if( negcnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = ifirst, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           do i = ifirst+1, ilast
+              k = 2*i
+              ii = i - offset
+              wgap( ii-1 ) = max( zero,w(ii) - werr (ii) - w( ii-1 ) - werr( ii-1 ))
+           end do
+           return
+     end subroutine stdlib_dlarrb
+
+     !> Given the initial representation L D L^T and its cluster of close
+     !> eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ...
+     !> W( CLEND ), DLARRF: finds a new relatively robust representation
+     !> L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the
+     !> eigenvalues of L(+) D(+) L(+)^T is relatively isolated.
+
+     pure subroutine stdlib_dlarrf( n, d, l, ld, clstrt, clend,w, wgap, werr,spdiam, clgapl, &
+               clgapr, pivmin, sigma,dplus, lplus, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: clstrt, clend, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: clgapl, clgapr, pivmin, spdiam
+           real(dp), intent(out) :: sigma
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), l(*), ld(*), w(*), werr(*)
+           real(dp), intent(out) :: dplus(*), lplus(*), work(*)
+           real(dp), intent(inout) :: wgap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: quart = 0.25_dp
+           real(dp), parameter :: maxgrowth1 = 8._dp
+           real(dp), parameter :: maxgrowth2 = 8._dp
+           integer(ilp), parameter :: ktrymax = 1
+           integer(ilp), parameter :: sleft = 1
+           integer(ilp), parameter :: sright = 2
+           
+           ! Local Scalars 
+           logical(lk) :: dorrr1, forcer, nofail, sawnan1, sawnan2, tryrrr1
+           integer(ilp) :: i, indx, ktry,    shift
+           real(dp) :: avgap, bestshift, clwdth, eps, fact, fail, fail2, growthbound, ldelta, &
+           ldmax, lsigma, max1, max2, mingap, oldp, prod, rdelta, rdmax, rrr1, rrr2, rsigma, s, &
+                     smlgrowth, tmp, znm2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           fact = real(2**ktrymax,KIND=dp)
+           eps = stdlib_dlamch( 'PRECISION' )
+           shift = 0
+           forcer = .false.
+           ! note that we cannot guarantee that for any of the shifts tried,
+           ! the factorization has a small or even moderate element growth.
+           ! there could be ritz values at both ends of the cluster and despite
+           ! backing off, there are examples where all factorizations tried
+           ! (in ieee mode, allowing zero pivots
+           ! element growth.
+           ! for this reason, we should use pivmin in this subroutine so that at
+           ! least the l d l^t factorization exists. it can be checked afterwards
+           ! whether the element growth caused bad residuals/orthogonality.
+           ! decide whether the code should accept the best among all
+           ! representations despite large element growth or signal info=1
+           ! setting nofail to .false. for quick fix for bug 113
+           nofail = .false.
+           ! compute the average gap length of the cluster
+           clwdth = abs(w(clend)-w(clstrt)) + werr(clend) + werr(clstrt)
+           avgap = clwdth / real(clend-clstrt,KIND=dp)
+           mingap = min(clgapl, clgapr)
+           ! initial values for shifts to both ends of cluster
+           lsigma = min(w( clstrt ),w( clend )) - werr( clstrt )
+           rsigma = max(w( clstrt ),w( clend )) + werr( clend )
+           ! use a small fudge to make sure that we really shift to the outside
+           lsigma = lsigma - abs(lsigma)* four * eps
+           rsigma = rsigma + abs(rsigma)* four * eps
+           ! compute upper bounds for how much to back off the initial shifts
+           ldmax = quart * mingap + two * pivmin
+           rdmax = quart * mingap + two * pivmin
+           ldelta = max(avgap,wgap( clstrt ))/fact
+           rdelta = max(avgap,wgap( clend-1 ))/fact
+           ! initialize the record of the best representation found
+           s = stdlib_dlamch( 'S' )
+           smlgrowth = one / s
+           fail = real(n-1,KIND=dp)*mingap/(spdiam*eps)
+           fail2 = real(n-1,KIND=dp)*mingap/(spdiam*sqrt(eps))
+           bestshift = lsigma
+           ! while (ktry <= ktrymax)
+           ktry = 0
+           growthbound = maxgrowth1*spdiam
+           5 continue
+           sawnan1 = .false.
+           sawnan2 = .false.
+           ! ensure that we do not back off too much of the initial shifts
+           ldelta = min(ldmax,ldelta)
+           rdelta = min(rdmax,rdelta)
+           ! compute the element growth when shifting to both ends of the cluster
+           ! accept the shift if there is no element growth at one of the two ends
+           ! left end
+           s = -lsigma
+           dplus( 1 ) = d( 1 ) + s
+           if(abs(dplus(1))<pivmin) then
+              dplus(1) = -pivmin
+              ! need to set sawnan1 because refined rrr test should not be used
+              ! in this case
+              sawnan1 = .true.
+           endif
+           max1 = abs( dplus( 1 ) )
+           do i = 1, n - 1
+              lplus( i ) = ld( i ) / dplus( i )
+              s = s*lplus( i )*l( i ) - lsigma
+              dplus( i+1 ) = d( i+1 ) + s
+              if(abs(dplus(i+1))<pivmin) then
+                 dplus(i+1) = -pivmin
+                 ! need to set sawnan1 because refined rrr test should not be used
+                 ! in this case
+                 sawnan1 = .true.
+              endif
+              max1 = max( max1,abs(dplus(i+1)) )
+           end do
+           sawnan1 = sawnan1 .or.  stdlib_disnan( max1 )
+           if( forcer .or.(max1<=growthbound .and. .not.sawnan1 ) ) then
+              sigma = lsigma
+              shift = sleft
+              goto 100
+           endif
+           ! right end
+           s = -rsigma
+           work( 1 ) = d( 1 ) + s
+           if(abs(work(1))<pivmin) then
+              work(1) = -pivmin
+              ! need to set sawnan2 because refined rrr test should not be used
+              ! in this case
+              sawnan2 = .true.
+           endif
+           max2 = abs( work( 1 ) )
+           do i = 1, n - 1
+              work( n+i ) = ld( i ) / work( i )
+              s = s*work( n+i )*l( i ) - rsigma
+              work( i+1 ) = d( i+1 ) + s
+              if(abs(work(i+1))<pivmin) then
+                 work(i+1) = -pivmin
+                 ! need to set sawnan2 because refined rrr test should not be used
+                 ! in this case
+                 sawnan2 = .true.
+              endif
+              max2 = max( max2,abs(work(i+1)) )
+           end do
+           sawnan2 = sawnan2 .or.  stdlib_disnan( max2 )
+           if( forcer .or.(max2<=growthbound .and. .not.sawnan2 ) ) then
+              sigma = rsigma
+              shift = sright
+              goto 100
+           endif
+           ! if we are at this point, both shifts led to too much element growth
+           ! record the better of the two shifts (provided it didn't lead to nan)
+           if(sawnan1.and.sawnan2) then
+              ! both max1 and max2 are nan
+              goto 50
+           else
+              if( .not.sawnan1 ) then
+                 indx = 1
+                 if(max1<=smlgrowth) then
+                    smlgrowth = max1
+                    bestshift = lsigma
+                 endif
+              endif
+              if( .not.sawnan2 ) then
+                 if(sawnan1 .or. max2<=max1) indx = 2
+                 if(max2<=smlgrowth) then
+                    smlgrowth = max2
+                    bestshift = rsigma
+                 endif
+              endif
+           endif
+           ! if we are here, both the left and the right shift led to
+           ! element growth. if the element growth is moderate, then
+           ! we may still accept the representation, if it passes a
+           ! refined test for rrr. this test supposes that no nan occurred.
+           ! moreover, we use the refined rrr test only for isolated clusters.
+           if((clwdth<mingap/real(128,KIND=dp)) .and.(min(max1,max2)<fail2).and.(.not.sawnan1)&
+                     .and.(.not.sawnan2)) then
+              dorrr1 = .true.
+           else
+              dorrr1 = .false.
+           endif
+           tryrrr1 = .true.
+           if( tryrrr1 .and. dorrr1 ) then
+           if(indx==1) then
+              tmp = abs( dplus( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod =((dplus(i+1)*work(n+i+1))/(dplus(i)*work(n+i)))*oldp
+                 else
+                    prod = prod*abs(work(n+i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( dplus( i ) * prod ))
+              end do
+              rrr1 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr1<=maxgrowth2) then
+                 sigma = lsigma
+                 shift = sleft
+                 goto 100
+              endif
+           else if(indx==2) then
+              tmp = abs( work( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod = ((work(i+1)*lplus(i+1))/(work(i)*lplus(i)))*oldp
+                 else
+                    prod = prod*abs(lplus(i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( work( i ) * prod ))
+              end do
+              rrr2 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr2<=maxgrowth2) then
+                 sigma = rsigma
+                 shift = sright
+                 goto 100
+              endif
+           end if
+           endif
+           50 continue
+           if (ktry<ktrymax) then
+              ! if we are here, both shifts failed also the rrr test.
+              ! back off to the outside
+              lsigma = max( lsigma - ldelta,lsigma - ldmax)
+              rsigma = min( rsigma + rdelta,rsigma + rdmax )
+              ldelta = two * ldelta
+              rdelta = two * rdelta
+              ktry = ktry + 1
+              goto 5
+           else
+              ! none of the representations investigated satisfied our
+              ! criteria. take the best one we found.
+              if((smlgrowth<fail).or.nofail) then
+                 lsigma = bestshift
+                 rsigma = bestshift
+                 forcer = .true.
+                 goto 5
+              else
+                 info = 1
+                 return
+              endif
+           end if
+           100 continue
+           if (shift==sleft) then
+           elseif (shift==sright) then
+              ! store new l and d back into dplus, lplus
+              call stdlib_dcopy( n, work, 1, dplus, 1 )
+              call stdlib_dcopy( n-1, work(n+1), 1, lplus, 1 )
+           endif
+           return
+     end subroutine stdlib_dlarrf
+
+     !> DLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by DLARRE.
+
+     pure subroutine stdlib_dlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: minrgp, pivmin, vl, vu
+           real(dp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(dp), intent(in) :: gers(*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           real(dp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indwrk= 3*n+1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_dlaset( 'FULL', n, zusedw, zero, zero,z(1,zusedl), ldz )
+           eps = stdlib_dlamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = one
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_dcopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       call stdlib_dcopy( in, z( ibegin, j ), 1, d( ibegin ), 1 )
+                       call stdlib_dcopy( in-1, z( ibegin, j+1 ), 1, l( ibegin ),1 )
+                       sigma = z( iend, j+1 )
+                       ! set the corresponding entries in z to zero
+                       call stdlib_dlaset( 'FULL', in, 2, zero, zero,z( ibegin, j), ldz )
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_dlarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_dlarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_dlarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_dlarrf needs lwork = 2*n
+                          call stdlib_dlarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,z(ibegin, newftt),z(ibegin, newftt+1),work( indwrk ), &
+                                    iinfo )
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_dlarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = ssigma
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=dp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_dlarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_dlar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_dlar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_dscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_dlarrv
+
+     !> DLASCL: multiplies the M by N real matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_dlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(dp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(dp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_disnan(cfrom) ) then
+              info = -4
+           else if( stdlib_disnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower half of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper half of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_dlascl
+
+     !> This subroutine computes the square root of the I-th updated
+     !> eigenvalue of a positive symmetric rank-one modification to
+     !> a positive diagonal matrix whose entries are given as the squares
+     !> of the corresponding entries in the array d, and that
+     !> 0 <= D(i) < D(j)  for  i < j
+     !> and that RHO > 0. This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D ) * diag( D ) +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_dlasd4( n, i, d, z, delta, rho, sigma, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: rho
+           real(dp), intent(out) :: sigma
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), z(*)
+           real(dp), intent(out) :: delta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 400
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3, geomavg
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(dp) :: a, b, c, delsq, delsq2, sq2, dphi, dpsi, dtiim, dtiip, dtipsq, dtisq, &
+           dtnsq, dtnsq1, dw, eps, erretm, eta, phi, prew, psi, rhoinv, sglb, sgub, tau, tau2, &
+                     temp, temp1, temp2, w
+           ! Local Arrays 
+           real(dp) :: dd(3), zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+              ! presumably, i=1 upon entry
+              sigma = sqrt( d( 1 )*d( 1 )+rho*z( 1 )*z( 1 ) )
+              delta( 1 ) = one
+              work( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_dlasd5( i, d, z, delta, rho, sigma, work )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_dlamch( 'EPSILON' )
+           rhoinv = one / rho
+           tau2= zero
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              temp = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              temp1 = temp / ( d( n )+sqrt( d( n )*d( n )+temp ) )
+              do j = 1, n
+                 work( j ) = d( j ) + d( n ) + temp1
+                 delta( j ) = ( d( j )-d( n ) ) - temp1
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / ( delta( j )*work( j ) )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / ( delta( ii )*work( ii ) ) +z( n )*z( n ) / ( delta( n )&
+                        *work( n ) )
+              if( w<=zero ) then
+                 temp1 = sqrt( d( n )*d( n )+rho )
+                 temp = z( n-1 )*z( n-1 ) / ( ( d( n-1 )+temp1 )*( d( n )-d( n-1 )+rho / ( d( n )+&
+                           temp1 ) ) ) +z( n )*z( n ) / rho
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                    a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*delsq
+                    if( a<zero ) then
+                       tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                    tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 end if
+                 ! it can be proved that
+                     ! d(n)^2+rho/2 <= sigma_n^2 < d(n)^2+tau2 <= d(n)^2+rho
+              else
+                 delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                 a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*delsq
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( a<zero ) then
+                    tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 ! it can be proved that
+                 ! d(n)^2 < d(n)^2+tau2 < sigma(n)^2 < d(n)^2+rho/2
+              end if
+              ! the following tau is to approximate sigma_n - d( n )
+               ! tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+              sigma = d( n ) + tau
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( n ) ) - tau
+                 work( j ) = d( j ) + d( n ) + tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( delta( j )*work( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / ( delta( n )*work( n ) )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              dtnsq1 = work( n-1 )*delta( n-1 )
+              dtnsq = work( n )*delta( n )
+              c = w - dtnsq1*dpsi - dtnsq*dphi
+              a = ( dtnsq+dtnsq1 )*w - dtnsq*dtnsq1*( dpsi+dphi )
+              b = dtnsq*dtnsq1*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                 eta = rho - sigma*sigma
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = eta - dtnsq
+              if( temp>rho )eta = rho + dtnsq
+              eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+                 work( j ) = work( j ) + eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              tau2 = work( n )*delta( n )
+              temp = z( n ) / tau2
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    go to 240
+                 end if
+                 ! calculate the new step
+                 dtnsq1 = work( n-1 )*delta( n-1 )
+                 dtnsq = work( n )*delta( n )
+                 c = w - dtnsq1*dpsi - dtnsq*dphi
+                 a = ( dtnsq+dtnsq1 )*w - dtnsq1*dtnsq*( dpsi+dphi )
+                 b = dtnsq1*dtnsq*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = eta - dtnsq
+                 if( temp<=zero )eta = eta / two
+                 eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                    work( j ) = work( j ) + eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 tau2 = work( n )*delta( n )
+                 temp = z( n ) / tau2
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $             + abs( tau2 )*( dpsi+dphi )
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              go to 240
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              delsq = ( d( ip1 )-d( i ) )*( d( ip1 )+d( i ) )
+              delsq2 = delsq / two
+              sq2=sqrt( ( d( i )*d( i )+d( ip1 )*d( ip1 ) ) / two )
+              temp = delsq2 / ( d( i )+sq2 )
+              do j = 1, n
+                 work( j ) = d( j ) + d( i ) + temp
+                 delta( j ) = ( d( j )-d( i ) ) - temp
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / ( work( i )*delta( i ) ) +z( ip1 )*z( ip1 ) / ( work( ip1 )&
+                        *delta( ip1 ) )
+              geomavg = .false.
+              if( w>zero ) then
+                 ! d(i)^2 < the ith sigma^2 < (d(i)^2+d(i+1)^2)/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 ii = i
+                 sglb = zero
+                 sgub = delsq2  / ( d( i )+sq2 )
+                 a = c*delsq + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*delsq
+                 if( a>zero ) then
+                    tau2 = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau2 = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( i )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( i ).
+                 tau = tau2 / ( d( i )+sqrt( d( i )*d( i )+tau2 ) )
+                 temp = sqrt(eps)
+                 if( (d(i)<=temp*d(ip1)).and.(abs(z(i))<=temp).and.(d(i)>zero) ) then
+                    tau = min( ten*d(i), sgub )
+                    geomavg = .true.
+                 end if
+              else
+                 ! (d(i)^2+d(i+1)^2)/2 <= the ith sigma^2 < d(i+1)^2/2
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 ii = ip1
+                 sglb = -delsq2  / ( d( ii )+sq2 )
+                 sgub = zero
+                 a = c*delsq - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*delsq
+                 if( a<zero ) then
+                    tau2 = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau2 = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( ip1 )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( ip1 ).
+                 tau = tau2 / ( d( ip1 )+sqrt( abs( d( ip1 )*d( ip1 )+tau2 ) ) )
+              end if
+              sigma = d( ii ) + tau
+              do j = 1, n
+                 work( j ) = d( j ) + d( ii ) + tau
+                 delta( j ) = ( d( j )-d( ii ) ) - tau
+              end do
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / ( work( ii )*delta( ii ) )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              if( w<=zero ) then
+                 sglb = max( sglb, tau )
+              else
+                 sgub = min( sgub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 dtipsq = work( ip1 )*delta( ip1 )
+                 dtisq = work( i )*delta( i )
+                 if( orgati ) then
+                    c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                 else
+                    c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                 end if
+                 a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                 b = dtipsq*dtisq*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 dtiim = work( iim1 )*delta( iim1 )
+                 dtiip = work( iip1 )*delta( iip1 )
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / dtiim
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiip*( dpsi+dphi ) ) -( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    if( dpsi<temp1 ) then
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                    end if
+                 else
+                    temp1 = z( iip1 ) / dtiip
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiim*( dpsi+dphi ) ) -( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    if( dphi<temp1 ) then
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                    else
+                       zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                    end if
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 dd( 1 ) = dtiim
+                 dd( 2 ) = delta( ii )*work( ii )
+                 dd( 3 ) = dtiip
+                 call stdlib_dlaed6( niter, orgati, c, dd, zz, w, eta, info )
+                 if( info/=0 ) then
+                    ! if info is not 0, i.e., stdlib_dlaed6 failed, switch back
+                    ! to 2 pole interpolation.
+                    swtch3 = .false.
+                    info = 0
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( orgati ) then
+                       c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                    else
+                       c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( orgati ) then
+                             a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                          else
+                             a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi)
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 end if
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+              temp = tau + eta
+              if( temp>sgub .or. temp<sglb ) then
+                 if( w<zero ) then
+                    eta = ( sgub-tau ) / two
+                 else
+                    eta = ( sglb-tau ) / two
+                 end if
+                 if( geomavg ) then
+                    if( w < zero ) then
+                       if( tau > zero ) then
+                          eta = sqrt(sgub*tau)-tau
+                       end if
+                    else
+                       if( sglb > zero ) then
+                          eta = sqrt(sglb*tau)-tau
+                       end if
+                    end if
+                 end if
+              end if
+              prew = w
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 work( j ) = work( j ) + eta
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              tau2 = work( ii )*delta( ii )
+              temp = z( ii ) / tau2
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              ! main loop to update the values of the array   delta and work
+              iter = niter + 1
+              loop_230: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+           ! $          .or. (sgub-sglb)<=eight*abs(sgub+sglb) ) then
+                    go to 240
+                 end if
+                 if( w<=zero ) then
+                    sglb = max( sglb, tau )
+                 else
+                    sgub = min( sgub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                       else
+                          c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                       end if
+                    else
+                       temp = z( ii ) / ( work( ii )*delta( ii ) )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - dtisq*dpsi - dtipsq*dphi
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                             else
+                                a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                             end if
+                          else
+                             a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    dtiim = work( iim1 )*delta( iim1 )
+                    dtiip = work( iip1 )*delta( iip1 )
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - dtiim*dpsi - dtiip*dphi
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / dtiim
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiip*( dpsi+dphi ) - temp2
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          if( dpsi<temp1 ) then
+                             zz( 3 ) = dtiip*dtiip*dphi
+                          else
+                             zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                          end if
+                       else
+                          temp1 = z( iip1 ) / dtiip
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiim*( dpsi+dphi ) - temp2
+                          if( dphi<temp1 ) then
+                             zz( 1 ) = dtiim*dtiim*dpsi
+                          else
+                             zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                          end if
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    dd( 1 ) = dtiim
+                    dd( 2 ) = delta( ii )*work( ii )
+                    dd( 3 ) = dtiip
+                    call stdlib_dlaed6( niter, orgati, c, dd, zz, w, eta, info )
+                    if( info/=0 ) then
+                       ! if info is not 0, i.e., stdlib_dlaed6 failed, switch
+                       ! back to two pole interpolation
+                       swtch3 = .false.
+                       info = 0
+                       dtipsq = work( ip1 )*delta( ip1 )
+                       dtisq = work( i )*delta( i )
+                       if( .not.swtch ) then
+                          if( orgati ) then
+                             c = w - dtipsq*dw + delsq*( z( i )/dtisq )**2
+                          else
+                             c = w - dtisq*dw - delsq*( z( ip1 )/dtipsq )**2
+                          end if
+                       else
+                          temp = z( ii ) / ( work( ii )*delta( ii ) )
+                          if( orgati ) then
+                             dpsi = dpsi + temp*temp
+                          else
+                             dphi = dphi + temp*temp
+                          end if
+                          c = w - dtisq*dpsi - dtipsq*dphi
+                       end if
+                       a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                       b = dtipsq*dtisq*w
+                       if( c==zero ) then
+                          if( a==zero ) then
+                             if( .not.swtch ) then
+                                if( orgati ) then
+                                   a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                                else
+                                   a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                                end if
+                             else
+                                a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                             end if
+                          end if
+                          eta = b / a
+                       else if( a<=zero ) then
+                          eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                       else
+                          eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                       end if
+                    end if
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+                 temp=tau+eta
+                 if( temp>sgub .or. temp<sglb ) then
+                    if( w<zero ) then
+                       eta = ( sgub-tau ) / two
+                    else
+                       eta = ( sglb-tau ) / two
+                    end if
+                    if( geomavg ) then
+                       if( w < zero ) then
+                          if( tau > zero ) then
+                             eta = sqrt(sgub*tau)-tau
+                          end if
+                       else
+                          if( sglb > zero ) then
+                             eta = sqrt(sglb*tau)-tau
+                          end if
+                       end if
+                    end if
+                 end if
+                 prew = w
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    work( j ) = work( j ) + eta
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 tau2 = work( ii )*delta( ii )
+                 temp = z( ii ) / tau2
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $             + abs( tau2 )*dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_230
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+           end if
+           240 continue
+           return
+     end subroutine stdlib_dlasd4
+
+     !> DLASD7: merges the two sets of singular values together into a single
+     !> sorted set. Then it tries to deflate the size of the problem. There
+     !> are two ways in which deflation can occur:  when two or more singular
+     !> values are close together or if there is a tiny entry in the Z
+     !> vector. For each such occurrence the order of the related
+     !> secular equation problem is reduced by one.
+     !> DLASD7 is called from DLASD6.
+
+     pure subroutine stdlib_dlasd7( icompq, nl, nr, sqre, k, d, z, zw, vf, vfw, vl,vlw, alpha, &
+     beta, dsigma, idx, idxp, idxq,perm, givptr, givcol, ldgcol, givnum, ldgnum,c, s, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(dp), intent(in) :: alpha, beta
+           real(dp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), idx(*), idxp(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(dp), intent(inout) :: d(*), vf(*), vl(*)
+           real(dp), intent(out) :: dsigma(*), givnum(ldgnum,*), vfw(*), vlw(*), z(*), zw(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(dp) :: eps, hlftol, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -22
+           else if( ldgnum<n ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD7', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( icompq==1 ) then
+              givptr = 0
+           end if
+           ! generate the first part of the vector z and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vl( nlp1 )
+           vl( nlp1 ) = zero
+           tau = vf( nlp1 )
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vl( i )
+              vl( i ) = zero
+              vf( i+1 ) = vf( i )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           vf( 1 ) = tau
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vf( i )
+              vf( i ) = zero
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and zw are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              zw( i ) = z( idxq( i ) )
+              vfw( i ) = vf( idxq( i ) )
+              vlw( i ) = vl( idxq( i ) )
+           end do
+           call stdlib_dlamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = zw( idxi )
+              vf( i ) = vfw( idxi )
+              vl( i ) = vlw( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_dlamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jprev = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = jprev
+           80 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_dlapy2( c, s )
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 c = c / tau
+                 s = -s / tau
+                 ! record the appropriate givens rotation
+                 if( icompq==1 ) then
+                    givptr = givptr + 1
+                    idxjp = idxq( idx( jprev )+1 )
+                    idxj = idxq( idx( j )+1 )
+                    if( idxjp<=nlp1 ) then
+                       idxjp = idxjp - 1
+                    end if
+                    if( idxj<=nlp1 ) then
+                       idxj = idxj - 1
+                    end if
+                    givcol( givptr, 2 ) = idxjp
+                    givcol( givptr, 1 ) = idxj
+                    givnum( givptr, 2 ) = c
+                    givnum( givptr, 1 ) = s
+                 end if
+                 call stdlib_drot( 1, vf( jprev ), 1, vf( j ), 1, c, s )
+                 call stdlib_drot( 1, vl( jprev ), 1, vl( j ), 1, c, s )
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 zw( k ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 80
+           90 continue
+           ! record the last singular value.
+           k = k + 1
+           zw( k ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           100 continue
+           ! sort the singular values into dsigma. the singular values which
+           ! were not deflated go into the first k slots of dsigma, except
+           ! that dsigma(1) is treated separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              vfw( j ) = vf( jp )
+              vlw( j ) = vl( jp )
+           end do
+           if( icompq==1 ) then
+              do j = 2, n
+                 jp = idxp( j )
+                 perm( j ) = idxq( idx( jp )+1 )
+                 if( perm( j )<=nlp1 ) then
+                    perm( j ) = perm( j ) - 1
+                 end if
+              end do
+           end if
+           ! the deflated singular values go back into the last n - k slots of
+           ! d.
+           call stdlib_dcopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+           ! determine dsigma(1), dsigma(2), z(1), vf(1), vl(1), vf(m), and
+           ! vl(m).
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_dlapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = -z( m ) / z( 1 )
+              end if
+              call stdlib_drot( 1, vf( m ), 1, vf( 1 ), 1, c, s )
+              call stdlib_drot( 1, vl( m ), 1, vl( 1 ), 1, c, s )
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! restore z, vf, and vl.
+           call stdlib_dcopy( k-1, zw( 2 ), 1, z( 2 ), 1 )
+           call stdlib_dcopy( n-1, vfw( 2 ), 1, vf( 2 ), 1 )
+           call stdlib_dcopy( n-1, vlw( 2 ), 1, vl( 2 ), 1 )
+           return
+     end subroutine stdlib_dlasd7
+
+     !> DLASD8: finds the square roots of the roots of the secular equation,
+     !> as defined by the values in DSIGMA and Z. It makes the appropriate
+     !> calls to DLASD4, and stores, for each  element in D, the distance
+     !> to its two nearest poles (elements in DSIGMA). It also updates
+     !> the arrays VF and VL, the first and last components of all the
+     !> right singular vectors of the original bidiagonal matrix.
+     !> DLASD8 is called from DLASD6.
+
+     pure subroutine stdlib_dlasd8( icompq, k, d, z, vf, vl, difl, difr, lddifr,dsigma, work, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, k, lddifr
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), difl(*), difr(lddifr,*), work(*)
+           real(dp), intent(inout) :: dsigma(*), vf(*), vl(*), z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iwk1, iwk2, iwk2i, iwk3, iwk3i, j
+           real(dp) :: diflj, difrj, dj, dsigj, dsigjp, rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( k<1 ) then
+              info = -2
+           else if( lddifr<k ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD8', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              difl( 1 ) = d( 1 )
+              if( icompq==1 ) then
+                 difl( 2 ) = one
+                 difr( 1, 2 ) = one
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_dlamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! book keeping.
+           iwk1 = 1
+           iwk2 = iwk1 + k
+           iwk3 = iwk2 + k
+           iwk2i = iwk2 - 1
+           iwk3i = iwk3 - 1
+           ! normalize z.
+           rho = stdlib_dnrm2( k, z, 1 )
+           call stdlib_dlascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! initialize work(iwk3).
+           call stdlib_dlaset( 'A', k, 1, one, one, work( iwk3 ), k )
+           ! compute the updated singular values, the arrays difl, difr,
+           ! and the updated z.
+           do j = 1, k
+              call stdlib_dlasd4( k, j, dsigma, z, work( iwk1 ), rho, d( j ),work( iwk2 ), info )
+                        
+              ! if the root finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+              work( iwk3i+j ) = work( iwk3i+j )*work( j )*work( iwk2i+j )
+              difl( j ) = -work( j )
+              difr( j, 1 ) = -work( j+1 )
+              do i = 1, j - 1
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+              do i = j + 1, k
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = sign( sqrt( abs( work( iwk3i+i ) ) ), z( i ) )
+           end do
+           ! update vf and vl.
+           do j = 1, k
+              diflj = difl( j )
+              dj = d( j )
+              dsigj = -dsigma( j )
+              if( j<k ) then
+                 difrj = -difr( j, 1 )
+                 dsigjp = -dsigma( j+1 )
+              end if
+              work( j ) = -z( j ) / diflj / ( dsigma( j )+dj )
+              do i = 1, j - 1
+                 work( i ) = z( i ) / ( stdlib_dlamc3( dsigma( i ), dsigj )-diflj )/ ( dsigma( i )&
+                           +dj )
+              end do
+              do i = j + 1, k
+                 work( i ) = z( i ) / ( stdlib_dlamc3( dsigma( i ), dsigjp )+difrj )/ ( dsigma( i &
+                           )+dj )
+              end do
+              temp = stdlib_dnrm2( k, work, 1 )
+              work( iwk2i+j ) = stdlib_ddot( k, work, 1, vf, 1 ) / temp
+              work( iwk3i+j ) = stdlib_ddot( k, work, 1, vl, 1 ) / temp
+              if( icompq==1 ) then
+                 difr( j, 2 ) = temp
+              end if
+           end do
+           call stdlib_dcopy( k, work( iwk2 ), 1, vf, 1 )
+           call stdlib_dcopy( k, work( iwk3 ), 1, vl, 1 )
+           return
+     end subroutine stdlib_dlasd8
+
+     !> DLASQ3: checks for deflation, computes a shift (TAU) and calls dqds.
+     !> In case of failure it changes shifts, and tries again until output
+     !> is positive.
+
+     pure subroutine stdlib_dlasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+               ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0
+           integer(ilp), intent(inout) :: iter, n0, ndiv, nfail, pp
+           real(dp), intent(inout) :: desig, dmin1, dmin2, dn, dn1, dn2, g, qmax, tau
+           real(dp), intent(out) :: dmin, sigma
+           ! Array Arguments 
+           real(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: cbias = 1.50_dp
+           real(dp), parameter :: qurtr = 0.250_dp
+           real(dp), parameter :: hundrd = 100.0_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ipn4, j4, n0in, nn
+           integer(ilp), intent(inout) :: ttype
+           real(dp) :: eps, s, t, temp, tol, tol2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           n0in = n0
+           eps = stdlib_dlamch( 'PRECISION' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           ! check for deflation.
+           10 continue
+           if( n0<i0 )return
+           if( n0==i0 )go to 20
+           nn = 4*n0 + pp
+           if( n0==( i0+1 ) )go to 40
+           ! check whether e(n0-1) is negligible, 1 eigenvalue.
+           if( z( nn-5 )>tol2*( sigma+z( nn-3 ) ) .and.z( nn-2*pp-4 )>tol2*z( nn-7 ) )go to &
+                     30
+           20 continue
+           z( 4*n0-3 ) = z( 4*n0+pp-3 ) + sigma
+           n0 = n0 - 1
+           go to 10
+           ! check  whether e(n0-2) is negligible, 2 eigenvalues.
+           30 continue
+           if( z( nn-9 )>tol2*sigma .and.z( nn-2*pp-8 )>tol2*z( nn-11 ) )go to 50
+           40 continue
+           if( z( nn-3 )>z( nn-7 ) ) then
+              s = z( nn-3 )
+              z( nn-3 ) = z( nn-7 )
+              z( nn-7 ) = s
+           end if
+           t = half*( ( z( nn-7 )-z( nn-3 ) )+z( nn-5 ) )
+           if( z( nn-5 )>z( nn-3 )*tol2.and.t/=zero ) then
+              s = z( nn-3 )*( z( nn-5 ) / t )
+              if( s<=t ) then
+                 s = z( nn-3 )*( z( nn-5 ) /( t*( one+sqrt( one+s / t ) ) ) )
+              else
+                 s = z( nn-3 )*( z( nn-5 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+              end if
+              t = z( nn-7 ) + ( s+z( nn-5 ) )
+              z( nn-3 ) = z( nn-3 )*( z( nn-7 ) / t )
+              z( nn-7 ) = t
+           end if
+           z( 4*n0-7 ) = z( nn-7 ) + sigma
+           z( 4*n0-3 ) = z( nn-3 ) + sigma
+           n0 = n0 - 2
+           go to 10
+           50 continue
+           if( pp==2 )pp = 0
+           ! reverse the qd-array, if warranted.
+           if( dmin<=zero .or. n0<n0in ) then
+              if( cbias*z( 4*i0+pp-3 )<z( 4*n0+pp-3 ) ) then
+                 ipn4 = 4*( i0+n0 )
+                 do j4 = 4*i0, 2*( i0+n0-1 ), 4
+                    temp = z( j4-3 )
+                    z( j4-3 ) = z( ipn4-j4-3 )
+                    z( ipn4-j4-3 ) = temp
+                    temp = z( j4-2 )
+                    z( j4-2 ) = z( ipn4-j4-2 )
+                    z( ipn4-j4-2 ) = temp
+                    temp = z( j4-1 )
+                    z( j4-1 ) = z( ipn4-j4-5 )
+                    z( ipn4-j4-5 ) = temp
+                    temp = z( j4 )
+                    z( j4 ) = z( ipn4-j4-4 )
+                    z( ipn4-j4-4 ) = temp
+                 end do
+                 if( n0-i0<=4 ) then
+                    z( 4*n0+pp-1 ) = z( 4*i0+pp-1 )
+                    z( 4*n0-pp ) = z( 4*i0-pp )
+                 end if
+                 dmin2 = min( dmin2, z( 4*n0+pp-1 ) )
+                 z( 4*n0+pp-1 ) = min( z( 4*n0+pp-1 ), z( 4*i0+pp-1 ),z( 4*i0+pp+3 ) )
+                 z( 4*n0-pp ) = min( z( 4*n0-pp ), z( 4*i0-pp ),z( 4*i0-pp+4 ) )
+                 qmax = max( qmax, z( 4*i0+pp-3 ), z( 4*i0+pp+1 ) )
+                 dmin = -zero
+              end if
+           end if
+           ! choose a shift.
+           call stdlib_dlasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn, dn1,dn2, tau, ttype, &
+                     g )
+           ! call dqds until dmin > 0.
+           70 continue
+           call stdlib_dlasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2, dn,dn1, dn2, ieee, &
+                     eps )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           ! check status.
+           if( dmin>=zero .and. dmin1>=zero ) then
+              ! success.
+              go to 90
+           else if( dmin<zero .and. dmin1>zero .and.z( 4*( n0-1 )-pp )<tol*( sigma+dn1 ) .and.abs(&
+                      dn )<tol*sigma ) then
+              ! convergence hidden by negative dn.
+              z( 4*( n0-1 )-pp+2 ) = zero
+              dmin = zero
+              go to 90
+           else if( dmin<zero ) then
+              ! tau too big. select new tau and try again.
+              nfail = nfail + 1
+              if( ttype<-22 ) then
+                 ! failed twice. play it safe.
+                 tau = zero
+              else if( dmin1>zero ) then
+                 ! late failure. gives excellent shift.
+                 tau = ( tau+dmin )*( one-two*eps )
+                 ttype = ttype - 11
+              else
+                 ! early failure. divide by 4.
+                 tau = qurtr*tau
+                 ttype = ttype - 12
+              end if
+              go to 70
+           else if( stdlib_disnan( dmin ) ) then
+              ! nan.
+              if( tau==zero ) then
+                 go to 80
+              else
+                 tau = zero
+                 go to 70
+              end if
+           else
+              ! possible underflow. play it safe.
+              go to 80
+           end if
+           ! risk of underflow.
+           80 continue
+           call stdlib_dlasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn, dn1, dn2 )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           tau = zero
+           90 continue
+           if( tau<sigma ) then
+              desig = desig + tau
+              t = sigma + desig
+              desig = desig - ( t-sigma )
+           else
+              t = sigma + tau
+              desig = sigma - ( t-tau ) + desig
+           end if
+           sigma = t
+           return
+     end subroutine stdlib_dlasq3
+
+     !> DLATDF: uses the LU factorization of the n-by-n matrix Z computed by
+     !> DGETC2 and computes a contribution to the reciprocal Dif-estimate
+     !> by solving Z * x = b for x, and choosing the r.h.s. b such that
+     !> the norm of x is as large as possible. On entry RHS = b holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by DGETC2 has the form Z = P*L*U*Q,
+     !> where P and Q are permutation matrices. L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_dlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(dp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(dp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 8
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(dp) :: bm, bp, pmone, sminu, splus, temp
+           ! Local Arrays 
+           integer(ilp) :: iwork(maxdim)
+           real(dp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_dlaswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -one
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + one
+                 bm = rhs( j ) - one
+                 splus = one
+                 ! look-ahead for l-part rhs(1:n-1) = + or -1, splus and
+                 ! smin computed more efficiently than in bsolve [1].
+                 splus = splus + stdlib_ddot( n-j, z( j+1, j ), 1, z( j+1, j ), 1 )
+                 sminu = stdlib_ddot( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 )
+                 splus = splus*rhs( j )
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens
+                    ! we choose -1, thereafter +1. this is a simple way to
+                    ! get good estimates of matrices like byers well-known
+                    ! example (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = one
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_daxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u-part, look-ahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_dcopy( n-1, rhs, 1, xp, 1 )
+              xp( n ) = rhs( n ) + one
+              rhs( n ) = rhs( n ) - one
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = one / z( i, i )
+                 xp( i ) = xp( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    xp( i ) = xp( i ) - xp( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( xp( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_dcopy( n, xp, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_dlaswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_dlassq( n, rhs, 1, rdscal, rdsum )
+           else
+              ! ijob = 2, compute approximate nullvector xm of z
+              call stdlib_dgecon( 'I', n, z, ldz, one, temp, work, iwork, info )
+              call stdlib_dcopy( n, work( n+1 ), 1, xm, 1 )
+              ! compute rhs
+              call stdlib_dlaswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+              temp = one / sqrt( stdlib_ddot( n, xm, 1, xm, 1 ) )
+              call stdlib_dscal( n, temp, xm, 1 )
+              call stdlib_dcopy( n, xm, 1, xp, 1 )
+              call stdlib_daxpy( n, one, rhs, 1, xp, 1 )
+              call stdlib_daxpy( n, -one, xm, 1, rhs, 1 )
+              call stdlib_dgesc2( n, z, ldz, rhs, ipiv, jpiv, temp )
+              call stdlib_dgesc2( n, z, ldz, xp, ipiv, jpiv, temp )
+              if( stdlib_dasum( n, xp, 1 )>stdlib_dasum( n, rhs, 1 ) )call stdlib_dcopy( n, xp, 1,&
+                         rhs, 1 )
+              ! compute the sum of squares
+              call stdlib_dlassq( n, rhs, 1, rdscal, rdsum )
+           end if
+           return
+     end subroutine stdlib_dlatdf
+
+     !> DLATRD: reduces NB rows and columns of a real symmetric matrix A to
+     !> symmetric tridiagonal form by an orthogonal similarity
+     !> transformation Q**T * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by DSYTRD.
+
+     pure subroutine stdlib_dlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: e(*), tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           real(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    call stdlib_dgemv( 'NO TRANSPOSE', i, n-i, -one, a( 1, i+1 ),lda, w( i, iw+1 )&
+                              , ldw, one, a( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', i, n-i, -one, w( 1, iw+1 ),ldw, a( i, i+1 )&
+                              , lda, one, a( 1, i ), 1 )
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    call stdlib_dlarfg( i-1, a( i-1, i ), a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = a( i-1, i )
+                    a( i-1, i ) = one
+                    ! compute w(1:i-1,i)
+                    call stdlib_dsymv( 'UPPER', i-1, one, a, lda, a( 1, i ), 1,zero, w( 1, iw ), &
+                              1 )
+                    if( i<n ) then
+                       call stdlib_dgemv( 'TRANSPOSE', i-1, n-i, one, w( 1, iw+1 ),ldw, a( 1, i ),&
+                                  1, zero, w( i+1, iw ), 1 )
+                       call stdlib_dgemv( 'NO TRANSPOSE', i-1, n-i, -one,a( 1, i+1 ), lda, w( i+1,&
+                                  iw ), 1, one,w( 1, iw ), 1 )
+                       call stdlib_dgemv( 'TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( 1, i ), &
+                                 1, zero, w( i+1, iw ), 1 )
+                       call stdlib_dgemv( 'NO TRANSPOSE', i-1, n-i, -one,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, one,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_dscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -half*tau( i-1 )*stdlib_ddot( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_daxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              do i = 1, nb
+                 ! update a(i:n,i)
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, one, a( i, i ), 1 )
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, one, a( i, i ), 1 )
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    call stdlib_dlarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute w(i+1:n,i)
+                    call stdlib_dsymv( 'LOWER', n-i, one, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero,&
+                               w( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', n-i, i-1, one, w( i+1, 1 ), ldw,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-i, i-1, -one, a( i+1, 1 ),lda, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-i, i-1, -one, w( i+1, 1 ),ldw, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_dscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -half*tau( i )*stdlib_ddot( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_daxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dlatrd
+
+     !> DLATRZ: factors the M-by-(M+L) real upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z, by means
+     !> of orthogonal transformations.  Z is an (M+L)-by-(M+L) orthogonal
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_dlatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! test the input arguments
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_dlarfg( l+1, a( i, i ), a( i, n-l+1 ), lda, tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_dlarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,tau( i ), a( 1, i ), &
+                        lda, work )
+           end do
+           return
+     end subroutine stdlib_dlatrz
+
+     !> DORBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned orthogonal matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**T
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See DORCSD
+     !> for details.)
+     !> The orthogonal matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           real(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(dp), parameter :: realone = 1.0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(dp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,sin
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_dscal( p-i+1, z1, x11(i,i), 1 )
+                 else
+                    call stdlib_dscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), 1 )
+                    call stdlib_daxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i,i-1),1, x11(i,i), 1 )
+                              
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_dscal( m-p-i+1, z2, x21(i,i), 1 )
+                 else
+                    call stdlib_dscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), 1 )
+                    call stdlib_daxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i,i-1),1, x21(i,i), &
+                              1 )
+                 end if
+                 theta(i) = atan2( stdlib_dnrm2( m-p-i+1, x21(i,i), 1 ),stdlib_dnrm2( p-i+1, x11(&
+                           i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if( p == i ) then
+                    call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = one
+                 if ( m-p > i ) then
+                    call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i,i), 1, taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_dlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i),x11(i,i+1), ldx11, &
+                              work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_dlarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1, taup1(i),x12(i,i), ldx12,&
+                               work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_dlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21,&
+                               work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_dlarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1, taup2(i),x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_dscal( q-i, -z1*z3*sin(theta(i)), x11(i,i+1),ldx11 )
+                    call stdlib_daxpy( q-i, z2*z3*cos(theta(i)), x21(i,i+1), ldx21,x11(i,i+1), &
+                              ldx11 )
+                 end if
+                 call stdlib_dscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), ldx12 )
+                 call stdlib_daxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), ldx22,x12(i,i), ldx12 &
+                           )
+                 if( i < q )phi(i) = atan2( stdlib_dnrm2( q-i, x11(i,i+1), ldx11 ),stdlib_dnrm2( &
+                           m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    if ( q-i == 1 ) then
+                       call stdlib_dlarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_dlarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = one
+                 end if
+                 if ( q+i-1 < m ) then
+                    if ( m-q == i ) then
+                       call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_dlarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_dlarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_dlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_dlarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_dscal( m-q-i+1, -z1*z4, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_dlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_dlarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_dscal( m-p-q-i+1, z2*z4, x22(q+i,p+i), ldx22 )
+                 if ( i == m-p-q ) then
+                    call stdlib_dlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i),ldx22, tauq2(p+i) )
+                              
+                 else
+                    call stdlib_dlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i)&
+                               )
+                 end if
+                 x22(q+i,p+i) = one
+                 if ( i < m-p-q ) then
+                    call stdlib_dlarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), &
+                              x22(q+i+1,p+i), ldx22, work )
+                 end if
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_dscal( p-i+1, z1, x11(i,i), ldx11 )
+                 else
+                    call stdlib_dscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), ldx11 )
+                    call stdlib_daxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i-1,i),ldx12, x11(i,i),&
+                               ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_dscal( m-p-i+1, z2, x21(i,i), ldx21 )
+                 else
+                    call stdlib_dscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), ldx21 )
+                    call stdlib_daxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i-1,i),ldx22, x21(i,&
+                              i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_dnrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_dnrm2( p-i+1, &
+                           x11(i,i), ldx11 ) )
+                 call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = one
+                 if ( i == m-p ) then
+                    call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_dlarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), &
+                              ldx11, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_dlarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11,taup1(i), x12(i,i), &
+                              ldx12, work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_dlarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                              ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_dlarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_dscal( q-i, -z1*z3*sin(theta(i)), x11(i+1,i), 1 )
+                    call stdlib_daxpy( q-i, z2*z3*cos(theta(i)), x21(i+1,i), 1,x11(i+1,i), 1 )
+                              
+                 end if
+                 call stdlib_dscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), 1 )
+                 call stdlib_daxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), 1,x12(i,i), 1 )
+                           
+                 if( i < q )phi(i) = atan2( stdlib_dnrm2( q-i, x11(i+1,i), 1 ),stdlib_dnrm2( m-q-&
+                           i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    if ( q-i == 1) then
+                       call stdlib_dlarfgp( q-i, x11(i+1,i), x11(i+1,i), 1,tauq1(i) )
+                    else
+                       call stdlib_dlarfgp( q-i, x11(i+1,i), x11(i+2,i), 1,tauq1(i) )
+                    end if
+                    x11(i+1,i) = one
+                 end if
+                 if ( m-q > i ) then
+                    call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1,tauq2(i) )
+                 else
+                    call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i,i), 1,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_dlarf( 'L', q-i, p-i, x11(i+1,i), 1, tauq1(i),x11(i+1,i+1), ldx11,&
+                               work )
+                    call stdlib_dlarf( 'L', q-i, m-p-i, x11(i+1,i), 1, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 call stdlib_dlarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12, &
+                           work )
+                 if ( m-p-i > 0 ) then
+                    call stdlib_dlarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1, tauq2(i),x22(i,i+1), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_dscal( m-q-i+1, -z1*z4, x12(i,i), 1 )
+                 call stdlib_dlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_dlarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12,&
+                               work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_dlarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1, tauq2(i),&
+                           x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_dscal( m-p-q-i+1, z2*z4, x22(p+i,q+i), 1 )
+                 if ( m-p-q == i ) then
+                    call stdlib_dlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i,q+i), 1,tauq2(p+i) )
+                              
+                 else
+                    call stdlib_dlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                              
+                    call stdlib_dlarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,tauq2(p+i), x22(p+&
+                              i,q+i+1), ldx22, work )
+                 end if
+                 x22(p+i,q+i) = one
+              end do
+           end if
+           return
+     end subroutine stdlib_dorbdb
+
+     !> DORBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_dorbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_dorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_dnrm2(m1,x1,incx1) /= zero.or. stdlib_dnrm2(m2,x2,incx2) /= zero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              x1(i) = one
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              call stdlib_dorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_dnrm2(m1,x1,incx1) /= zero.or. stdlib_dnrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              x2(i) = one
+              call stdlib_dorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_dnrm2(m1,x1,incx1) /= zero.or. stdlib_dnrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_dorbdb5
+
+     !> DORCSD: computes the CS decomposition of an M-by-M partitioned
+     !> orthogonal matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_dorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: theta(*)
+           real(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           real(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_dorcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, iwork, info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_dorcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iphi = 2
+              itaup1 = iphi + max( 1, q - 1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_dorgqr( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_dorglq( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, theta, v1t, u1, u2, v1t,v2t, work, -1, childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              ib11d = itauq2 + max( 1, m - q )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, u1, u1, u1, u1, u1, u1, u1, u1, work, -1,&
+                        childinfo )
+              lbbcsdworkopt = int( work(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -22
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_dorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_dlacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_dorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_dlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 if (m-p > q) then
+                    call stdlib_dlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
+                              ldv2t )
+                 end if
+                 if (m > q) then
+                    call stdlib_dorglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                              lorglqwork, info )
+                 end if
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_dlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_dorglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_dlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_dorglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_dlacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_dorgqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_dlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 call stdlib_dlacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
+                           
+                 call stdlib_dorgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_dbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
+            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
+            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
+                      info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_dlapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_dlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_dlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_dorcsd
+     end subroutine stdlib_dorcsd
+
+     !> DORGHR: generates a real orthogonal matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> DGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_dorghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DORGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors one
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = zero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = zero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_dorgqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dorghr
+
+     !> DORHR_COL: takes an M-by-N real matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as DGEQRT).
+
+     pure subroutine stdlib_dorhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the orthogonal
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_dlaorhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_dtrsm( 'R', 'U', 'N', 'N', m-n, n, one, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_dcopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==one ) then
+                    call stdlib_dscal( j-jbtemp1, -one, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_dtrsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_dtrsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = zero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_dtrsm( 'R', 'L', 'T', 'U', jnb, jnb, one,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_dorhr_col
+
+     !> DORMHR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by DGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_dormhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_dormqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormhr
+
+     !> DPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite band matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by DPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_dlatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_dlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_dlatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_dlatbs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_idamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_drscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_dpbcon
+
+     !> DPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_dpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dsbmv( uplo, n, kd, -one, ab, ldab, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( ab( kd+1, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ab( 1, k ) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_dpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_dpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dpbrfs
+
+     !> DPFTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPFTRF.
+
+     pure subroutine stdlib_dpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: a(0:*)
+           real(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_dtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_dtfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+           else
+              call stdlib_dtfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_dtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_dpftrs
+
+     !> DPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by DPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dpocon( uplo, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_dlatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_dlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_dlatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_dlatrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_idamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_drscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_dpocon
+
+     !> DPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_dporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dsymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_dpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dporfs
+
+     !> DPOTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dpotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_ddot( j-1, a( 1, j ), 1, a( 1, j ), 1 )
+                 if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_dgemv( 'TRANSPOSE', j-1, n-j, -one, a( 1, j+1 ),lda, a( 1, j ), 1,&
+                               one, a( j, j+1 ), lda )
+                    call stdlib_dscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_ddot( j-1, a( j, 1 ), lda, a( j, 1 ),lda )
+                 if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-j, j-1, -one, a( j+1, 1 ),lda, a( j, 1 ),&
+                               lda, one, a( j+1, j ), 1 )
+                    call stdlib_dscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_dpotf2
+
+     !> DPOTRF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then calls itself to factor A22.
+
+     pure recursive subroutine stdlib_dpotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              if( a( 1, 1 )<=zero.or.stdlib_disnan( a( 1, 1 ) ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( a( 1, 1 ) )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_dpotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**t*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_dtrsm( 'L', 'U', 'T', 'N', n1, n2, one,a( 1, 1 ), lda, a( 1, n1+1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_dsyrk( uplo, 'T', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_dpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**t
+              else
+                 ! update and scale a21
+                 call stdlib_dtrsm( 'R', 'L', 'T', 'N', n2, n1, one,a( 1, 1 ), lda, a( n1+1, 1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_dsyrk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_dpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dpotrf2
+
+     !> DPOTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPOTRF.
+
+     pure subroutine stdlib_dpotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_dtrtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**t or inv(l)**t * inv(l).
+           call stdlib_dlauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_dpotri
+
+     !> DPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite packed matrix using
+     !> the Cholesky factorization A = U**T*U or A = L*L**T computed by
+     !> DPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dppcon( uplo, n, ap, anorm, rcond, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_dlatps( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scalel,&
+                            work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_dlatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_dlatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_dlatps( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scaleu,&
+                            work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_idamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_drscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_dppcon
+
+     !> DPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_dpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_dpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dpprfs
+
+     !> DPPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_dppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_dpptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dpptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_dppsv
+
+     !> DPPSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: afp(*), ap(*), b(ldb,*), s(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_dppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_dlaqsp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t * u or a = l * l**t.
+              call stdlib_dcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_dpptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dppcon( uplo, n, afp, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dpptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_dpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, iwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dppsvx
+
+     !> DPPTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPPTRF.
+
+     pure subroutine stdlib_dpptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(dp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_dtptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**t.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_dspr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = ap( jj )
+                 call stdlib_dscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**t * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = stdlib_ddot( n-j+1, ap( jj ), 1, ap( jj ), 1 )
+                 if( j<n )call stdlib_dtpmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n-j,ap( jjn ), ap( &
+                           jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_dpptri
+
+     !> DPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_dpstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           pvt = 1
+           ajj = a( pvt, pvt )
+           do i = 2, n
+              if( a( i, i )>ajj ) then
+                 pvt = i
+                 ajj = a( pvt, pvt )
+              end if
+           end do
+           if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 170
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              dstop = n * stdlib_dlamch( 'EPSILON' ) * ajj
+           else
+              dstop = tol
+           end if
+           ! set first half of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**t * u
+              loop_130: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( j-1, i )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_dswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_dswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    call stdlib_dswap( pvt-j-1, a( j, j+1 ), lda, a( j+1, pvt ), 1 )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_dgemv( 'TRANS', j-1, n-j, -one, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              one, a( j, j+1 ), lda )
+                    call stdlib_dscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_130
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( i, j-1 )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_dswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_dswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    call stdlib_dswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ), lda )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_dgemv( 'NO TRANS', n-j, j-1, -one, a( j+1, 1 ), lda,a( j, 1 ), &
+                              lda, one, a( j+1, j ), 1 )
+                    call stdlib_dscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_150
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 170
+           160 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           170 continue
+           return
+     end subroutine stdlib_dpstf2
+
+     !> DPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_dpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'DPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_dpstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 200
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              pvt = 1
+              ajj = a( pvt, pvt )
+              do i = 2, n
+                 if( a( i, i )>ajj ) then
+                    pvt = i
+                    ajj = a( pvt, pvt )
+                 end if
+              end do
+              if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 200
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 dstop = n * stdlib_dlamch( 'EPSILON' ) * ajj
+              else
+                 dstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**t * u
+                 loop_140: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_130: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( j-1, i )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_dswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_dswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          call stdlib_dswap( pvt-j-1, a( j, j+1 ), lda,a( j+1, pvt ), 1 )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_dgemv( 'TRANS', j-k, n-j, -one, a( k, j+1 ),lda, a( k, j ), &
+                                    1, one, a( j, j+1 ),lda )
+                          call stdlib_dscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_130
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_dsyrk( 'UPPER', 'TRANS', n-j+1, jb, -one,a( k, j ), lda, one, &
+                                 a( j, j ), lda )
+                    end if
+                 end do loop_140
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+                 loop_180: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_170: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( i, j-1 )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_dswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_dswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          call stdlib_dswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ),lda )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_dgemv( 'NO TRANS', n-j, j-k, -one,a( j+1, k ), lda, a( j, k &
+                                    ), lda, one,a( j+1, j ), 1 )
+                          call stdlib_dscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_170
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_dsyrk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_180
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_dpstrf
+
+     !> DPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by DPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_dpttrs( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'DPTTRS', ' ', n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_dptts2( n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_dptts2( n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_dpttrs
+
+     !> DSB2ST_KERNELS: is an internal routine used by the DSYTRD_SB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_dsb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           real(dp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = ( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = zero
+                   end do
+                   ctmp = ( a( ofdpos, st ) )
+                   call stdlib_dlarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_dlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_dlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_dlarfx( 'LEFT', ln, lm, v( vpos ),( tau( taupos ) ),a( dpos-nb,&
+                                  j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )          =( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = zero
+                       end do
+                       ctmp = ( a( dpos-nb, j1 ) )
+                       call stdlib_dlarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_dlarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = zero
+                   end do
+                   call stdlib_dlarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_dlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_dlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_dlarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = zero
+                       end do
+                       call stdlib_dlarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_dlarfx( 'LEFT', lm, ln-1, v( vpos ),( tau( taupos ) ),a( dpos+&
+                                 nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_dsb2st_kernels
+
+     !> DSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric packed matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dspcon( uplo, n, ap, ipiv, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_dsptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_dspcon
+
+     !> DSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_dsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n, info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_dsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dsprfs
+
+     !> DSPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_dspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_dsptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_dspsv
+
+     !> DSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: afp(*)
+           real(dp), intent(in) :: ap(*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_dcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_dsptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dspcon( uplo, n, afp, ipiv, anorm, rcond, work, iwork, info )
+           ! compute the solution vectors x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dsptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_dsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dspsvx
+
+     !> DSPTRD: reduces a real symmetric matrix A stored in packed form to
+     !> symmetric tridiagonal form T by an orthogonal similarity
+     !> transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_dsptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           real(dp) :: alpha, taui
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_dlarfg( i, ap( i1+i-1 ), ap( i1 ), 1, taui )
+                 e( i ) = ap( i1+i-1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_dspmv( uplo, i, taui, ap, ap( i1 ), 1, zero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_ddot( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_daxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_dspr2( uplo, i, -one, ap( i1 ), 1, tau, 1, ap )
+                    ap( i1+i-1 ) = e( i )
+                 end if
+                 d( i+1 ) = ap( i1+i )
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = ap( 1 )
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_dlarfg( n-i, ap( ii+1 ), ap( ii+2 ), 1, taui )
+                 e( i ) = ap( ii+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_dspmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,zero, tau( i ), &
+                              1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_ddot( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_daxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_dspr2( uplo, n-i, -one, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                    ap( ii+1 ) = e( i )
+                 end if
+                 d( i ) = ap( ii )
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = ap( ii )
+           end if
+           return
+     end subroutine stdlib_dsptrd
+
+     !> DSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+
+     pure subroutine stdlib_dstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(in) :: d(*), e(*), w(*)
+           real(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: odm3 = 1.0e-3_dp
+           real(dp), parameter :: odm1 = 1.0e-1_dp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, nblk, nrmchk
+           real(dp) :: dtpcrt, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, tol, xj, xjm, &
+                     ztr
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_dlamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_dlarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_160: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              dtpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_150: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_160
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 120
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_dlarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_dcopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_dcopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_dcopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_dlagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 100
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_dscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_dlagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 90
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ztr = -stdlib_ddot( blksiz, work( indrv1+1 ), 1, z( b1, i ),1 )
+                       call stdlib_daxpy( blksiz, ztr, z( b1, i ), 1,work( indrv1+1 ), 1 )
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 90 continue
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<dtpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 110
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 100 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 110 continue
+                 scl = one / stdlib_dnrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_dscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 120 continue
+                 do i = 1, n
+                    z( i, j ) = zero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = work( indrv1+i )
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_150
+           end do loop_160
+           return
+     end subroutine stdlib_dstein
+
+     !> DSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band symmetric matrix can also be found
+     !> if DSYTRD or DSPTRD or DSBTRD has been used to reduce this matrix to
+     !> tridiagonal form.
+
+     pure subroutine stdlib_dsteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(dp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = one
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_dlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_dlanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_dlae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_dlaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_dlasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_dlae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_dlapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_dlartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_dlasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_dlae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_dlaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_dlasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_dlae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_dlapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_dlartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_dlasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_dlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_dlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 190
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_dlasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           190 continue
+           return
+     end subroutine stdlib_dsteqr
+
+     !> DSTERF: computes all eigenvalues of a symmetric tridiagonal matrix
+     !> using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+
+     pure subroutine stdlib_dsterf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iscale, jtot, l, l1, lend, lendsv, lsv, m, nmaxit
+           real(dp) :: alpha, anorm, bb, c, eps, eps2, gamma, oldc, oldgam, p, r, rt1, rt2, rte, &
+                     s, safmax, safmin, sigma, ssfmax, ssfmin, rmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! quick return if possible
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DSTERF', -info )
+              return
+           end if
+           if( n<=1 )return
+           ! determine the unit roundoff for this environment.
+           eps = stdlib_dlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           rmax = stdlib_dlamch( 'O' )
+           ! compute the eigenvalues of the tridiagonal matrix.
+           nmaxit = n*maxit
+           sigma = zero
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           10 continue
+           if( l1>n )go to 170
+           if( l1>1 )e( l1-1 ) = zero
+           do m = l1, n - 1
+              if( abs( e( m ) )<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) &
+                        then
+                 e( m ) = zero
+                 go to 30
+              end if
+           end do
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_dlanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( (anorm>ssfmax) ) then
+              iscale = 1
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           do i = l, lend - 1
+              e( i ) = e( i )**2
+           end do
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>=l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              50 continue
+              if( l/=lend ) then
+                 do m = l, lend - 1
+                    if( abs( e( m ) )<=eps2*abs( d( m )*d( m+1 ) ) )go to 70
+                 end do
+              end if
+              m = lend
+              70 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 90
+              ! if remaining matrix is 2 by 2, use stdlib_dlae2 to compute its
+              ! eigenvalues.
+              if( m==l+1 ) then
+                 rte = sqrt( e( l ) )
+                 call stdlib_dlae2( d( l ), rte, d( l+1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 50
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l ) )
+              sigma = ( d( l+1 )-p ) / ( two*rte )
+              r = stdlib_dlapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m - 1, l, -1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m-1 )e( i+1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i+1 ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l ) = s*p
+              d( l ) = sigma + gamma
+              go to 50
+              ! eigenvalue found.
+              90 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 50
+              go to 150
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              100 continue
+              do m = l, lend + 1, -1
+                 if( abs( e( m-1 ) )<=eps2*abs( d( m )*d( m-1 ) ) )go to 120
+              end do
+              m = lend
+              120 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 140
+              ! if remaining matrix is 2 by 2, use stdlib_dlae2 to compute its
+              ! eigenvalues.
+              if( m==l-1 ) then
+                 rte = sqrt( e( l-1 ) )
+                 call stdlib_dlae2( d( l ), rte, d( l-1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l-1 ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 100
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l-1 ) )
+              sigma = ( d( l-1 )-p ) / ( two*rte )
+              r = stdlib_dlapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m, l - 1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m )e( i-1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i+1 )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l-1 ) = s*p
+              d( l ) = sigma + gamma
+              go to 100
+              ! eigenvalue found.
+              140 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 100
+              go to 150
+           end if
+           ! undo scaling if necessary
+           150 continue
+           if( iscale==1 )call stdlib_dlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           if( iscale==2 )call stdlib_dlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 180
+           ! sort eigenvalues in increasing order.
+           170 continue
+           call stdlib_dlasrt( 'I', n, d, info )
+           180 continue
+           return
+     end subroutine stdlib_dsterf
+
+     !> DSTEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure subroutine stdlib_dstev( jobz, n, d, e, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: imax, iscale
+           real(dp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_dlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( n, sigma, d, 1 )
+              call stdlib_dscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_dsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, d, e, info )
+           else
+              call stdlib_dsteqr( 'I', n, d, e, z, ldz, work, info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, d, 1 )
+           end if
+           return
+     end subroutine stdlib_dstev
+
+     !> DSTEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix A.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_dstevx( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, imax, indibl, indisp, indiwo, indwrk, iscale, itmp1, j, jj, &
+                     nsplit
+           real(dp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           tnrm = stdlib_dlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( n, sigma, d, 1 )
+              call stdlib_dscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! if all eigenvalues are desired and abstol is less than zero, then
+           ! call stdlib_dsterf or stdlib_ssteqr.  if this fails for some eigenvalue, then
+           ! try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_dcopy( n, d, 1, w, 1 )
+              call stdlib_dcopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              indwrk = n + 1
+              if( .not.wantz ) then
+                 call stdlib_dsterf( n, w, work, info )
+              else
+                 call stdlib_dsteqr( 'I', n, w, work, z, ldz, work( indwrk ), info )
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indwrk = 1
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ),work( indwrk ), iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_dstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work( &
+                        indwrk ), iwork( indiwo ), ifail,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dstevx
+
+     !> DSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_dsytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_dsycon
+
+     !> DSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_dsytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_dsycon_rook
+
+     !> DSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_dsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dsymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_dsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dsyrfs
+
+     !> DSYSV_RK: computes the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> DSYTRF_RK is called to compute the factorization of a real
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine DSYTRS_3.
+
+     pure subroutine stdlib_dsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_dsytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**t)*(p**t) or
+           ! a = p*u*d*(u**t)*(p**t).
+           call stdlib_dsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_dsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsysv_rk
+
+     !> DSYSV_ROOK: computes the solution to a real system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> DSYTRF_ROOK is called to compute the factorization of a real
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling DSYTRS_ROOK.
+
+     pure subroutine stdlib_dsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_dsytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_dsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_dsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsysv_rook
+
+     !> DSYTD2: reduces a real symmetric matrix A to symmetric tridiagonal
+     !> form T by an orthogonal similarity transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_dsytd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(dp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_dlarfg( i, a( i, i+1 ), a( 1, i+1 ), 1, taui )
+                 e( i ) = a( i, i+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = one
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_dsymv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, zero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_ddot( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_daxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_dsyr2( uplo, i, -one, a( 1, i+1 ), 1, tau, 1, a,lda )
+                    a( i, i+1 ) = e( i )
+                 end if
+                 d( i+1 ) = a( i+1, i+1 )
+                 tau( i ) = taui
+              end do
+              d( 1 ) = a( 1, 1 )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_dlarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,taui )
+                 e( i ) = a( i+1, i )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = one
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_dsymv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_ddot( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_daxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_dsyr2( uplo, n-i, -one, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 ),&
+                               lda )
+                    a( i+1, i ) = e( i )
+                 end if
+                 d( i ) = a( i, i )
+                 tau( i ) = taui
+              end do
+              d( n ) = a( n, n )
+           end if
+           return
+     end subroutine stdlib_dsytd2
+
+     !> DSYTF2: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_dsytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(dp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_idamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_idamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_idamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_dswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_dswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / a( k, k )
+                    call stdlib_dsyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_dscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_idamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_idamax( imax-k, a( imax, k ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_idamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_dswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_dswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       d11 = one / a( k, k )
+                       call stdlib_dsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_dscal( n-k, d11, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( (a(k) a(k+1))*d(k)**(-1) ) * (a(k) a(k+1))**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_dsytf2
+
+     !> DSYTRD: reduces a real symmetric matrix A to real symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_dsytrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), e(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'DSYTRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'DSYTRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_dlatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_dsyr2k( uplo, 'NO TRANSPOSE', i-1, nb, -one, a( 1, i ),lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_dsytd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_dlatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+ib:n,i+ib:n), using
+                 ! an update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_dsyr2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -one,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_dsytd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsytrd
+
+     !> DSYTRD_SB2ST: reduces a real symmetric band matrix A to real symmetric
+     !> tridiagonal form T by a orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_dsytrd_sb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sidev, sizetau, ldv, lhmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'DSYTRD_SB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'DSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'DSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD_SB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sidev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! real because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               if( upper ) then
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i+1 ) )
+                   end do
+               else
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i ) )
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the symmetric band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=dp)/real(kd,KIND=dp) )
+           stepercol = ceiling( real(shift,KIND=dp)/real(grsiz,KIND=dp) )
+           thgrnb    = ceiling( real(n-1,KIND=dp)/real(thgrsiz,KIND=dp) )
+           call stdlib_dlacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_dlaset( "A", kd,   n, zero, zero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_dsb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                               !$OMP END TASK
+                                   !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_dsb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                             indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           do i = 1, n
+               d( i ) = ( work( dpos+(i-1)*lda ) )
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+i*lda ) )
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+(i-1)*lda ) )
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_dsytrd_sb2st
+
+     !> DSYTRF: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U**T*D*U  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_dlasyf( uplo, k, nb, kb, a, lda, ipiv, work, ldwork,iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_dsytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_dlasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_dlasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, ldwork, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_dsytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsytrf
+
+     !> DTBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_dtbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ab(ldab,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=dp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_dlantb( norm, uplo, diag, n, kd, ab, ldab, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_dlatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_dlatbs( uplo, 'TRANSPOSE', diag, normin, n, kd, ab,ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_idamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_drscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_dtbcon
+
+     !> DTFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_dtftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_dtrtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'L', 'N', diag, n2, n1, -one, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_dtrtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'U', 'T', diag, n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_dtrtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'L', 'T', diag, n1, n2, -one, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_dtrtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'U', 'N', diag, n1, n2, one, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_dtrtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'U', 'N', diag, n1, n2, -one, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_dtrtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'L', 'T', diag, n1, n2, one, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_dtrtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'U', 'T', diag, n2, n1, -one,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_dtrtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'L', 'N', diag, n2, n1, one,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_dtrtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'L', 'N', diag, k, k, -one, a( 1 ),n+1, a( k+1 ), n+1 &
+                              )
+                    call stdlib_dtrtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'U', 'T', diag, k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_dtrtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'L', 'T', diag, k, k, -one, a( k+1 ),n+1, a( 0 ), n+1 &
+                              )
+                    call stdlib_dtrtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'U', 'N', diag, k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_dtrtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'U', 'N', diag, k, k, -one, a( k ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_dtrtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'L', 'T', diag, k, k, one, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_dtrtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'R', 'U', 'T', diag, k, k, -one,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_dtrtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_dtrmm( 'L', 'L', 'N', diag, k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dtftri
+
+     !> DTGSY2: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                (1)
+     !> D * R - L * E = scale * F,
+     !> using Level 1 and 2 BLAS. where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively, with real entries. (A, D) and (B, E)
+     !> must be in generalized Schur canonical form, i.e. A, B are upper
+     !> quasi triangular and D, E are upper triangular. The solution (R, L)
+     !> overwrites (C, F). 0 <= SCALE <= 1 is an output scaling factor
+     !> chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Z*x = scale*b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ],
+     !> Ik is the identity matrix of size k and X**T is the transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> In the process of solving (1), we solve a number of such systems
+     !> where Dim(In), Dim(In) = 1 or 2.
+     !> If TRANS = 'T', solve the transposed system Z**T*y = scale*b for y,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R  + D**T * L   = scale * C           (3)
+     !> R  * B**T + L  * E**T  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> sigma_min(Z) using reverse communication with DLACON.
+     !> DTGSY2 also (IJOB >= 1) contributes to the computation in DTGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of the matrix pair in
+     !> DTGSYL. See DTGSYL for details.
+
+     pure subroutine stdlib_dtgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,iwork, pq, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info, pq
+           real(dp), intent(inout) :: rdscal, rdsum
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(dp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_dcopy by calls to stdlib_dlaset.
+        ! sven hammarling, 27/5/02.
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 8
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ie, ierr, ii, is, isp1, j, je, jj, js, jsp1, k, mb, nb, p, q, &
+                     zdim
+           real(dp) :: alpha, scaloc
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           real(dp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSY2', -info )
+              return
+           end if
+           ! determine block structure of a
+           pq = 0
+           p = 0
+           i = 1
+           10 continue
+           if( i>m )go to 20
+           p = p + 1
+           iwork( p ) = i
+           if( i==m )go to 20
+           if( a( i+1, i )/=zero ) then
+              i = i + 2
+           else
+              i = i + 1
+           end if
+           go to 10
+           20 continue
+           iwork( p+1 ) = m + 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           30 continue
+           if( j>n )go to 40
+           q = q + 1
+           iwork( q ) = j
+           if( j==n )go to 40
+           if( b( j+1, j )/=zero ) then
+              j = j + 2
+           else
+              j = j + 1
+           end if
+           go to 30
+           40 continue
+           iwork( q+1 ) = n + 1
+           pq = p*( q-p-1 )
+           if( notran ) then
+              ! solve (i, j) - subsystem
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+              scale = one
+              scaloc = one
+              loop_120: do j = p + 2, q
+                 js = iwork( j )
+                 jsp1 = js + 1
+                 je = iwork( j+1 ) - 1
+                 nb = je - js + 1
+                 loop_110: do i = p, 1, -1
+                    is = iwork( i )
+                    isp1 = is + 1
+                    ie = iwork( i+1 ) - 1
+                    mb = ie - is + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = d( is, is )
+                       z( 1, 2 ) = -b( js, js )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_dlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_daxpy( is-1, alpha, a( 1, is ), 1, c( 1, js ),1 )
+                          call stdlib_daxpy( is-1, alpha, d( 1, is ), 1, f( 1, js ),1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_daxpy( n-je, rhs( 2 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_daxpy( n-je, rhs( 2 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = d( is, is )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = -b( js, jsp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = -e( js, jsp1 )
+                       z( 1, 4 ) = -b( jsp1, js )
+                       z( 2, 4 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_dlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_dger( is-1, nb, -one, a( 1, is ), 1, rhs( 1 ),1, c( 1, js ),&
+                                     ldc )
+                          call stdlib_dger( is-1, nb, -one, d( 1, is ), 1, rhs( 1 ),1, f( 1, js ),&
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_daxpy( n-je, rhs( 3 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_daxpy( n-je, rhs( 3 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                          call stdlib_daxpy( n-je, rhs( 4 ), b( jsp1, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_daxpy( n-je, rhs( 4 ), e( jsp1, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = d( is, isp1 )
+                       z( 4, 2 ) = d( isp1, isp1 )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = -b( js, js )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_dlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_dgemv( 'N', is-1, mb, -one, a( 1, is ), lda,rhs( 1 ), 1, &
+                                    one, c( 1, js ), 1 )
+                          call stdlib_dgemv( 'N', is-1, mb, -one, d( 1, is ), ldd,rhs( 1 ), 1, &
+                                    one, f( 1, js ), 1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_dger( mb, n-je, one, rhs( 3 ), 1,b( js, je+1 ), ldb, c( is, &
+                                    je+1 ), ldc )
+                          call stdlib_dger( mb, n-je, one, rhs( 3 ), 1,e( js, je+1 ), lde, f( is, &
+                                    je+1 ), ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z * x = rhs
+                       call stdlib_dlaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 5, 1 ) = d( is, is )
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 5, 2 ) = d( is, isp1 )
+                       z( 6, 2 ) = d( isp1, isp1 )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( isp1, is )
+                       z( 7, 3 ) = d( is, is )
+                       z( 3, 4 ) = a( is, isp1 )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 7, 4 ) = d( is, isp1 )
+                       z( 8, 4 ) = d( isp1, isp1 )
+                       z( 1, 5 ) = -b( js, js )
+                       z( 3, 5 ) = -b( js, jsp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 7, 5 ) = -e( js, jsp1 )
+                       z( 2, 6 ) = -b( js, js )
+                       z( 4, 6 ) = -b( js, jsp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 8, 6 ) = -e( js, jsp1 )
+                       z( 1, 7 ) = -b( jsp1, js )
+                       z( 3, 7 ) = -b( jsp1, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 2, 8 ) = -b( jsp1, js )
+                       z( 4, 8 ) = -b( jsp1, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_dcopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_dcopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_dlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_dcopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_dcopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_dgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, rhs( 1 &
+                                    ), mb, one,c( 1, js ), ldc )
+                          call stdlib_dgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, rhs( 1 &
+                                    ), mb, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          k = mb*nb + 1
+                          call stdlib_dgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, b( js, je+&
+                                    1 ), ldb, one,c( is, je+1 ), ldc )
+                          call stdlib_dgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, e( js, je+&
+                                    1 ), lde, one,f( is, je+1 ), ldf )
+                       end if
+                    end if
+                 end do loop_110
+              end do loop_120
+           else
+              ! solve (i, j) - subsystem
+                   ! a(i, i)**t * r(i, j) + d(i, i)**t * l(j, j)  =  c(i, j)
+                   ! r(i, i)  * b(j, j) + l(i, j)  * e(j, j)  = -f(i, j)
+              ! for i = 1, 2, ..., p, j = q, q - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_200: do i = 1, p
+                 is = iwork( i )
+                 isp1 = is + 1
+                 ie = iwork ( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_190: do j = q, p + 2, -1
+                    js = iwork( j )
+                    jsp1 = js + 1
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = -b( js, js )
+                       z( 1, 2 ) = d( is, is )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          alpha = rhs( 1 )
+                          call stdlib_daxpy( js-1, alpha, b( 1, js ), 1, f( is, 1 ),ldf )
+                          alpha = rhs( 2 )
+                          call stdlib_daxpy( js-1, alpha, e( 1, js ), 1, f( is, 1 ),ldf )
+                       end if
+                       if( i<p ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_daxpy( m-ie, alpha, a( is, ie+1 ), lda,c( ie+1, js ), 1 )
+                                    
+                          alpha = -rhs( 2 )
+                          call stdlib_daxpy( m-ie, alpha, d( is, ie+1 ), ldd,c( ie+1, js ), 1 )
+                                    
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = -b( js, jsp1 )
+                       z( 4, 2 ) = -b( jsp1, jsp1 )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( is, is )
+                       z( 3, 4 ) = -e( js, jsp1 )
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z**t * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_daxpy( js-1, rhs( 1 ), b( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_daxpy( js-1, rhs( 2 ), b( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_daxpy( js-1, rhs( 3 ), e( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_daxpy( js-1, rhs( 4 ), e( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                       end if
+                       if( i<p ) then
+                          call stdlib_dger( m-ie, nb, -one, a( is, ie+1 ), lda,rhs( 1 ), 1, c( ie+&
+                                    1, js ), ldc )
+                          call stdlib_dger( m-ie, nb, -one, d( is, ie+1 ), ldd,rhs( 3 ), 1, c( ie+&
+                                    1, js ), ldc )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = -b( js, js )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = d( is, isp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( isp1, isp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_dger( mb, js-1, one, rhs( 1 ), 1, b( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                          call stdlib_dger( mb, js-1, one, rhs( 3 ), 1, e( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_dgemv( 'T', mb, m-ie, -one, a( is, ie+1 ),lda, rhs( 1 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                          call stdlib_dgemv( 'T', mb, m-ie, -one, d( is, ie+1 ),ldd, rhs( 3 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z**t * x = rhs
+                       call stdlib_dlaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 5, 1 ) = -b( js, js )
+                       z( 7, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 6, 2 ) = -b( js, js )
+                       z( 8, 2 ) = -b( jsp1, js )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( is, isp1 )
+                       z( 5, 3 ) = -b( js, jsp1 )
+                       z( 7, 3 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = a( isp1, is )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 6, 4 ) = -b( js, jsp1 )
+                       z( 8, 4 ) = -b( jsp1, jsp1 )
+                       z( 1, 5 ) = d( is, is )
+                       z( 2, 5 ) = d( is, isp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 2, 6 ) = d( isp1, isp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 3, 7 ) = d( is, is )
+                       z( 4, 7 ) = d( is, isp1 )
+                       z( 5, 7 ) = -e( js, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 4, 8 ) = d( isp1, isp1 )
+                       z( 6, 8 ) = -e( js, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_dcopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_dcopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z**t * x = rhs
+                       call stdlib_dgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_dgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_dcopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_dcopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_dgemm( 'N', 'T', mb, js-1, nb, one,c( is, js ), ldc, b( 1, &
+                                    js ), ldb, one,f( is, 1 ), ldf )
+                          call stdlib_dgemm( 'N', 'T', mb, js-1, nb, one,f( is, js ), ldf, e( 1, &
+                                    js ), lde, one,f( is, 1 ), ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_dgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( &
+                                    is, js ), ldc,one, c( ie+1, js ), ldc )
+                          call stdlib_dgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( &
+                                    is, js ), ldf,one, c( ie+1, js ), ldc )
+                       end if
+                    end if
+                 end do loop_190
+              end do loop_200
+           end if
+           return
+     end subroutine stdlib_dtgsy2
+
+     !> DTGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                 (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with real entries. (A, D) and (B, E) must be in
+     !> generalized (real) Schur canonical form, i.e. A, B are upper quasi
+     !> triangular and D, E are upper triangular.
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve  Zx = scale b, where
+     !> Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]         (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ].
+     !> Here Ik is the identity matrix of size k and X**T is the transpose of
+     !> X. kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'T', DTGSYL solves the transposed system Z**T*y = scale*b,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R + D**T * L = scale * C           (3)
+     !> R * B**T + L * E**T = scale * -F
+     !> This case (TRANS = 'T') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using DLACON.
+     !> If IJOB >= 1, DTGSYL computes a Frobenius norm-based estimate
+     !> of Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z. See [1-2] for more
+     !> information.
+     !> This is a level 3 BLAS algorithm.
+
+     pure subroutine stdlib_dtgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(dp), intent(inout) :: c(ldc,*), f(ldf,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_dcopy by calls to stdlib_dlaset.
+        ! sven hammarling, 1/5/02.
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, ppqq, pq, q
+           real(dp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: real,max,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'DTGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'DTGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_dlaset( 'F', m, n, zero, zero, c, ldc )
+                 call stdlib_dlaset( 'F', m, n, zero, zero, f, ldf )
+              else if( ijob>=1 ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              loop_30: do iround = 1, isolve
+                 ! use unblocked level 2 solver
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 call stdlib_dtgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,iwork, pq, info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_dlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_dlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_dlaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_dlaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_dlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_dlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           if( a( i, i-1 )/=zero )i = i + 1
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           if( b( j, j-1 )/=zero )j = j + 1
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j)-subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1,..., 1; j = 1, 2,..., q
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 scale = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       ppqq = 0
+                       call stdlib_dtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + ppqq
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_dscal( is-1, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( is-1, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_dscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                             call stdlib_dscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_dgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, c( is, &
+                                    js ), ldc, one,c( 1, js ), ldc )
+                          call stdlib_dgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, c( is, &
+                                    js ), ldc, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_dgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, b( js, &
+                                    je+1 ), ldb,one, c( is, je+1 ), ldc )
+                          call stdlib_dgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, e( js, &
+                                    je+1 ), lde,one, f( is, je+1 ), ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_dlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_dlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_dlaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_dlaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_dlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_dlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                   ! a(i, i)**t * r(i, j)  + d(i, i)**t * l(i, j)  =  c(i, j)
+                   ! r(i, j)  * b(j, j)**t + l(i, j)  * e(j, j)**t = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_dtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_dscal( is-1, scaloc, c( 1, k ), 1 )
+                          call stdlib_dscal( is-1, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_dscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                          call stdlib_dscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                       end do
+                       do k = je + 1, n
+                          call stdlib_dscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_dscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_dgemm( 'N', 'T', mb, js-1, nb, one, c( is, js ),ldc, b( 1, js )&
+                                 , ldb, one, f( is, 1 ),ldf )
+                       call stdlib_dgemm( 'N', 'T', mb, js-1, nb, one, f( is, js ),ldf, e( 1, js )&
+                                 , lde, one, f( is, 1 ),ldf )
+                    end if
+                    if( i<p ) then
+                       call stdlib_dgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( is, &
+                                 js ), ldc, one,c( ie+1, js ), ldc )
+                       call stdlib_dgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( is, &
+                                 js ), ldf, one,c( ie+1, js ), ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_dtgsyl
+
+     !> DTPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_dtpcon( norm, uplo, diag, n, ap, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: ap(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=dp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_dlantp( norm, uplo, diag, n, ap, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_dlatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_dlatps( uplo, 'TRANSPOSE', diag, normin, n, ap,work, scale, work( &
+                              2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_idamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_drscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_dtpcon
+
+     !> DTPLQT2: computes a LQ a factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_dtplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_dlarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              if( i<m ) then
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_dgemv( 'N', m-i, p, one, b( i+1, 1 ), ldb,b( i, 1 ), ldb, one, t( m, &
+                           1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_dger( m-i, p, alpha,  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n) * (alpha * c(i,i:n)^h)
+              alpha = -t( 1, i )
+              do j = 1, i-1
+                 t( i, j ) = zero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = alpha*b( i, n-l+j )
+              end do
+              call stdlib_dtrmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_dgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, zero, t(&
+                         i,mp ), ldt )
+              ! b1
+              call stdlib_dgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,one, t( i, 1 ), ldt &
+                        )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+             call stdlib_dtrmv( 'L', 'T', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = zero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=t(j,i)
+                 t(j,i)= zero
+              end do
+           end do
+     end subroutine stdlib_dtplqt2
+
+     !> DTPMQRT applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_dtpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           real(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_dtprfb( 'L', 'T', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_dtprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'R', 'T', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_dtpmlqt
+
+     !> DTPMQRT: applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_dtpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           real(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'L', 'T', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_dtprfb( 'R', 'T', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_dtpmqrt
+
+     !> DTPQRT2: computes a QR factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_dtpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_dlarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)^h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = (a( i, i+j ))
+                 end do
+                 call stdlib_dgemv( 'T', p, n-i, one, b( 1, i+1 ), ldb,b( 1, i ), 1, one, t( 1, n &
+                           ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*(t( j, n ))
+                 end do
+                 call stdlib_dger( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)^h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = zero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_dtrmv( 'U', 'T', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_dgemv( 'T', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, zero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_dgemv( 'T', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,one, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_dtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = zero
+           end do
+     end subroutine stdlib_dtpqrt2
+
+     !> DTRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_dtrcon( norm, uplo, diag, n, a, lda, rcond, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=dp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_dlantr( norm, uplo, diag, n, n, a, lda, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_dlatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_dlatrs( uplo, 'TRANSPOSE', diag, normin, n, a, lda,work, scale, &
+                              work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_idamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_drscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_dtrcon
+
+     !> DTZRZF: reduces the M-by-N ( M<=N ) real upper trapezoidal matrix A
+     !> to upper triangular form by means of orthogonal transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N orthogonal matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_dtzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_dlatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_dlarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_dlarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_dlatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dtzrzf
+
+     !> DGBSV: computes the solution to a real system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_dgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_dgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_dgbsv
+
+     !> DGBSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), c(*), r(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(dp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_dgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_dlaqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_dcopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_dgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_dlantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_dlangb( norm, n, kl, ku, ab, ldab, work )
+           rpvgrw = stdlib_dlantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_dlangb( 'M', n, kl, ku, ab, ldab, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, iwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_dgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_dgbsvx
+
+     !> DGEBAL: balances a general real matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_dgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: scale(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sclfac = 2.0e+0_dp
+           real(dp), parameter :: factor = 0.95e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(dp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_dswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_dswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( a( j, i )/=zero )cycle loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( a( i, j )/=zero )cycle loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_dnrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_dnrm2( l-k+1, a( i, k ), lda )
+              ica = stdlib_idamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_idamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+                 if( stdlib_disnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'DGEBAL', -info )
+                 return
+              end if
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_dscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_dscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_dgebal
+
+     !> DGEBD2: reduces a real general m by n matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_dgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'DGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 call stdlib_dlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_dlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tauq( i ),a( i, i+&
+                           1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_dlarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_dlarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = zero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_dlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_dlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    call stdlib_dlarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! apply h(i) to a(i+1:m,i+1:n) from the left
+                    call stdlib_dlarf( 'LEFT', m-i, n-i, a( i+1, i ), 1, tauq( i ),a( i+1, i+1 ), &
+                              lda, work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = zero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dgebd2
+
+     !> DGEHD2: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_dgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              call stdlib_dlarfg( ihi-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+              aii = a( i+1, i )
+              a( i+1, i ) = one
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_dlarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i) to a(i+1:ihi,i+1:n) from the left
+              call stdlib_dlarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1, tau( i ),a( i+1, i+1 ), lda, &
+                        work )
+              a( i+1, i ) = aii
+           end do
+           return
+     end subroutine stdlib_dgehd2
+
+     !> DGELQ2: computes an LQ factorization of a real m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_dgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_dlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_dlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_dgelq2
+
+     !> DGELQF: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_dgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_dgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_dlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_dlarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_dgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgelqf
+
+     !> DGELQT3: recursively computes a LQ factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_dgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_dlarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_dgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) = q1^h a(j1:m,1:n) [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', 'T', 'U', m2, m1, one,a, lda, t( i1, 1 ), ldt )
+              call stdlib_dgemm( 'N', 'T', m2, m1, n-m1, one, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        one, t( i1, 1 ), ldt)
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', m2, m1, one,t, ldt, t( i1, 1 ), ldt )
+              call stdlib_dgemm( 'N', 'N', m2, n-m1, m1, -one, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        one, a( i1, i1 ), lda )
+              call stdlib_dtrmm( 'R', 'U', 'N', 'U', m2, m1 , one,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )=0
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_dgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'U', 'T', 'U', m1, m2, one,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_dgemm( 'N', 'T', m1, m2, n-m, one, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, i1 ), ldt )
+              call stdlib_dtrmm( 'L', 'U', 'N', 'N', m1, m2, -one, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', m1, m2, one,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_dgelqt3
+
+     !> DGEQL2: computes a QL factorization of a real m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_dgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              call stdlib_dlarfg( m-k+i, a( m-k+i, n-k+i ), a( 1, n-k+i ), 1,tau( i ) )
+              ! apply h(i) to a(1:m-k+i,1:n-k+i-1) from the left
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_dlarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1, tau( i ),a, lda, work )
+                        
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_dgeql2
+
+     !> DGEQLF: computes a QL factorization of a real M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_dgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_dgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_dlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_dlarfb( 'LEFT', 'TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-1, &
+                    n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_dgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgeqlf
+
+     !> DGEQR2: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_dgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_dlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_dlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_dgeqr2
+
+     !> DGEQR2P: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_dgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_dlarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_dlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_dgeqr2p
+
+     !> DGEQRF: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_dgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_dgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_dlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_dlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_dgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgeqrf
+
+     !> DGEQR2P computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_dgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_dgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_dlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_dlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_dgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgeqrfp
+
+     !> DGEQRT2: computes a QR factorization of a real M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_dgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_dlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 ! w(1:n-i) := a(i:m,i+1:n)^h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_dgemv( 'T',m-i+1, n-i, one, a( i, i+1 ), lda,a( i, i ), 1, zero, t( &
+                           1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 call stdlib_dger( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), lda &
+                           )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = one
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**t * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_dgemv( 'T', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, zero, t( 1, &
+                        i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_dtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = zero
+           end do
+     end subroutine stdlib_dgeqrt2
+
+     !> DGEQRT3: recursively computes a QR factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_dgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_dlarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_dgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1^h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_dtrmm( 'L', 'L', 'T', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              call stdlib_dgemm( 'T', 'N', n1, n2, m-n1, one, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt)
+              call stdlib_dtrmm( 'L', 'U', 'T', 'N', n1, n2, one,t, ldt, t( 1, j1 ), ldt )
+              call stdlib_dgemm( 'N', 'N', m-n1, n2, n1, -one, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        one, a( j1, j1 ), lda )
+              call stdlib_dtrmm( 'L', 'L', 'N', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_dgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1^h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = (a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_dtrmm( 'R', 'L', 'N', 'U', n1, n2, one,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_dgemm( 'T', 'N', n1, n2, m-n, one, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt )
+              call stdlib_dtrmm( 'L', 'U', 'N', 'N', n1, n2, -one, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_dtrmm( 'R', 'U', 'N', 'N', n1, n2, one,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_dgeqrt3
+
+     !> DGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_dgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dgemv( trans, n, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = abs( x( k, j ) )
+                    do i = 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dgetrs( transt, n, 1, af, ldaf, ipiv, work( n+1 ),n, info )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_dgerfs
+
+     !> DGERQ2: computes an RQ factorization of a real m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_dgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_dlarfg( n-k+i, a( m-k+i, n-k+i ), a( m-k+i, 1 ), lda,tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_dlarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_dgerq2
+
+     !> DGERQF: computes an RQ factorization of a real M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_dgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_dgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_dlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_dlarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_dgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgerqf
+
+     !> DGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_dgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_dgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_dgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_dlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_dlaswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_dtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dgetrf
+
+     !> DGGHD3: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then DGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of DGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_dgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(dp) :: c, c1, c2, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = real( lwkopt,KIND=dp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_dlaset( 'ALL', n, n, zero, one, q, ldq )
+           if( initz )call stdlib_dlaset( 'ALL', n, n, zero, one, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_dlaset( 'LOWER', n-1, n-1, zero, zero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = one
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'DGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small orthogonal factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_dlaset( 'ALL', nblst, nblst, zero, one, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_dlaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_dlartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = c
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       c = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = c*temp - s*work( jj )
+                          work( jj ) = s*temp + c*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = c*temp - s*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + c*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_dlartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = zero
+                          call stdlib_drot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = c
+                          b( jj+1, j ) = -s
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    ! explicit loop unrolling provides better performance
+                    ! compared to stdlib_dlasr.
+                     ! call stdlib_dlasr( 'right', 'variable', 'backward', ihi-top,
+           ! $                     ihi-j, a( j+2, j ), b( j+2, j ),
+           ! $                     a( top+1, j+1 ), lda )
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       c = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + s2*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + s1*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = c*temp1 + s*temp
+                          a( k, j+i ) = -s*temp1 + c*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          call stdlib_drot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, a( &
+                                    j+1+i, j ),-b( j+1+i, j ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated orthogonal
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_dgemv( 'TRANSPOSE', nblst, len, one, work,nblst, a( jrow, j+1 )&
+                                 , 1, zero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_dtrmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_dgemv( 'TRANSPOSE', len, nblst-len, one,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, one,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated orthogonal
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_dtrmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_dtrmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_dgemv( 'TRANSPOSE', nnb, len, one,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,one, work( pw ), 1 )
+                          call stdlib_dgemv( 'TRANSPOSE', len, nnb, one,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, one,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated orthogonal matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', nblst,cola, nblst, one, work, &
+                           nblst,a( j, jcol+nnb ), lda, zero, work( pw ),nblst )
+                 call stdlib_dlacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_dorm22( 'LEFT', 'TRANSPOSE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, one, &
+                                 work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, zero, work( pw ),2*nnb )
+                       call stdlib_dlacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated orthogonal matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, q( &
+                              topq, j ), ldq,work, nblst, zero, work( pw ), nh )
+                    call stdlib_dlacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_dorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     q( topq, j ), ldq,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_dlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small orthogonal factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_dlaset( 'ALL', nblst, nblst, zero, one, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_dlaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          c = a( i, j )
+                          a( i, j ) = zero
+                          s = b( i, j )
+                          b( i, j ) = zero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             c = a( i, j )
+                             a( i, j ) = zero
+                             s = b( i, j )
+                             b( i, j ) = zero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = c*temp - s*work( jj )
+                                work( jj ) = s*temp + c*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_dlaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated orthogonal matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, a( &
+                              1, j ), lda,work, nblst, zero, work( pw ), top )
+                    call stdlib_dlacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_dorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, a( 1, j ), lda,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_dlacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, b( &
+                              1, j ), ldb,work, nblst, zero, work( pw ), top )
+                    call stdlib_dlacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_dorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, b( 1, j ), ldb,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_dlacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated orthogonal matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, z( &
+                              topq, j ), ldz,work, nblst, zero, work( pw ), nh )
+                    call stdlib_dlacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_dorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     z( topq, j ), ldz,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_dlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_dgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = real( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_dgghd3
+
+     !> DGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**T*(inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_dggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_dgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := q**t*b.
+           call stdlib_dormqr( 'LEFT', 'TRANSPOSE', n, p, min( n, m ), a, lda, taua,b, ldb, work, &
+                     lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_dgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_dggqrf
+
+     !> DGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**T
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_dggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'DGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_dgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := b*q**t
+           call stdlib_dormrq( 'RIGHT', 'TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 ), 1 ), &
+                     lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_dgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_dggrqf
+
+     !> DGSVJ0: is called from DGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as DGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_dgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(dp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), sva(n), d(n), v(ldv,*)
+           real(dp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(dp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! -#- row-cyclic jacobi svd algorithm with column pivoting -#-
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! -#- row-cyclic pivot strategy with de rijk's pivoting -#-
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_sgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_sgesvj. for sweeps i=1:swband the procedure
+           ! ......
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           swband = 0
+           pskipped = 0
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_dswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_dswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = d( p )
+                          d( p ) = d( q )
+                          d( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! some blas implementations compute stdlib_dnrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_ddot(m,a(1,p),1,a(1,p),1)), which may result in
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_dnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_dnrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_dnrm2( m, a(1,p), 1 ) * d(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_dnrm2( m, a( 1, p ), 1 )*d( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_dlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*d( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+                 ! rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_daxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_daxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_dcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp, one, m,1, work, lda, &
+                                                ierr )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*d( p ) / d( q )
+                                      call stdlib_daxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_dlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_dlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! -#- m x 2 jacobi svd -#-
+              ! -#- safe gram matrix computation -#-
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_daxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_daxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_dcopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_daxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_dcopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_daxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_dlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_dlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dnrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_dlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=dp)*tol ) .and.( real( n,KIND=dp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of iterations.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given tolerance, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d.
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_dswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_dswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_dgsvj0
+
+     !> DGSVJ1: is called from DGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as DGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> DGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_dgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), d(n), sva(n), v(ldv,*)
+           real(dp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, large, mxaapq, &
+           mxsinj, rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, &
+                     thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(dp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=dp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           fastr( 1 ) = zero
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_sgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_sgesvj.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = 1, nblc
+                    jgl = n1 + ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! Safe Gram Matrix Computation 
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_ddot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq) / aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_daxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_daxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_dcopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_daxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_dcopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_daxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_dlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_dlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                 ! skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+            ! if ( notrot >= emptsw )  go to 2011
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+      ! **      if ( notrot >= emptsw )  go to 2011
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **   if ( notrot >= emptsw ) go to 1994
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dnrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_dlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=dp)*tol ) .and.( real( n,KIND=dp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of sweeps.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given threshold, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_dswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_dswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_dgsvj1
+
+     !> DGTCON: estimates the reciprocal of the condition number of a real
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> DGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_dgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==zero )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_dlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_dgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**t)*inv(u**t).
+                 call stdlib_dgttrs( 'TRANSPOSE', n, 1, dl, d, du, du2, ipiv, work,n, info )
+                           
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_dgtcon
+
+     !> DGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_dgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'T'
+           else
+              transn = 'T'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_dcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_dlagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work( n+1 ), &
+                        n )
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( du( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( dl( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( du( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( dl( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( dl( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( du( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( dl( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( du( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dgttrs( trans, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_dlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_dlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_dgttrs( transt, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_dgttrs( transn, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_dgtrfs
+
+     !> DGTSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B or A**T * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_dgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(dp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_dcopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_dcopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_dcopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_dgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_dlangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,iwork, info )
+                     
+           ! compute the solution vectors x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_dgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dgtsvx
+
+     !> DHGEQZ: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by DGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from DGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_dhgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alphar, alphai, &
+               beta, q, ldq, z, ldz, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), work(*)
+           real(dp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: safety = 1.0e+2_dp
+          ! $                     safety = one )
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilpivt, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(dp) :: a11, a12, a1i, a1r, a21, a22, a2i, a2r, ad11, ad11l, ad12, ad12l, ad21, &
+           ad21l, ad22, ad22l, ad32l, an, anorm, ascale, atol, b11, b1a, b1i, b1r, b22, b2a, b2i, &
+           b2r, bn, bnorm, bscale, btol, c, c11i, c11r, c12, c21, c22i, c22r, cl, cq, cr, cz, &
+           eshift, s, s1, s1inv, s2, safmax, safmin, scale, sl, sqi, sqr, sr, szi, szr, t1, tau, &
+           temp, temp2, tempi, tempr, u1, u12, u12l, u2, ulp, vs, w11, w12, w21, w22, wabs, wi, &
+                     wr, wr2
+           ! Local Arrays 
+           real(dp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=dp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_dlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           ulp = stdlib_dlamch( 'E' )*stdlib_dlamch( 'B' )
+           anorm = stdlib_dlanhs( 'F', in, h( ilo, ilo ), ldh, work )
+           bnorm = stdlib_dlanhs( 'F', in, t( ilo, ilo ), ldt, work )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 380
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever.
+              ! row operations modify columns whatever:ilastm.
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = zero
+           maxit = 30*( ihi-ilo+1 )
+           loop_360: do jiter = 1, maxit
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              if( ilast==ilo ) then
+                 ! special case: j=ilast
+                 go to 80
+              else
+                 if( abs( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs( h( ilast, ilast ) ) + abs(&
+                            h( ilast-1, ilast-1 ) )) ) ) then
+                    h( ilast, ilast-1 ) = zero
+                    go to 80
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = zero
+                 go to 70
+              end if
+              ! general case: j<ilast
+              loop_60: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs( h( j, j-1 ) )<=max( safmin, ulp*(abs( h( j, j ) ) + abs( h( j-1, j-1 &
+                              ) )) ) ) then
+                       h( j, j-1 ) = zero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = zero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       temp = abs( h( j, j-1 ) )
+                       temp2 = abs( h( j, j ) )
+                       tempr = max( temp, temp2 )
+                       if( tempr<one .and. tempr/=zero ) then
+                          temp = temp / tempr
+                          temp2 = temp2 / tempr
+                       end if
+                       if( temp*( ascale*abs( h( j+1, j ) ) )<=temp2*( ascale*atol ) )ilazr2 = &
+                                 .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          temp = h( jch, jch )
+                          call stdlib_dlartg( temp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = zero
+                          call stdlib_drot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_drot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_drot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 80
+                             else
+                                ifirst = jch + 1
+                                go to 110
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = zero
+                       end do
+                       go to 70
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          temp = t( jch, jch+1 )
+                          call stdlib_dlartg( temp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = zero
+                          if( jch<ilastm-1 )call stdlib_drot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_drot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_drot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          temp = h( jch+1, jch )
+                          call stdlib_dlartg( temp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = zero
+                          call stdlib_drot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_drot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_drot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 70
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 110
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_60
+              ! (drop-through is "impossible")
+              info = n + 1
+              go to 420
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              70 continue
+              temp = h( ilast, ilast )
+              call stdlib_dlartg( temp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = zero
+              call stdlib_drot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_drot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_drot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alphar, alphai,
+                                    ! and beta
+                                    80 continue
+              if( t( ilast, ilast )<zero ) then
+                 if( ilschr ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                 else
+                    h( ilast, ilast ) = -h( ilast, ilast )
+                    t( ilast, ilast ) = -t( ilast, ilast )
+                 end if
+                 if( ilz ) then
+                    do j = 1, n
+                       z( j, ilast ) = -z( j, ilast )
+                    end do
+                 end if
+              end if
+              alphar( ilast ) = h( ilast, ilast )
+              alphai( ilast ) = zero
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 380
+              ! reset counters
+              iiter = 0
+              eshift = zero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 350
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast. we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              110 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute single shifts.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10==iiter ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 ! (single shift only.)
+                 if( ( real( maxit,KIND=dp)*safmin )*abs( h( ilast, ilast-1 ) )<abs( t( ilast-1, &
+                           ilast-1 ) ) ) then
+                    eshift = h( ilast, ilast-1 ) /t( ilast-1, ilast-1 )
+                 else
+                    eshift = eshift + one / ( safmin*real( maxit,KIND=dp) )
+                 end if
+                 s1 = one
+                 wr = eshift
+              else
+                 ! shifts based on the generalized eigenvalues of the
+                 ! bottom-right 2x2 block of a and b. the first eigenvalue
+                 ! returned by stdlib_dlag2 is the wilkinson shift (aep p.512_dp),
+                 call stdlib_dlag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,s2, wr, wr2, wi )
+                 if ( abs( (wr/s1)*t( ilast, ilast ) - h( ilast, ilast ) )> abs( (wr2/s2)*t( &
+                           ilast, ilast )- h( ilast, ilast ) ) ) then
+                    temp = wr
+                    wr = wr2
+                    wr2 = temp
+                    temp = s1
+                    s1 = s2
+                    s2 = temp
+                 end if
+                 temp = max( s1, safmin*max( one, abs( wr ), abs( wi ) ) )
+                 if( wi/=zero )go to 200
+              end if
+              ! fiddle with shift to avoid overflow
+              temp = min( ascale, one )*( half*safmax )
+              if( s1>temp ) then
+                 scale = temp / s1
+              else
+                 scale = one
+              end if
+              temp = min( bscale, one )*( half*safmax )
+              if( abs( wr )>temp )scale = min( scale, temp / abs( wr ) )
+              s1 = scale*s1
+              wr = scale*wr
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 temp = abs( s1*h( j, j-1 ) )
+                 temp2 = abs( s1*h( j, j )-wr*t( j, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs( ( ascale*h( j+1, j ) )*temp )<=( ascale*atol )*temp2 )go to 130
+              end do
+              istart = ifirst
+              130 continue
+              ! do an implicit single-shift qz sweep.
+              ! initial q
+              temp = s1*h( istart, istart ) - wr*t( istart, istart )
+              temp2 = s1*h( istart+1, istart )
+              call stdlib_dlartg( temp, temp2, c, s, tempr )
+              ! sweep
+              loop_190: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    temp = h( j, j-1 )
+                    call stdlib_dlartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = zero
+                 end if
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 temp = t( j+1, j+1 )
+                 call stdlib_dlartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, min( j+2, ilast )
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, j
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+              end do loop_190
+              go to 350
+              ! use francis double-shift
+              ! note: the francis double-shift should work with real shifts,
+                    ! but only if the block is at least 3x3.
+                    ! this code may break if this point is reached with
+                    ! a 2x2 block with real eigenvalues.
+                    200 continue
+              if( ifirst+1==ilast ) then
+                 ! special case -- 2x2 block with complex eigenvectors
+                 ! step 1: standardize, that is, rotate so that
+                             ! ( b11  0  )
+                         ! b = (         )  with b11 non-negative.
+                             ! (  0  b22 )
+                 call stdlib_dlasv2( t( ilast-1, ilast-1 ), t( ilast-1, ilast ),t( ilast, ilast ),&
+                            b22, b11, sr, cr, sl, cl )
+                 if( b11<zero ) then
+                    cr = -cr
+                    sr = -sr
+                    b11 = -b11
+                    b22 = -b22
+                 end if
+                 call stdlib_drot( ilastm+1-ifirst, h( ilast-1, ilast-1 ), ldh,h( ilast, ilast-1 )&
+                           , ldh, cl, sl )
+                 call stdlib_drot( ilast+1-ifrstm, h( ifrstm, ilast-1 ), 1,h( ifrstm, ilast ), 1, &
+                           cr, sr )
+                 if( ilast<ilastm )call stdlib_drot( ilastm-ilast, t( ilast-1, ilast+1 ), ldt,t( &
+                           ilast, ilast+1 ), ldt, cl, sl )
+                 if( ifrstm<ilast-1 )call stdlib_drot( ifirst-ifrstm, t( ifrstm, ilast-1 ), 1,t( &
+                           ifrstm, ilast ), 1, cr, sr )
+                 if( ilq )call stdlib_drot( n, q( 1, ilast-1 ), 1, q( 1, ilast ), 1, cl,sl )
+                           
+                 if( ilz )call stdlib_drot( n, z( 1, ilast-1 ), 1, z( 1, ilast ), 1, cr,sr )
+                           
+                 t( ilast-1, ilast-1 ) = b11
+                 t( ilast-1, ilast ) = zero
+                 t( ilast, ilast-1 ) = zero
+                 t( ilast, ilast ) = b22
+                 ! if b22 is negative, negate column ilast
+                 if( b22<zero ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                    if( ilz ) then
+                       do j = 1, n
+                          z( j, ilast ) = -z( j, ilast )
+                       end do
+                    end if
+                    b22 = -b22
+                 end if
+                 ! step 2: compute alphar, alphai, and beta (see refs.)
+                 ! recompute shift
+                 call stdlib_dlag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,temp, wr, temp2, wi )
+                 ! if standardization has perturbed the shift onto real line,
+                 ! do another (real single-shift) qr step.
+                 if( wi==zero )go to 350
+                 s1inv = one / s1
+                 ! do eispack (qzval) computation of alpha and beta
+                 a11 = h( ilast-1, ilast-1 )
+                 a21 = h( ilast, ilast-1 )
+                 a12 = h( ilast-1, ilast )
+                 a22 = h( ilast, ilast )
+                 ! compute complex givens rotation on right
+                 ! (assume some element of c = (sa - wb) > unfl )
+                                  ! __
+                 ! (sa - wb) ( cz   -sz )
+                           ! ( sz    cz )
+                 c11r = s1*a11 - wr*b11
+                 c11i = -wi*b11
+                 c12 = s1*a12
+                 c21 = s1*a21
+                 c22r = s1*a22 - wr*b22
+                 c22i = -wi*b22
+                 if( abs( c11r )+abs( c11i )+abs( c12 )>abs( c21 )+abs( c22r )+abs( c22i ) ) &
+                           then
+                    t1 = stdlib_dlapy3( c12, c11r, c11i )
+                    cz = c12 / t1
+                    szr = -c11r / t1
+                    szi = -c11i / t1
+                 else
+                    cz = stdlib_dlapy2( c22r, c22i )
+                    if( cz<=safmin ) then
+                       cz = zero
+                       szr = one
+                       szi = zero
+                    else
+                       tempr = c22r / cz
+                       tempi = c22i / cz
+                       t1 = stdlib_dlapy2( cz, c21 )
+                       cz = cz / t1
+                       szr = -c21*tempr / t1
+                       szi = c21*tempi / t1
+                    end if
+                 end if
+                 ! compute givens rotation on left
+                 ! (  cq   sq )
+                 ! (  __      )  a or b
+                 ! ( -sq   cq )
+                 an = abs( a11 ) + abs( a12 ) + abs( a21 ) + abs( a22 )
+                 bn = abs( b11 ) + abs( b22 )
+                 wabs = abs( wr ) + abs( wi )
+                 if( s1*an>wabs*bn ) then
+                    cq = cz*b11
+                    sqr = szr*b22
+                    sqi = -szi*b22
+                 else
+                    a1r = cz*a11 + szr*a12
+                    a1i = szi*a12
+                    a2r = cz*a21 + szr*a22
+                    a2i = szi*a22
+                    cq = stdlib_dlapy2( a1r, a1i )
+                    if( cq<=safmin ) then
+                       cq = zero
+                       sqr = one
+                       sqi = zero
+                    else
+                       tempr = a1r / cq
+                       tempi = a1i / cq
+                       sqr = tempr*a2r + tempi*a2i
+                       sqi = tempi*a2r - tempr*a2i
+                    end if
+                 end if
+                 t1 = stdlib_dlapy3( cq, sqr, sqi )
+                 cq = cq / t1
+                 sqr = sqr / t1
+                 sqi = sqi / t1
+                 ! compute diagonal elements of qbz
+                 tempr = sqr*szr - sqi*szi
+                 tempi = sqr*szi + sqi*szr
+                 b1r = cq*cz*b11 + tempr*b22
+                 b1i = tempi*b22
+                 b1a = stdlib_dlapy2( b1r, b1i )
+                 b2r = cq*cz*b22 + tempr*b11
+                 b2i = -tempi*b11
+                 b2a = stdlib_dlapy2( b2r, b2i )
+                 ! normalize so beta > 0, and im( alpha1 ) > 0
+                 beta( ilast-1 ) = b1a
+                 beta( ilast ) = b2a
+                 alphar( ilast-1 ) = ( wr*b1a )*s1inv
+                 alphai( ilast-1 ) = ( wi*b1a )*s1inv
+                 alphar( ilast ) = ( wr*b2a )*s1inv
+                 alphai( ilast ) = -( wi*b2a )*s1inv
+                 ! step 3: go to next block -- exit if finished.
+                 ilast = ifirst - 1
+                 if( ilast<ilo )go to 380
+                 ! reset counters
+                 iiter = 0
+                 eshift = zero
+                 if( .not.ilschr ) then
+                    ilastm = ilast
+                    if( ifrstm>ilast )ifrstm = ilo
+                 end if
+                 go to 350
+              else
+                 ! usual case: 3x3 or larger block, using francis implicit
+                             ! double-shift
+                                          ! 2
+                 ! eigenvalue equation is  w  - c w + d = 0,
+                                               ! -1 2        -1
+                 ! so compute 1st column of  (a b  )  - c a b   + d
+                 ! using the formula in qzit (from eispack)
+                 ! we assume that the block is at least 3x3
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 u12 = t( ilast-1, ilast ) / t( ilast, ilast )
+                 ad11l = ( ascale*h( ifirst, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad21l = ( ascale*h( ifirst+1, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad12l = ( ascale*h( ifirst, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad22l = ( ascale*h( ifirst+1, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad32l = ( ascale*h( ifirst+2, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 u12l = t( ifirst, ifirst+1 ) / t( ifirst+1, ifirst+1 )
+                 v( 1 ) = ( ad11-ad11l )*( ad22-ad11l ) - ad12*ad21 +ad21*u12*ad11l + ( ad12l-&
+                           ad11l*u12l )*ad21l
+                 v( 2 ) = ( ( ad22l-ad11l )-ad21l*u12l-( ad11-ad11l )-( ad22-ad11l )+ad21*u12 )&
+                           *ad21l
+                 v( 3 ) = ad32l*ad21l
+                 istart = ifirst
+                 call stdlib_dlarfg( 3, v( 1 ), v( 2 ), 1, tau )
+                 v( 1 ) = one
+                 ! sweep
+                 loop_290: do j = istart, ilast - 2
+                    ! all but last elements: use 3x3 householder transforms.
+                    ! zero (j-1)st column of a
+                    if( j>istart ) then
+                       v( 1 ) = h( j, j-1 )
+                       v( 2 ) = h( j+1, j-1 )
+                       v( 3 ) = h( j+2, j-1 )
+                       call stdlib_dlarfg( 3, h( j, j-1 ), v( 2 ), 1, tau )
+                       v( 1 ) = one
+                       h( j+1, j-1 ) = zero
+                       h( j+2, j-1 ) = zero
+                    end if
+                    do jc = j, ilastm
+                       temp = tau*( h( j, jc )+v( 2 )*h( j+1, jc )+v( 3 )*h( j+2, jc ) )
+                       h( j, jc ) = h( j, jc ) - temp
+                       h( j+1, jc ) = h( j+1, jc ) - temp*v( 2 )
+                       h( j+2, jc ) = h( j+2, jc ) - temp*v( 3 )
+                       temp2 = tau*( t( j, jc )+v( 2 )*t( j+1, jc )+v( 3 )*t( j+2, jc ) )
+                       t( j, jc ) = t( j, jc ) - temp2
+                       t( j+1, jc ) = t( j+1, jc ) - temp2*v( 2 )
+                       t( j+2, jc ) = t( j+2, jc ) - temp2*v( 3 )
+                    end do
+                    if( ilq ) then
+                       do jr = 1, n
+                          temp = tau*( q( jr, j )+v( 2 )*q( jr, j+1 )+v( 3 )*q( jr, j+2 ) )
+                                    
+                          q( jr, j ) = q( jr, j ) - temp
+                          q( jr, j+1 ) = q( jr, j+1 ) - temp*v( 2 )
+                          q( jr, j+2 ) = q( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    ! zero j-th column of b (see dlagbc for details)
+                    ! swap rows to pivot
+                    ilpivt = .false.
+                    temp = max( abs( t( j+1, j+1 ) ), abs( t( j+1, j+2 ) ) )
+                    temp2 = max( abs( t( j+2, j+1 ) ), abs( t( j+2, j+2 ) ) )
+                    if( max( temp, temp2 )<safmin ) then
+                       scale = zero
+                       u1 = one
+                       u2 = zero
+                       go to 250
+                    else if( temp>=temp2 ) then
+                       w11 = t( j+1, j+1 )
+                       w21 = t( j+2, j+1 )
+                       w12 = t( j+1, j+2 )
+                       w22 = t( j+2, j+2 )
+                       u1 = t( j+1, j )
+                       u2 = t( j+2, j )
+                    else
+                       w21 = t( j+1, j+1 )
+                       w11 = t( j+2, j+1 )
+                       w22 = t( j+1, j+2 )
+                       w12 = t( j+2, j+2 )
+                       u2 = t( j+1, j )
+                       u1 = t( j+2, j )
+                    end if
+                    ! swap columns if nec.
+                    if( abs( w12 )>abs( w11 ) ) then
+                       ilpivt = .true.
+                       temp = w12
+                       temp2 = w22
+                       w12 = w11
+                       w22 = w21
+                       w11 = temp
+                       w21 = temp2
+                    end if
+                    ! lu-factor
+                    temp = w21 / w11
+                    u2 = u2 - temp*u1
+                    w22 = w22 - temp*w12
+                    w21 = zero
+                    ! compute scale
+                    scale = one
+                    if( abs( w22 )<safmin ) then
+                       scale = zero
+                       u2 = one
+                       u1 = -w12 / w11
+                       go to 250
+                    end if
+                    if( abs( w22 )<abs( u2 ) )scale = abs( w22 / u2 )
+                    if( abs( w11 )<abs( u1 ) )scale = min( scale, abs( w11 / u1 ) )
+                    ! solve
+                    u2 = ( scale*u2 ) / w22
+                    u1 = ( scale*u1-w12*u2 ) / w11
+                    250 continue
+                    if( ilpivt ) then
+                       temp = u2
+                       u2 = u1
+                       u1 = temp
+                    end if
+                    ! compute householder vector
+                    t1 = sqrt( scale**2+u1**2+u2**2 )
+                    tau = one + scale / t1
+                    vs = -one / ( scale+t1 )
+                    v( 1 ) = one
+                    v( 2 ) = vs*u1
+                    v( 3 ) = vs*u2
+                    ! apply transformations from the right.
+                    do jr = ifrstm, min( j+3, ilast )
+                       temp = tau*( h( jr, j )+v( 2 )*h( jr, j+1 )+v( 3 )*h( jr, j+2 ) )
+                       h( jr, j ) = h( jr, j ) - temp
+                       h( jr, j+1 ) = h( jr, j+1 ) - temp*v( 2 )
+                       h( jr, j+2 ) = h( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    do jr = ifrstm, j + 2
+                       temp = tau*( t( jr, j )+v( 2 )*t( jr, j+1 )+v( 3 )*t( jr, j+2 ) )
+                       t( jr, j ) = t( jr, j ) - temp
+                       t( jr, j+1 ) = t( jr, j+1 ) - temp*v( 2 )
+                       t( jr, j+2 ) = t( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    if( ilz ) then
+                       do jr = 1, n
+                          temp = tau*( z( jr, j )+v( 2 )*z( jr, j+1 )+v( 3 )*z( jr, j+2 ) )
+                                    
+                          z( jr, j ) = z( jr, j ) - temp
+                          z( jr, j+1 ) = z( jr, j+1 ) - temp*v( 2 )
+                          z( jr, j+2 ) = z( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    t( j+1, j ) = zero
+                    t( j+2, j ) = zero
+                 end do loop_290
+                 ! last elements: use givens rotations
+                 ! rotations from the left
+                 j = ilast - 1
+                 temp = h( j, j-1 )
+                 call stdlib_dlartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                 h( j+1, j-1 ) = zero
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 ! rotations from the right.
+                 temp = t( j+1, j+1 )
+                 call stdlib_dlartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, ilast
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, ilast - 1
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+                 ! end of double-shift code
+              end if
+              go to 350
+              ! end of iteration loop
+              350 continue
+           end do loop_360
+           ! drop-through = non-convergence
+           info = ilast
+           go to 420
+           ! successful completion of all qz steps
+           380 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           420 continue
+           work( 1 ) = real( n,KIND=dp)
+           return
+     end subroutine stdlib_dhgeqz
+
+     !> DLABRD: reduces the first NB rows and columns of a real general
+     !> m by n matrix A to upper or lower bidiagonal form by an orthogonal
+     !> transformation Q**T * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by DGEBRD
+
+     pure subroutine stdlib_dlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), e(*), taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_dgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, one, a( i, i ), 1 )
+                 call stdlib_dgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, x( i, 1 ),ldx, a( 1, i ), 1,&
+                            one, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 call stdlib_dlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_dgemv( 'TRANSPOSE', m-i+1, n-i, one, a( i, i+1 ),lda, a( i, i ), &
+                              1, zero, y( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', m-i+1, i-1, one, a( i, 1 ), lda,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', m-i+1, i-1, one, x( i, 1 ), ldx,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, y( 1, i ), 1,&
+                               one, y( i+1, i ), 1 )
+                    call stdlib_dscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-i, i, -one, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, one, a( i, i+1 ), lda )
+                    call stdlib_dgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, x( i, 1 ), &
+                              ldx, one, a( i, i+1 ), lda )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    call stdlib_dlarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, zero, x( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', n-i, i, one, y( i+1, 1 ), ldy,a( i, i+1 ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i, -one, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, one, x( i+1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, zero, x( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_dscal( m-i, taup( i ), x( i+1, i ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, one, a( i, i ), lda )
+                 call stdlib_dgemv( 'TRANSPOSE', i-1, n-i+1, -one, a( 1, i ), lda,x( i, 1 ), ldx, &
+                           one, a( i, i ), lda )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 call stdlib_dlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<m ) then
+                    a( i, i ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, n-i+1, one, a( i+1, i ),lda, a( i, i )&
+                              , lda, zero, x( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', n-i+1, i-1, one, y( i, 1 ), ldy,a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', i-1, n-i+1, one, a( 1, i ),lda, a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_dscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    ! update a(i+1:m,i)
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, y( i, 1 ),&
+                               ldy, one, a( i+1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', m-i, i, -one, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, one, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    call stdlib_dlarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_dgemv( 'TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i+1, i ),&
+                               1, zero, y( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', m-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, y( 1, i ), 1 )
+                    call stdlib_dgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', m-i, i, one, x( i+1, 1 ), ldx,a( i+1, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_dgemv( 'TRANSPOSE', i, n-i, -one, a( 1, i+1 ), lda,y( 1, i ), 1, &
+                              one, y( i+1, i ), 1 )
+                    call stdlib_dscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_dlabrd
+
+     !> DLADIV: performs complex division in  real arithmetic
+     !> a + i*b
+     !> p + i*q = ---------
+     !> c + i*d
+     !> The algorithm is due to Michael Baudin and Robert L. Smith
+     !> and can be found in the paper
+     !> "A Robust Complex Division in Scilab"
+
+     pure subroutine stdlib_dladiv( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: a, b, c, d
+           real(dp), intent(out) :: p, q
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: bs = 2.0_dp
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, cc, dd, ab, cd, s, ov, un, be, eps
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           aa = a
+           bb = b
+           cc = c
+           dd = d
+           ab = max( abs(a), abs(b) )
+           cd = max( abs(c), abs(d) )
+           s = one
+           ov = stdlib_dlamch( 'OVERFLOW THRESHOLD' )
+           un = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'EPSILON' )
+           be = bs / (eps*eps)
+           if( ab >= half*ov ) then
+              aa = half * aa
+              bb = half * bb
+              s  = two * s
+           end if
+           if( cd >= half*ov ) then
+              cc = half * cc
+              dd = half * dd
+              s  = half * s
+           end if
+           if( ab <= un*bs/eps ) then
+              aa = aa * be
+              bb = bb * be
+              s  = s / be
+           end if
+           if( cd <= un*bs/eps ) then
+              cc = cc * be
+              dd = dd * be
+              s  = s * be
+           end if
+           if( abs( d )<=abs( c ) ) then
+              call stdlib_dladiv1(aa, bb, cc, dd, p, q)
+           else
+              call stdlib_dladiv1(bb, aa, dd, cc, p, q)
+              q = -q
+           end if
+           p = p * s
+           q = q * s
+           return
+     end subroutine stdlib_dladiv
+
+     !> This subroutine computes the I-th updated eigenvalue of a symmetric
+     !> rank-one modification to a diagonal matrix whose elements are
+     !> given in the array d, and that
+     !> D(i) < D(j)  for  i < j
+     !> and that RHO > 0.  This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D )  +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_dlaed4( n, i, d, z, delta, rho, dlam, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(out) :: dlam
+           real(dp), intent(in) :: rho
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*), z(*)
+           real(dp), intent(out) :: delta(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(dp) :: a, b, c, del, dltlb, dltub, dphi, dpsi, dw, eps, erretm, eta, midpt, phi, &
+                     prew, psi, rhoinv, tau, temp, temp1, w
+           ! Local Arrays 
+           real(dp) :: zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+               ! presumably, i=1 upon entry
+              dlam = d( 1 ) + rho*z( 1 )*z( 1 )
+              delta( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_dlaed5( i, d, z, delta, rho, dlam )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_dlamch( 'EPSILON' )
+           rhoinv = one / rho
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              midpt = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / delta( ii ) +z( n )*z( n ) / delta( n )
+              if( w<=zero ) then
+                 temp = z( n-1 )*z( n-1 ) / ( d( n )-d( n-1 )+rho ) +z( n )*z( n ) / rho
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    del = d( n ) - d( n-1 )
+                    a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*del
+                    if( a<zero ) then
+                       tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                 end if
+                 ! it can be proved that
+                     ! d(n)+rho/2 <= lambda(n) < d(n)+tau <= d(n)+rho
+                 dltlb = midpt
+                 dltub = rho
+              else
+                 del = d( n ) - d( n-1 )
+                 a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*del
+                 if( a<zero ) then
+                    tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 ! it can be proved that
+                     ! d(n) < d(n)+tau < lambda(n) < d(n)+rho/2
+                 dltlb = zero
+                 dltub = midpt
+              end if
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 dlam = d( i ) + tau
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              c = w - delta( n-1 )*dpsi - delta( n )*dphi
+              a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+              b = delta( n-1 )*delta( n )*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                ! eta = b/a
+                 ! eta = rho - tau
+                 eta = dltub - tau
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              tau = tau + eta
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    dlam = d( i ) + tau
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 c = w - delta( n-1 )*dpsi - delta( n )*dphi
+                 a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+                 b = delta( n-1 )*delta( n )*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 temp = z( n ) / delta( n )
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                           
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              dlam = d( i ) + tau
+              go to 250
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              del = d( ip1 ) - d( i )
+              midpt = del / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / delta( i ) +z( ip1 )*z( ip1 ) / delta( ip1 )
+              if( w>zero ) then
+                 ! d(i)< the ith eigenvalue < (d(i)+d(i+1))/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 a = c*del + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*del
+                 if( a>zero ) then
+                    tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = zero
+                 dltub = midpt
+              else
+                 ! (d(i)+d(i+1))/2 <= the ith eigenvalue < d(i+1)
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 a = c*del - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*del
+                 if( a<zero ) then
+                    tau = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = -midpt
+                 dltub = zero
+              end if
+              if( orgati ) then
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( i ) ) - tau
+                 end do
+              else
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( ip1 ) ) - tau
+                 end do
+              end if
+              if( orgati ) then
+                 ii = i
+              else
+                 ii = i + 1
+              end if
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                        *dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 if( orgati ) then
+                    dlam = d( i ) + tau
+                 else
+                    dlam = d( ip1 ) + tau
+                 end if
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 if( orgati ) then
+                    c = w - delta( ip1 )*dw - ( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                              **2
+                 else
+                    c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                              **2
+                 end if
+                 a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                 b = delta( i )*delta( ip1 )*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + delta( i )*delta( i )*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / delta( iim1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                 else
+                    temp1 = z( iip1 ) / delta( iip1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )*temp1
+                    zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 call stdlib_dlaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                 if( info/=0 )go to 250
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              prew = w
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau+eta )&
+                        *dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              tau = tau + eta
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_240: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    if( orgati ) then
+                       dlam = d( i ) + tau
+                    else
+                       dlam = d( ip1 ) + tau
+                    end if
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - delta( ip1 )*dw -( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                                    **2
+                       else
+                          c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                                    **2
+                       end if
+                    else
+                       temp = z( ii ) / delta( ii )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - delta( i )*dpsi - delta( ip1 )*dphi
+                    end if
+                    a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                    b = delta( i )*delta( ip1 )*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                                          
+                             else
+                                a = z( ip1 )*z( ip1 ) +delta( i )*delta( i )*( dpsi+dphi )
+                             end if
+                          else
+                             a = delta( i )*delta( i )*dpsi +delta( ip1 )*delta( ip1 )&
+                                       *dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - delta( iim1 )*dpsi - delta( iip1 )*dphi
+                       zz( 1 ) = delta( iim1 )*delta( iim1 )*dpsi
+                       zz( 3 ) = delta( iip1 )*delta( iip1 )*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / delta( iim1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )&
+                                    *temp1
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                       else
+                          temp1 = z( iip1 ) / delta( iip1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )&
+                                    *temp1
+                          zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    call stdlib_dlaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                    if( info/=0 )go to 250
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 prew = w
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / delta( j )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 temp = z( ii ) / delta( ii )
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                           *dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_240
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              if( orgati ) then
+                 dlam = d( i ) + tau
+              else
+                 dlam = d( ip1 ) + tau
+              end if
+           end if
+           250 continue
+           return
+     end subroutine stdlib_dlaed4
+
+     !> DLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_dlaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho,cutpnt, z, dlamda, &
+               q2, ldq2, w, perm, givptr,givcol, givnum, indxp, indx, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, icompq, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(dp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(dp), intent(out) :: dlamda(*), givnum(2,*), q2(ldq2,*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: mone = -1.0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(dp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -3
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -4
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -10
+           else if( ldq2<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_dscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_dscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_dlamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_idamax( n, z, 1 )
+           jmax = stdlib_idamax( n, d, 1 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              if( icompq==0 ) then
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                 end do
+              else
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                    call stdlib_dcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+                 end do
+                 call stdlib_dlacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ),ldq )
+              end if
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 110
+              else
+                 jlam = j
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           if( j>n )go to 100
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_dlapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 if( icompq==1 ) then
+                    call stdlib_drot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j &
+                              ) ) ), 1, c, s )
+                 end if
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           110 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           if( icompq==0 ) then
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+              end do
+           else
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+                 call stdlib_dcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+           end if
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              if( icompq==0 ) then
+                 call stdlib_dcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              else
+                 call stdlib_dcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+                 call stdlib_dlacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2,q( 1, k+1 ), ldq )
+              end if
+           end if
+           return
+     end subroutine stdlib_dlaed8
+
+     !> DLAED9: finds the roots of the secular equation, as defined by the
+     !> values in D, Z, and RHO, between KSTART and KSTOP.  It makes the
+     !> appropriate calls to DLAED4 and then stores the new matrix of
+     !> eigenvectors for use in calculating the next level of Z vectors.
+
+     pure subroutine stdlib_dlaed9( k, kstart, kstop, n, d, q, ldq, rho, dlamda, w,s, lds, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, kstart, kstop, ldq, lds, n
+           real(dp), intent(in) :: rho
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), q(ldq,*), s(lds,*)
+           real(dp), intent(inout) :: dlamda(*), w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( kstart<1 .or. kstart>max( 1, k ) ) then
+              info = -2
+           else if( max( 1, kstop )<kstart .or. kstop>max( 1, k ) )then
+              info = -3
+           else if( n<k ) then
+              info = -4
+           else if( ldq<max( 1, k ) ) then
+              info = -7
+           else if( lds<max( 1, k ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED9', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, n
+              dlamda( i ) = stdlib_dlamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = kstart, kstop
+              call stdlib_dlaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 .or. k==2 ) then
+              do i = 1, k
+                 do j = 1, k
+                    s( j, i ) = q( j, i )
+                 end do
+              end do
+              go to 120
+           end if
+           ! compute updated w.
+           call stdlib_dcopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_dcopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i, 1 ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 q( i, j ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_dnrm2( k, q( 1, j ), 1 )
+              do i = 1, k
+                 s( i, j ) = q( i, j ) / temp
+              end do
+           end do
+           120 continue
+           return
+     end subroutine stdlib_dlaed9
+
+     !> DLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue (WR,WI) of a real upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_dlaein( rightv, noinit, n, h, ldh, wr, wi, vr, vi, b,ldb, work, eps3, &
+               smlnum, bignum, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(dp), intent(in) :: bignum, eps3, smlnum, wi, wr
+           ! Array Arguments 
+           real(dp), intent(out) :: b(ldb,*), work(*)
+           real(dp), intent(in) :: h(ldh,*)
+           real(dp), intent(inout) :: vi(*), vr(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: tenth = 1.0e-1_dp
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, i1, i2, i3, ierr, its, j
+           real(dp) :: absbii, absbjj, ei, ej, growto, norm, nrmsml, rec, rootn, scale, temp, &
+                     vcrit, vmax, vnorm, w, w1, x, xi, xr, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=dp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - (wr,wi)*i (except that the subdiagonal elements and
+           ! the imaginary parts of the diagonal elements are not stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - wr
+           end do
+           if( wi==zero ) then
+              ! real eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 vnorm = stdlib_dnrm2( n, vr, 1 )
+                 call stdlib_dscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), vr,1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do i = 1, n - 1
+                    ei = h( i+1, i )
+                    if( abs( b( i, i ) )<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       x = b( i, i ) / ei
+                       b( i, i ) = ei
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( i, i )==zero )b( i, i ) = eps3
+                       x = ei / b( i, i )
+                       if( x/=zero ) then
+                          do j = i + 1, n
+                             b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( n, n )==zero )b( n, n ) = eps3
+                 trans = 'N'
+              else
+                 ! ul decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    if( abs( b( j, j ) )<abs( ej ) ) then
+                       ! interchange columns and eliminate.
+                       x = b( j, j ) / ej
+                       b( j, j ) = ej
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( j, j )==zero )b( j, j ) = eps3
+                       x = ej / b( j, j )
+                       if( x/=zero ) then
+                          do i = 1, j - 1
+                             b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( 1, 1 )==zero )b( 1, 1 ) = eps3
+                 trans = 'T'
+              end if
+              normin = 'N'
+              do its = 1, n
+                 ! solve u*x = scale*v for a right eigenvector
+                   ! or u**t*x = scale*v for a left eigenvector,
+                 ! overwriting x on v.
+                 call stdlib_dlatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb,vr, scale, work,&
+                            ierr )
+                 normin = 'Y'
+                 ! test for sufficient growth in the norm of v.
+                 vnorm = stdlib_dasum( n, vr, 1 )
+                 if( vnorm>=growto*scale )go to 120
+                 ! choose new orthogonal starting vector and try again.
+                 temp = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 do i = 2, n
+                    vr( i ) = temp
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do
+              ! failure to find eigenvector in n iterations.
+              info = 1
+              120 continue
+              ! normalize eigenvector.
+              i = stdlib_idamax( n, vr, 1 )
+              call stdlib_dscal( n, one / abs( vr( i ) ), vr, 1 )
+           else
+              ! complex eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                    vi( i ) = zero
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 norm = stdlib_dlapy2( stdlib_dnrm2( n, vr, 1 ), stdlib_dnrm2( n, vi, 1 ) )
+                           
+                 rec = ( eps3*rootn ) / max( norm, nrmsml )
+                 call stdlib_dscal( n, rec, vr, 1 )
+                 call stdlib_dscal( n, rec, vi, 1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( 2, 1 ) = -wi
+                 do i = 2, n
+                    b( i+1, 1 ) = zero
+                 end do
+                 loop_170: do i = 1, n - 1
+                    absbii = stdlib_dlapy2( b( i, i ), b( i+1, i ) )
+                    ei = h( i+1, i )
+                    if( absbii<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       xr = b( i, i ) / ei
+                       xi = b( i+1, i ) / ei
+                       b( i, i ) = ei
+                       b( i+1, i ) = zero
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - xr*temp
+                          b( j+1, i+1 ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( i+2, i ) = -wi
+                       b( i+1, i+1 ) = b( i+1, i+1 ) - xi*wi
+                       b( i+2, i+1 ) = b( i+2, i+1 ) + xr*wi
+                    else
+                       ! eliminate without interchanging rows.
+                       if( absbii==zero ) then
+                          b( i, i ) = eps3
+                          b( i+1, i ) = zero
+                          absbii = eps3
+                       end if
+                       ei = ( ei / absbii ) / absbii
+                       xr = b( i, i )*ei
+                       xi = -b( i+1, i )*ei
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j+1, i+1 ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( i+2, i+1 ) = b( i+2, i+1 ) - wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of i-th row.
+                    work( i ) = stdlib_dasum( n-i, b( i, i+1 ), ldb ) +stdlib_dasum( n-i, b( i+2, &
+                              i ), 1 )
+                 end do loop_170
+                 if( b( n, n )==zero .and. b( n+1, n )==zero )b( n, n ) = eps3
+                 work( n ) = zero
+                 i1 = n
+                 i2 = 1
+                 i3 = -1
+              else
+                 ! ul decomposition with partial pivoting of conjg(b),
+                 ! replacing zero pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( n+1, n ) = wi
+                 do j = 1, n - 1
+                    b( n+1, j ) = zero
+                 end do
+                 loop_210: do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    absbjj = stdlib_dlapy2( b( j, j ), b( j+1, j ) )
+                    if( absbjj<abs( ej ) ) then
+                       ! interchange columns and eliminate
+                       xr = b( j, j ) / ej
+                       xi = b( j+1, j ) / ej
+                       b( j, j ) = ej
+                       b( j+1, j ) = zero
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - xr*temp
+                          b( j, i ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( j+1, j-1 ) = wi
+                       b( j-1, j-1 ) = b( j-1, j-1 ) + xi*wi
+                       b( j, j-1 ) = b( j, j-1 ) - xr*wi
+                    else
+                       ! eliminate without interchange.
+                       if( absbjj==zero ) then
+                          b( j, j ) = eps3
+                          b( j+1, j ) = zero
+                          absbjj = eps3
+                       end if
+                       ej = ( ej / absbjj ) / absbjj
+                       xr = b( j, j )*ej
+                       xi = -b( j+1, j )*ej
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j, i ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( j, j-1 ) = b( j, j-1 ) + wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of j-th column.
+                    work( j ) = stdlib_dasum( j-1, b( 1, j ), 1 ) +stdlib_dasum( j-1, b( j+1, 1 ),&
+                               ldb )
+                 end do loop_210
+                 if( b( 1, 1 )==zero .and. b( 2, 1 )==zero )b( 1, 1 ) = eps3
+                 work( 1 ) = zero
+                 i1 = 1
+                 i2 = n
+                 i3 = 1
+              end if
+              loop_270: do its = 1, n
+                 scale = one
+                 vmax = one
+                 vcrit = bignum
+                 ! solve u*(xr,xi) = scale*(vr,vi) for a right eigenvector,
+                   ! or u**t*(xr,xi) = scale*(vr,vi) for a left eigenvector,
+                 ! overwriting (xr,xi) on (vr,vi).
+                 loop_250: do i = i1, i2, i3
+                    if( work( i )>vcrit ) then
+                       rec = one / vmax
+                       call stdlib_dscal( n, rec, vr, 1 )
+                       call stdlib_dscal( n, rec, vi, 1 )
+                       scale = scale*rec
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                    xr = vr( i )
+                    xi = vi( i )
+                    if( rightv ) then
+                       do j = i + 1, n
+                          xr = xr - b( i, j )*vr( j ) + b( j+1, i )*vi( j )
+                          xi = xi - b( i, j )*vi( j ) - b( j+1, i )*vr( j )
+                       end do
+                    else
+                       do j = 1, i - 1
+                          xr = xr - b( j, i )*vr( j ) + b( i+1, j )*vi( j )
+                          xi = xi - b( j, i )*vi( j ) - b( i+1, j )*vr( j )
+                       end do
+                    end if
+                    w = abs( b( i, i ) ) + abs( b( i+1, i ) )
+                    if( w>smlnum ) then
+                       if( w<one ) then
+                          w1 = abs( xr ) + abs( xi )
+                          if( w1>w*bignum ) then
+                             rec = one / w1
+                             call stdlib_dscal( n, rec, vr, 1 )
+                             call stdlib_dscal( n, rec, vi, 1 )
+                             xr = vr( i )
+                             xi = vi( i )
+                             scale = scale*rec
+                             vmax = vmax*rec
+                          end if
+                       end if
+                       ! divide by diagonal element of b.
+                       call stdlib_dladiv( xr, xi, b( i, i ), b( i+1, i ), vr( i ),vi( i ) )
+                                 
+                       vmax = max( abs( vr( i ) )+abs( vi( i ) ), vmax )
+                       vcrit = bignum / vmax
+                    else
+                       do j = 1, n
+                          vr( j ) = zero
+                          vi( j ) = zero
+                       end do
+                       vr( i ) = one
+                       vi( i ) = one
+                       scale = zero
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                 end do loop_250
+                 ! test for sufficient growth in the norm of (vr,vi).
+                 vnorm = stdlib_dasum( n, vr, 1 ) + stdlib_dasum( n, vi, 1 )
+                 if( vnorm>=growto*scale )go to 280
+                 ! choose a new orthogonal starting vector and try again.
+                 y = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 vi( 1 ) = zero
+                 do i = 2, n
+                    vr( i ) = y
+                    vi( i ) = zero
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do loop_270
+              ! failure to find eigenvector in n iterations
+              info = 1
+              280 continue
+              ! normalize eigenvector.
+              vnorm = zero
+              do i = 1, n
+                 vnorm = max( vnorm, abs( vr( i ) )+abs( vi( i ) ) )
+              end do
+              call stdlib_dscal( n, one / vnorm, vr, 1 )
+              call stdlib_dscal( n, one / vnorm, vi, 1 )
+           end if
+           return
+     end subroutine stdlib_dlaein
+
+     !> DLAGV2: computes the Generalized Schur factorization of a real 2-by-2
+     !> matrix pencil (A,B) where B is upper triangular. This routine
+     !> computes orthogonal (rotation) matrices given by CSL, SNL and CSR,
+     !> SNR such that
+     !> 1) if the pencil (A,B) has two real eigenvalues (include 0/0 or 1/0
+     !> types), then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [  0  a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11 b12 ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ],
+     !> 2) if the pencil (A,B) has a pair of complex conjugate eigenvalues,
+     !> then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [ a21 a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11  0  ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ]
+     !> where b11 >= b22 > 0.
+
+     pure subroutine stdlib_dlagv2( a, lda, b, ldb, alphar, alphai, beta, csl, snl,csr, snr )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(dp), intent(out) :: csl, csr, snl, snr
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(2), alphar(2), beta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: anorm, ascale, bnorm, bscale, h1, h2, h3, qq, r, rr, safmin, scale1, &
+                     scale2, t, ulp, wi, wr1, wr2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           safmin = stdlib_dlamch( 'S' )
+           ulp = stdlib_dlamch( 'P' )
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a( 1, 1 ) = ascale*a( 1, 1 )
+           a( 1, 2 ) = ascale*a( 1, 2 )
+           a( 2, 1 ) = ascale*a( 2, 1 )
+           a( 2, 2 ) = ascale*a( 2, 2 )
+           ! scale b
+           bnorm = max( abs( b( 1, 1 ) ), abs( b( 1, 2 ) )+abs( b( 2, 2 ) ),safmin )
+           bscale = one / bnorm
+           b( 1, 1 ) = bscale*b( 1, 1 )
+           b( 1, 2 ) = bscale*b( 1, 2 )
+           b( 2, 2 ) = bscale*b( 2, 2 )
+           ! check if a can be deflated
+           if( abs( a( 2, 1 ) )<=ulp ) then
+              csl = one
+              snl = zero
+              csr = one
+              snr = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           ! check if b is singular
+           else if( abs( b( 1, 1 ) )<=ulp ) then
+              call stdlib_dlartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+              csr = one
+              snr = zero
+              call stdlib_drot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+              call stdlib_drot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+              a( 2, 1 ) = zero
+              b( 1, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           else if( abs( b( 2, 2 ) )<=ulp ) then
+              call stdlib_dlartg( a( 2, 2 ), a( 2, 1 ), csr, snr, t )
+              snr = -snr
+              call stdlib_drot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+              call stdlib_drot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+              csl = one
+              snl = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              b( 2, 2 ) = zero
+              wi = zero
+           else
+              ! b is nonsingular, first compute the eigenvalues of (a,b)
+              call stdlib_dlag2( a, lda, b, ldb, safmin, scale1, scale2, wr1, wr2,wi )
+              if( wi==zero ) then
+                 ! two real eigenvalues, compute s*a-w*b
+                 h1 = scale1*a( 1, 1 ) - wr1*b( 1, 1 )
+                 h2 = scale1*a( 1, 2 ) - wr1*b( 1, 2 )
+                 h3 = scale1*a( 2, 2 ) - wr1*b( 2, 2 )
+                 rr = stdlib_dlapy2( h1, h2 )
+                 qq = stdlib_dlapy2( scale1*a( 2, 1 ), h3 )
+                 if( rr>qq ) then
+                    ! find right rotation matrix to zero 1,1 element of
+                    ! (sa - wb)
+                    call stdlib_dlartg( h2, h1, csr, snr, t )
+                 else
+                    ! find right rotation matrix to zero 2,1 element of
+                    ! (sa - wb)
+                    call stdlib_dlartg( h3, scale1*a( 2, 1 ), csr, snr, t )
+                 end if
+                 snr = -snr
+                 call stdlib_drot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_drot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 ! compute inf norms of a and b
+                 h1 = max( abs( a( 1, 1 ) )+abs( a( 1, 2 ) ),abs( a( 2, 1 ) )+abs( a( 2, 2 ) ) )
+                           
+                 h2 = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 ) ) )
+                           
+                 if( ( scale1*h1 )>=abs( wr1 )*h2 ) then
+                    ! find left rotation matrix q to zero out b(2,1)
+                    call stdlib_dlartg( b( 1, 1 ), b( 2, 1 ), csl, snl, r )
+                 else
+                    ! find left rotation matrix q to zero out a(2,1)
+                    call stdlib_dlartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+                 end if
+                 call stdlib_drot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_drot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 a( 2, 1 ) = zero
+                 b( 2, 1 ) = zero
+              else
+                 ! a pair of complex conjugate eigenvalues
+                 ! first compute the svd of the matrix b
+                 call stdlib_dlasv2( b( 1, 1 ), b( 1, 2 ), b( 2, 2 ), r, t, snr,csr, snl, csl )
+                           
+                 ! form (a,b) := q(a,b)z**t where q is left rotation matrix and
+                 ! z is right rotation matrix computed from stdlib_dlasv2
+                 call stdlib_drot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_drot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 call stdlib_drot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_drot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 b( 2, 1 ) = zero
+                 b( 1, 2 ) = zero
+              end if
+           end if
+           ! unscaling
+           a( 1, 1 ) = anorm*a( 1, 1 )
+           a( 2, 1 ) = anorm*a( 2, 1 )
+           a( 1, 2 ) = anorm*a( 1, 2 )
+           a( 2, 2 ) = anorm*a( 2, 2 )
+           b( 1, 1 ) = bnorm*b( 1, 1 )
+           b( 2, 1 ) = bnorm*b( 2, 1 )
+           b( 1, 2 ) = bnorm*b( 1, 2 )
+           b( 2, 2 ) = bnorm*b( 2, 2 )
+           if( wi==zero ) then
+              alphar( 1 ) = a( 1, 1 )
+              alphar( 2 ) = a( 2, 2 )
+              alphai( 1 ) = zero
+              alphai( 2 ) = zero
+              beta( 1 ) = b( 1, 1 )
+              beta( 2 ) = b( 2, 2 )
+           else
+              alphar( 1 ) = anorm*wr1 / scale1 / bnorm
+              alphai( 1 ) = anorm*wi / scale1 / bnorm
+              alphar( 2 ) = alphar( 1 )
+              alphai( 2 ) = -alphai( 1 )
+              beta( 1 ) = one
+              beta( 2 ) = one
+           end if
+           return
+     end subroutine stdlib_dlagv2
+
+     !> DLAHR2: reduces the first NB columns of A real general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an orthogonal similarity transformation
+     !> Q**T * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*V**T, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by DGEHRD.
+
+     pure subroutine stdlib_dlahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**t
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-k, i-1, -one, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, one, a( k+1, i ), 1 )
+                 ! apply i - v * t**t * v**t to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**t * b1
+                 call stdlib_dcopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_dtrmv( 'LOWER', 'TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, t( 1, nb ),&
+                            1 )
+                 ! w := w + v2**t * b2
+                 call stdlib_dgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ),lda, a( k+i, i ), &
+                           1, one, t( 1, nb ), 1 )
+                 ! w := t**t * w
+                 call stdlib_dtrmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, nb ), 1 )
+                           
+                 ! b2 := b2 - v2*w
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -one,a( k+i, 1 ),lda, t( 1, nb )&
+                           , 1, one, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_dtrmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_daxpy( i-1, -one, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_dlarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = one
+              ! compute  y(k+1:n,i)
+              call stdlib_dgemv( 'NO TRANSPOSE', n-k, n-k-i+1,one, a( k+1, i+1 ),lda, a( k+i, i ),&
+                         1, zero, y( k+1, i ), 1 )
+              call stdlib_dgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ), lda,a( k+i, i ), 1, &
+                        zero, t( 1, i ), 1 )
+              call stdlib_dgemv( 'NO TRANSPOSE', n-k, i-1, -one,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        one, y( k+1, i ), 1 )
+              call stdlib_dscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_dscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_dtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_dlacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_dtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,one, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, one,a( 1, &
+                     2+nb ), lda, a( k+1+nb, 1 ), lda, one, y,ldy )
+           call stdlib_dtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,one, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_dlahr2
+
+     !> DLALN2: solves a system of the form  (ca A - w D ) X = s B
+     !> or (ca A**T - w D) X = s B   with possible scaling ("s") and
+     !> perturbation of A.  (A**T means A-transpose.)
+     !> A is an NA x NA real matrix, ca is a real scalar, D is an NA x NA
+     !> real diagonal matrix, w is a real or complex value, and X and B are
+     !> NA x 1 matrices -- real if w is real, complex if w is complex.  NA
+     !> may be 1 or 2.
+     !> If w is complex, X and B are represented as NA x 2 matrices,
+     !> the first column of each being the real part and the second
+     !> being the imaginary part.
+     !> "s" is a scaling factor (<= 1), computed by DLALN2, which is
+     !> so chosen that X can be computed without overflow.  X is further
+     !> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
+     !> than overflow.
+     !> If both singular values of (ca A - w D) are less than SMIN,
+     !> SMIN*identity will be used instead of (ca A - w D).  If only one
+     !> singular value is less than SMIN, one element of (ca A - w D) will be
+     !> perturbed enough to make the smallest singular value roughly SMIN.
+     !> If both singular values are at least SMIN, (ca A - w D) will not be
+     !> perturbed.  In any case, the perturbation will be at most some small
+     !> multiple of max( SMIN, ulp*norm(ca A - w D) ).  The singular values
+     !> are computed by infinity-norm approximations, and thus will only be
+     !> correct to a factor of 2 or so.
+     !> Note: all input quantities are assumed to be smaller than overflow
+     !> by a reasonable factor.  (See BIGNUM.)
+
+     pure subroutine stdlib_dlaln2( ltrans, na, nw, smin, ca, a, lda, d1, d2, b,ldb, wr, wi, x, &
+               ldx, scale, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltrans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, na, nw
+           real(dp), intent(in) :: ca, d1, d2, smin, wi, wr
+           real(dp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: icmax, j
+           real(dp) :: bbnd, bi1, bi2, bignum, bnorm, br1, br2, ci21, ci22, cmax, cnorm, cr21, &
+           cr22, csi, csr, li21, lr21, smini, smlnum, temp, u22abs, ui11, ui11r, ui12, ui12s, &
+                     ui22, ur11, ur11r, ur12, ur12s, ur22, xi1, xi2, xr1, xr2
+           ! Local Arrays 
+           logical(lk) :: rswap(4), zswap(4)
+           integer(ilp) :: ipivot(4,4)
+           real(dp) :: ci(2,2), civ(4), cr(2,2), crv(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Equivalences 
+           equivalence        ( ci( 1, 1 ), civ( 1 ) ),( cr( 1, 1 ), crv( 1 ) )
+           ! Data Statements 
+           zswap = [.false.,.false.,.true.,.true.]
+           rswap = [.false.,.true.,.false.,.true.]
+           ipivot = reshape([1,2,3,4,2,1,4,3,3,4,1,2,4,3,2,1],[4,4])
+           ! Executable Statements 
+           ! compute bignum
+           smlnum = two*stdlib_dlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           smini = max( smin, smlnum )
+           ! don't check for input errors
+           info = 0
+           ! standard initializations
+           scale = one
+           if( na==1 ) then
+              ! 1 x 1  (i.e., scalar) system   c x = b
+              if( nw==1 ) then
+                 ! real 1x1 system.
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 cnorm = abs( csr )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 x( 1, 1 ) = ( b( 1, 1 )*scale ) / csr
+                 xnorm = abs( x( 1, 1 ) )
+              else
+                 ! complex 1x1 system (w is complex)
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 csi = -wi*d1
+                 cnorm = abs( csr ) + abs( csi )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    csi = zero
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) ) + abs( b( 1, 2 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 call stdlib_dladiv( scale*b( 1, 1 ), scale*b( 1, 2 ), csr, csi,x( 1, 1 ), x( 1, &
+                           2 ) )
+                 xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+              end if
+           else
+              ! 2x2 system
+              ! compute the realpart of  c = ca a - w d  (or  ca a**t - w d,KIND=dp)
+              cr( 1, 1 ) = ca*a( 1, 1 ) - wr*d1
+              cr( 2, 2 ) = ca*a( 2, 2 ) - wr*d2
+              if( ltrans ) then
+                 cr( 1, 2 ) = ca*a( 2, 1 )
+                 cr( 2, 1 ) = ca*a( 1, 2 )
+              else
+                 cr( 2, 1 ) = ca*a( 2, 1 )
+                 cr( 1, 2 ) = ca*a( 1, 2 )
+              end if
+              if( nw==1 ) then
+                 ! real2x2 system  (w is real,KIND=dp)
+                 ! find the largest element in c
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )>cmax ) then
+                       cmax = abs( crv( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) ), abs( b( 2, 1 ) ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ur11r = one / ur11
+                 lr21 = ur11r*cr21
+                 ur22 = cr22 - ur12*lr21
+                 ! if smaller pivot < smini, use smini
+                 if( abs( ur22 )<smini ) then
+                    ur22 = smini
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br1 = b( 2, 1 )
+                    br2 = b( 1, 1 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                 end if
+                 br2 = br2 - lr21*br1
+                 bbnd = max( abs( br1*( ur22*ur11r ) ), abs( br2 ) )
+                 if( bbnd>one .and. abs( ur22 )<one ) then
+                    if( bbnd>=bignum*abs( ur22 ) )scale = one / bbnd
+                 end if
+                 xr2 = ( br2*scale ) / ur22
+                 xr1 = ( scale*br1 )*ur11r - xr2*( ur11r*ur12 )
+                 if( zswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                 end if
+                 xnorm = max( abs( xr1 ), abs( xr2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              else
+                 ! complex 2x2 system  (w is complex)
+                 ! find the largest element in c
+                 ci( 1, 1 ) = -wi*d1
+                 ci( 2, 1 ) = zero
+                 ci( 1, 2 ) = zero
+                 ci( 2, 2 ) = -wi*d2
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )+abs( civ( j ) )>cmax ) then
+                       cmax = abs( crv( j ) ) + abs( civ( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 )&
+                               ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    x( 1, 2 ) = temp*b( 1, 2 )
+                    x( 2, 2 ) = temp*b( 2, 2 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 ui11 = civ( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ci21 = civ( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 ui12 = civ( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ci22 = civ( ipivot( 4, icmax ) )
+                 if( icmax==1 .or. icmax==4 ) then
+                    ! code when off-diagonals of pivoted c are real
+                    if( abs( ur11 )>abs( ui11 ) ) then
+                       temp = ui11 / ur11
+                       ur11r = one / ( ur11*( one+temp**2 ) )
+                       ui11r = -temp*ur11r
+                    else
+                       temp = ur11 / ui11
+                       ui11r = -one / ( ui11*( one+temp**2 ) )
+                       ur11r = -temp*ui11r
+                    end if
+                    lr21 = cr21*ur11r
+                    li21 = cr21*ui11r
+                    ur12s = ur12*ur11r
+                    ui12s = ur12*ui11r
+                    ur22 = cr22 - ur12*lr21
+                    ui22 = ci22 - ur12*li21
+                 else
+                    ! code when diagonals of pivoted c are real
+                    ur11r = one / ur11
+                    ui11r = zero
+                    lr21 = cr21*ur11r
+                    li21 = ci21*ur11r
+                    ur12s = ur12*ur11r
+                    ui12s = ui12*ur11r
+                    ur22 = cr22 - ur12*lr21 + ui12*li21
+                    ui22 = -ur12*li21 - ui12*lr21
+                 end if
+                 u22abs = abs( ur22 ) + abs( ui22 )
+                 ! if smaller pivot < smini, use smini
+                 if( u22abs<smini ) then
+                    ur22 = smini
+                    ui22 = zero
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br2 = b( 1, 1 )
+                    br1 = b( 2, 1 )
+                    bi2 = b( 1, 2 )
+                    bi1 = b( 2, 2 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                    bi1 = b( 1, 2 )
+                    bi2 = b( 2, 2 )
+                 end if
+                 br2 = br2 - lr21*br1 + li21*bi1
+                 bi2 = bi2 - li21*br1 - lr21*bi1
+                 bbnd = max( ( abs( br1 )+abs( bi1 ) )*( u22abs*( abs( ur11r )+abs( ui11r ) ) ),&
+                           abs( br2 )+abs( bi2 ) )
+                 if( bbnd>one .and. u22abs<one ) then
+                    if( bbnd>=bignum*u22abs ) then
+                       scale = one / bbnd
+                       br1 = scale*br1
+                       bi1 = scale*bi1
+                       br2 = scale*br2
+                       bi2 = scale*bi2
+                    end if
+                 end if
+                 call stdlib_dladiv( br2, bi2, ur22, ui22, xr2, xi2 )
+                 xr1 = ur11r*br1 - ui11r*bi1 - ur12s*xr2 + ui12s*xi2
+                 xi1 = ui11r*br1 + ur11r*bi1 - ui12s*xr2 - ur12s*xi2
+                 if( zswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                    x( 1, 2 ) = xi2
+                    x( 2, 2 ) = xi1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                    x( 1, 2 ) = xi1
+                    x( 2, 2 ) = xi2
+                 end if
+                 xnorm = max( abs( xr1 )+abs( xi1 ), abs( xr2 )+abs( xi2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       x( 1, 2 ) = temp*x( 1, 2 )
+                       x( 2, 2 ) = temp*x( 2, 2 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dlaln2
+
+     !> DLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_dlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(out) :: bx(ldbx,*), work(*)
+           real(dp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, m, n, nlp1
+           real(dp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_drot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_dcopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_dcopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_dcopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_dscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_50: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       work( j ) = zero
+                    else
+                       work( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_dlamc3( poles( i, 2 ), dsigj &
+                                    )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_dlamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    work( 1 ) = negone
+                    temp = stdlib_dnrm2( k, work, 1 )
+                    call stdlib_dgemv( 'T', k, nrhs, one, bx, ldbx, work, 1, zero,b( j, 1 ), ldb )
+                              
+                    call stdlib_dlascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_50
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_dlacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_dcopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       work( j ) = zero
+                    else
+                       work( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_dlamc3( dsigj, -poles( i+1,2 ) )-difr( i, &
+                                    1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_dlamc3( dsigj, -poles( i,2 ) )-difl( i ) )&
+                                     /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    call stdlib_dgemv( 'T', k, nrhs, one, b, ldb, work, 1, zero,bx( j, 1 ), ldbx )
+                              
+                 end do
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_dcopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_drot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_dlacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb, bx( k+1, 1 &
+                        ),ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_dcopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_dcopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_dcopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_drot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_dlals0
+
+     !> DLAMSWLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (DLASWLQ)
+
+     pure subroutine stdlib_dlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), t(ldt,*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, ctr, lw
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_dgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_dtpmlqt('L','T',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_dtpmlqt('L','T',nb-k , n, k, 0,mb, a(1,i), lda,t(1, ctr*k+1),ldt, c(&
+                           1,1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_dgemlqt('L','T',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(nb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_dgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_dtpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_dtpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_dtpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1,ctr *k+1), ldt, &
+                           c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                  ctr = ctr - 1
+                  call stdlib_dtpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1,ctr*k+1), ldt, &
+                            c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_dgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ctr = 1
+              ii=n-kk+1
+              call stdlib_dgemlqt('R','T',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_dtpmlqt('R','T',m , nb-k, k, 0,mb, a(1,i), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_dtpmlqt('R','T',m , kk, k, 0,mb, a(1,ii), lda,t(1,ctr*k+1),ldt, c(1,1),&
+                          ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_dlamswlq
+
+     !> DLAMTSQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (DLATSQR)
+
+     pure subroutine stdlib_dlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), t(ldt,*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = mb * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_dgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_dtpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1,ctr*k+1),ldt , c(1,&
+                          1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_dtpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr*k+1),ldt, c(1,&
+                          1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_dgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_dgemqrt('L','T',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_dtpmqrt('L','T',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_dtpmqrt('L','T',kk , n, k, 0,nb, a(ii,1), lda,t(1,ctr * k + 1), ldt, c(&
+                         1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_dtpmqrt('R','T',m , kk, k, 0, nb, a(ii,1), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_dtpmqrt('R','T',m , mb-k, k, 0,nb, a(i,1), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_dgemqrt('R','T',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_dgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_dtpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_dtpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_dlamtsqr
+
+     !> DLANV2: computes the Schur factorization of a real 2-by-2 nonsymmetric
+     !> matrix in standard form:
+     !> [ A  B ] = [ CS -SN ] [ AA  BB ] [ CS  SN ]
+     !> [ C  D ]   [ SN  CS ] [ CC  DD ] [-SN  CS ]
+     !> where either
+     !> 1) CC = 0 so that AA and DD are real eigenvalues of the matrix, or
+     !> 2) AA = DD and BB*CC < 0, so that AA + or - sqrt(BB*CC) are complex
+     !> conjugate eigenvalues.
+
+     pure subroutine stdlib_dlanv2( a, b, c, d, rt1r, rt1i, rt2r, rt2i, cs, sn )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(inout) :: a, b, c, d
+           real(dp), intent(out) :: cs, rt1i, rt1r, rt2i, rt2r, sn
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: multpl = 4.0e+0_dp
+           
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, bcmax, bcmis, cc, cs1, dd, eps, p, sab, sac, scale, sigma, sn1, &
+                     tau, temp, z, safmin, safmn2, safmx2
+           integer(ilp) :: count
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           safmin = stdlib_dlamch( 'S' )
+           eps = stdlib_dlamch( 'P' )
+           safmn2 = stdlib_dlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_dlamch( 'B' ) ) / &
+                     two,KIND=ilp)
+           safmx2 = one / safmn2
+           if( c==zero ) then
+              cs = one
+              sn = zero
+           else if( b==zero ) then
+              ! swap rows and columns
+              cs = zero
+              sn = one
+              temp = d
+              d = a
+              a = temp
+              b = -c
+              c = zero
+           else if( ( a-d )==zero .and. sign( one, b )/=sign( one, c ) )then
+              cs = one
+              sn = zero
+           else
+              temp = a - d
+              p = half*temp
+              bcmax = max( abs( b ), abs( c ) )
+              bcmis = min( abs( b ), abs( c ) )*sign( one, b )*sign( one, c )
+              scale = max( abs( p ), bcmax )
+              z = ( p / scale )*p + ( bcmax / scale )*bcmis
+              ! if z is of the order of the machine accuracy, postpone the
+              ! decision on the nature of eigenvalues
+              if( z>=multpl*eps ) then
+                 ! real eigenvalues. compute a and d.
+                 z = p + sign( sqrt( scale )*sqrt( z ), p )
+                 a = d + z
+                 d = d - ( bcmax / z )*bcmis
+                 ! compute b and the rotation matrix
+                 tau = stdlib_dlapy2( c, z )
+                 cs = z / tau
+                 sn = c / tau
+                 b = b - c
+                 c = zero
+              else
+                 ! complex eigenvalues, or real(almost,KIND=dp) equal eigenvalues.
+                 ! make diagonal elements equal.
+                 count = 0
+                 sigma = b + c
+                 10 continue
+                 count = count + 1
+                 scale = max( abs(temp), abs(sigma) )
+                 if( scale>=safmx2 ) then
+                    sigma = sigma * safmn2
+                    temp = temp * safmn2
+                    if (count <= 20)goto 10
+                 end if
+                 if( scale<=safmn2 ) then
+                    sigma = sigma * safmx2
+                    temp = temp * safmx2
+                    if (count <= 20)goto 10
+                 end if
+                 p = half*temp
+                 tau = stdlib_dlapy2( sigma, temp )
+                 cs = sqrt( half*( one+abs( sigma ) / tau ) )
+                 sn = -( p / ( tau*cs ) )*sign( one, sigma )
+                 ! compute [ aa  bb ] = [ a  b ] [ cs -sn ]
+                         ! [ cc  dd ]   [ c  d ] [ sn  cs ]
+                 aa = a*cs + b*sn
+                 bb = -a*sn + b*cs
+                 cc = c*cs + d*sn
+                 dd = -c*sn + d*cs
+                 ! compute [ a  b ] = [ cs  sn ] [ aa  bb ]
+                         ! [ c  d ]   [-sn  cs ] [ cc  dd ]
+                 a = aa*cs + cc*sn
+                 b = bb*cs + dd*sn
+                 c = -aa*sn + cc*cs
+                 d = -bb*sn + dd*cs
+                 temp = half*( a+d )
+                 a = temp
+                 d = temp
+                 if( c/=zero ) then
+                    if( b/=zero ) then
+                       if( sign( one, b )==sign( one, c ) ) then
+                          ! real eigenvalues: reduce to upper triangular form
+                          sab = sqrt( abs( b ) )
+                          sac = sqrt( abs( c ) )
+                          p = sign( sab*sac, c )
+                          tau = one / sqrt( abs( b+c ) )
+                          a = temp + p
+                          d = temp - p
+                          b = b - c
+                          c = zero
+                          cs1 = sab*tau
+                          sn1 = sac*tau
+                          temp = cs*cs1 - sn*sn1
+                          sn = cs*sn1 + sn*cs1
+                          cs = temp
+                       end if
+                    else
+                       b = -c
+                       c = zero
+                       temp = cs
+                       cs = -sn
+                       sn = temp
+                    end if
+                 end if
+              end if
+           end if
+           ! store eigenvalues in (rt1r,rt1i) and (rt2r,rt2i).
+           rt1r = a
+           rt2r = d
+           if( c==zero ) then
+              rt1i = zero
+              rt2i = zero
+           else
+              rt1i = sqrt( abs( b ) )*sqrt( abs( c ) )
+              rt2i = -rt1i
+           end if
+           return
+     end subroutine stdlib_dlanv2
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_dlapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(dp), intent(out) :: ssmin
+           ! Array Arguments 
+           real(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: a11, a12, a22, c, ssmax, tau
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_dlarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = one
+           c = -tau*stdlib_ddot( n, x, incx, y, incy )
+           call stdlib_daxpy( n, c, x, incx, y, incy )
+           call stdlib_dlarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_dlas2( a11, a12, a22, ssmin, ssmax )
+           return
+     end subroutine stdlib_dlapll
+
+     !> DLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_dlaqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: a(lda,*), vn1(*), vn2(*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(dp) :: aii, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_dlamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_idamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_dswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_dlarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_dlarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**t to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = one
+                 call stdlib_dlarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,tau( i ), a( offpi, &
+                           i+1 ), lda, work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_dnrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_dlaqp2
+
+     !> DLAQPS: computes a step of QR factorization with column pivoting
+     !> of a real M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_dlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*), vn1(*), vn2(*)
+           real(dp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(dp) :: akk, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,nint,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_dlamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_idamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_dswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_dswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**t.
+              if( k>1 ) then
+                 call stdlib_dgemv( 'NO TRANSPOSE', m-rk+1, k-1, -one, a( rk, 1 ),lda, f( k, 1 ), &
+                           ldf, one, a( rk, k ), 1 )
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_dlarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_dlarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = one
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**t*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_dgemv( 'TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), lda, a( rk, &
+                           k ), 1, zero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = zero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**t
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_dgemv( 'TRANSPOSE', m-rk+1, k-1, -tau( k ), a( rk, 1 ),lda, a( rk, k &
+                           ), 1, zero, auxv( 1 ), 1 )
+                 call stdlib_dgemv( 'NO TRANSPOSE', n, k-1, one, f( 1, 1 ), ldf,auxv( 1 ), 1, one,&
+                            f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**t.
+              if( k<n ) then
+                 call stdlib_dgemv( 'NO TRANSPOSE', n-k, k, -one, f( k+1, 1 ), ldf,a( rk, 1 ), &
+                           lda, one, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=dp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**t.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-rk, n-kb, kb, -one,a( rk+1, 1 ), &
+                        lda, f( kb+1, 1 ), ldf, one,a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           40 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_dnrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_snrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_dlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 40
+           end if
+           return
+     end subroutine stdlib_dlaqps
+
+     !> DLAQR5:, called by DLAQR0, performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_dlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts,sr, si, h, ldh, &
+               iloz, ihiz, z, ldz, v, ldv, u,ldu, nv, wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), si(*), sr(*), z(ldz,*)
+           real(dp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(dp) :: alpha, beta, h11, h12, h21, h22, refsum, safmax, safmin, scl, smlnum, swap,&
+                      tst1, tst2, ulp
+           integer(ilp) :: i, i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, &
+                     krcol, m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,mod
+           ! Local Arrays 
+           real(dp) :: vt(3)
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== shuffle shifts into pairs of real shifts and pairs
+           ! .    of complex conjugate shifts assuming complex
+           ! .    conjugate shifts are already adjacent to one
+           ! .    another. ====
+           do i = 1, nshfts - 2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+              end if
+           end do
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by one.  the shuffle above
+           ! .    ensures that the dropped shift is real and that
+           ! .    the remaining shifts are paired. ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = zero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_dlaset( 'ALL', kdu, kdu, zero, one, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    one column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_dlaqr1( 2, h( k+1, k+1 ), ldh, sr( 2*m22-1 ),si( 2*m22-1 ), sr(&
+                                  2*m22 ), si( 2*m22 ),v( 1, m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_dlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_dlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = zero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = v( 1, m22 )*( h( k+1, j )+v( 2, m22 )*h( k+2, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop ) then
+                       if( h( k+1, k )/=zero ) then
+                          tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                          if( tst1==zero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                          end if
+                          if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) ) then
+                             h12 = max( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h21 = min( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                       then
+                                h( k+1, k ) = zero
+                             end if
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m22 )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m22 )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_dlaqr1( 3, h( ktop, ktop ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                 2*m ), si( 2*m ),v( 1, m ) )
+                       alpha = v( 1, m )
+                       call stdlib_dlarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually zero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*v( 2, m )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) - refsum*v( 3, m )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge one step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_dlarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=zero .or. h( k+3, k+1 )/=zero .or. h( k+3, k+2 )==zero ) &
+                                 then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = zero
+                          h( k+3, k ) = zero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new one. ====
+                          call stdlib_dlaqr1( 3, h( k+1, k+1 ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                    2*m ), si( 2*m ),vt )
+                          alpha = vt( 1 )
+                          call stdlib_dlarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = vt( 1 )*( h( k+1, k )+vt( 2 )*h( k+2, k ) )
+                          if( abs( h( k+2, k )-refsum*vt( 2 ) )+abs( refsum*vt( 3 ) )>ulp*( abs( &
+                                    h( k, k ) )+abs( h( k+1,k+1 ) )+abs( h( k+2, k+2 ) ) ) ) then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old one with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new one. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m )
+                       h( j, k+3 ) = h( j, k+3 ) - refsum*v( 3, m )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum = v( 1, m )*( h( k+1, k+1 )+v( 2, m )*h( k+2, k+1 )+v( 3, m )*h( k+3, &
+                              k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=zero ) then
+                       tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                       if( tst1==zero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                       end if
+                       if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h21 = min( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                    then
+                             h( k+1, k ) = zero
+                          end if
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = v( 1, m )*( h( k+1, j )+v( 2, m )*h( k+2, j )+v( 3, m )*h( k+3, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m )
+                          u( j, kms+3 ) = u( j, kms+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m )
+                          z( j, k+3 ) = z( j, k+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_dgemm( 'C', 'N', nu, jlen, nu, one, u( k1, k1 ),ldu, h( incol+k1, &
+                              jcol ), ldh, zero, wh,ldwh )
+                    call stdlib_dlacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_dgemm( 'N', 'N', jlen, nu, nu, one,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, zero, wv, ldwv )
+                    call stdlib_dlacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_dgemm( 'N', 'N', jlen, nu, nu, one,z( jrow, incol+k1 ), ldz, u(&
+                                  k1, k1 ),ldu, zero, wv, ldwv )
+                       call stdlib_dlacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_dlaqr5
+
+     !> DLAQTR: solves the real quasi-triangular system
+     !> op(T)*p = scale*c,               if LREAL = .TRUE.
+     !> or the complex quasi-triangular systems
+     !> op(T + iB)*(p+iq) = scale*(c+id),  if LREAL = .FALSE.
+     !> in real arithmetic, where T is upper quasi-triangular.
+     !> If LREAL = .FALSE., then the first diagonal block of T must be
+     !> 1 by 1, B is the specially structured matrix
+     !> B = [ b(1) b(2) ... b(n) ]
+     !> [       w            ]
+     !> [           w        ]
+     !> [              .     ]
+     !> [                 w  ]
+     !> op(A) = A or A**T, A**T denotes the transpose of
+     !> matrix A.
+     !> On input, X = [ c ].  On output, X = [ p ].
+     !> [ d ]                  [ q ]
+     !> This subroutine is designed for the condition number estimation
+     !> in routine DTRSNA.
+
+     subroutine stdlib_dlaqtr( ltran, lreal, n, t, ldt, b, w, scale, x, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: lreal, ltran
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldt, n
+           real(dp), intent(out) :: scale
+           real(dp), intent(in) :: w
+           ! Array Arguments 
+           real(dp), intent(in) :: b(*), t(ldt,*)
+           real(dp), intent(out) :: work(*)
+           real(dp), intent(inout) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, j1, j2, jnext, k, n1, n2
+           real(dp) :: bignum, eps, rec, scaloc, si, smin, sminw, smlnum, sr, tjj, tmp, xj, xmax, &
+                     xnorm, z
+           ! Local Arrays 
+           real(dp) :: d(2,2), v(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! do not test the input parameters for errors
+           notran = .not.ltran
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           xnorm = stdlib_dlange( 'M', n, n, t, ldt, d )
+           if( .not.lreal )xnorm = max( xnorm, abs( w ), stdlib_dlange( 'M', n, 1, b, n, d ) )
+                     
+           smin = max( smlnum, eps*xnorm )
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = stdlib_dasum( j-1, t( 1, j ), 1 )
+           end do
+           if( .not.lreal ) then
+              do i = 2, n
+                 work( i ) = work( i ) + abs( b( i ) )
+              end do
+           end if
+           n2 = 2*n
+           n1 = n
+           if( .not.lreal )n1 = n2
+           k = stdlib_idamax( n1, x, 1 )
+           xmax = abs( x( k ) )
+           scale = one
+           if( xmax>bignum ) then
+              scale = bignum / xmax
+              call stdlib_dscal( n1, scale, x, 1 )
+              xmax = bignum
+           end if
+           if( lreal ) then
+              if( notran ) then
+                 ! solve t*p = scale*c
+                 jnext = n
+                 loop_30: do j = n, 1, -1
+                    if( j>jnext )cycle loop_30
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! meet 1 by 1 diagonal block
+                       ! scale to avoid overflow when computing
+                           ! x(j) = b(j)/t(j,j)
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_30
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xj = abs( x( j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_daxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          k = stdlib_idamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       ! call 2 by 2 linear system solve, to take
+                       ! care of possible overflow by scaling factor.
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       call stdlib_dlaln2( .false., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_dscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       ! scale v(1,1) (= x(j1)) and/or v(2,1) (=x(j2))
+                       ! to avoid overflow in updating right-hand side.
+                       xj = max( abs( v( 1, 1 ) ), abs( v( 2, 1 ) ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update right-hand side
+                       if( j1>1 ) then
+                          call stdlib_daxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_daxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          k = stdlib_idamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    end if
+                 end do loop_30
+              else
+                 ! solve t**t*p = scale*c
+                 jnext = 1
+                 loop_40: do j = 1, n
+                    if( j<jnext )cycle loop_40
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xmax = max( xmax, abs( x( j1 ) ) )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side elements by inner product.
+                       xj = max( abs( x( j1 ) ), abs( x( j2 ) ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j2 ), work( j1 ) )>( bignum-xj )*rec ) then
+                             call stdlib_dscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_ddot( j1-1, t( 1, j2 ), 1, x,1 )
+                       call stdlib_dlaln2( .true., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d, &
+                                 2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_dscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       xmax = max( abs( x( j1 ) ), abs( x( j2 ) ), xmax )
+                    end if
+                 end do loop_40
+              end if
+           else
+              sminw = max( eps*abs( w ), smin )
+              if( notran ) then
+                 ! solve (t + ib)*(p+iq) = c+id
+                 jnext = n
+                 loop_70: do j = n, 1, -1
+                    if( j>jnext )cycle loop_70
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in division
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_70
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_dladiv( x( j1 ), x( n+j1 ), tmp, z, sr, si )
+                       x( j1 ) = sr
+                       x( n+j1 ) = si
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_daxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_daxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( xmax, abs( x( k ) )+abs( x( k+n ) ) )
+                          end do
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       d( 1, 2 ) = x( n+j1 )
+                       d( 2, 2 ) = x( n+j2 )
+                       call stdlib_dlaln2( .false., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, &
+                                 d, 2, zero, -w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_dscal( 2*n, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       ! scale x(j1), .... to avoid overflow in
+                       ! updating right hand side.
+                       xj = max( abs( v( 1, 1 ) )+abs( v( 1, 2 ) ),abs( v( 2, 1 ) )+abs( v( 2, 2 )&
+                                  ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update the right-hand side.
+                       if( j1>1 ) then
+                          call stdlib_daxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_daxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          call stdlib_daxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          call stdlib_daxpy( j1-1, -x( n+j2 ), t( 1, j2 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 ) +b( j2 )*x( n+j2 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 ) -b( j2 )*x( j2 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( abs( x( k ) )+abs( x( k+n ) ),xmax )
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! solve (t + ib)**t*(p+iq) = c+id
+                 jnext = 1
+                 loop_80: do j = 1, n
+                    if( j<jnext )cycle loop_80
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       x( n+j1 ) = x( n+j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       if( j1>1 ) then
+                          x( j1 ) = x( j1 ) - b( j1 )*x( n+1 )
+                          x( n+j1 ) = x( n+j1 ) + b( j1 )*x( 1 )
+                       end if
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       ! scale if necessary to avoid overflow in
+                       ! complex division
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_dladiv( x( j1 ), x( n+j1 ), tmp, -z, sr, si )
+                       x( j1 ) = sr
+                       x( j1+n ) = si
+                       xmax = max( abs( x( j1 ) )+abs( x( j1+n ) ), xmax )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) ) )
+                                 
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xj ) / xmax ) then
+                             call stdlib_dscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_ddot( j1-1, t( 1, j2 ), 1, x,1 )
+                       d( 1, 2 ) = x( n+j1 ) - stdlib_ddot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 2, 2 ) = x( n+j2 ) - stdlib_ddot( j1-1, t( 1, j2 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 1, 1 ) = d( 1, 1 ) - b( j1 )*x( n+1 )
+                       d( 2, 1 ) = d( 2, 1 ) - b( j2 )*x( n+1 )
+                       d( 1, 2 ) = d( 1, 2 ) + b( j1 )*x( 1 )
+                       d( 2, 2 ) = d( 2, 2 ) + b( j2 )*x( 1 )
+                       call stdlib_dlaln2( .true., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_dscal( n2, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       xmax = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) )&
+                                 , xmax )
+                    end if
+                 end do loop_80
+              end if
+           end if
+           return
+     end subroutine stdlib_dlaqtr
+
+     !> DLASD3: finds all the square roots of the roots of the secular
+     !> equation, as defined by the values in D and Z.  It makes the
+     !> appropriate calls to DLASD4 and then updates the singular
+     !> vectors by matrix multiplication.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> DLASD3 is called from DLASD1.
+
+     pure subroutine stdlib_dlasd3( nl, nr, sqre, k, d, q, ldq, dsigma, u, ldu, u2,ldu2, vt, ldvt,&
+                vt2, ldvt2, idxc, ctot, z,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), idxc(*)
+           real(dp), intent(out) :: d(*), q(ldq,*), u(ldu,*), vt(ldvt,*)
+           real(dp), intent(inout) :: dsigma(*), vt2(ldvt2,*), z(*)
+           real(dp), intent(in) :: u2(ldu2,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ctemp, i, j, jc, ktemp, m, n, nlp1, nlp2, nrp1
+           real(dp) :: rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( ( k<1 ) .or. ( k>n ) ) then
+              info = -4
+           else if( ldq<k ) then
+              info = -7
+           else if( ldu<n ) then
+              info = -10
+           else if( ldu2<n ) then
+              info = -12
+           else if( ldvt<m ) then
+              info = -14
+           else if( ldvt2<m ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              call stdlib_dcopy( m, vt2( 1, 1 ), ldvt2, vt( 1, 1 ), ldvt )
+              if( z( 1 )>zero ) then
+                 call stdlib_dcopy( n, u2( 1, 1 ), 1, u( 1, 1 ), 1 )
+              else
+                 do i = 1, n
+                    u( i, 1 ) = -u2( i, 1 )
+                 end do
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dsigma(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_dlamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! keep a copy of z.
+           call stdlib_dcopy( k, z, 1, q, 1 )
+           ! normalize z.
+           rho = stdlib_dnrm2( k, z, 1 )
+           call stdlib_dlascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! find the new singular values.
+           do j = 1, k
+              call stdlib_dlasd4( k, j, dsigma, z, u( 1, j ), rho, d( j ),vt( 1, j ), info )
+                        
+              ! if the zero finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = u( i, k )*vt( i, k )
+              do j = 1, i - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j ) ) /( dsigma( i &
+                           )+dsigma( j ) ) )
+              end do
+              do j = i, k - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j+1 ) ) /( dsigma( &
+                           i )+dsigma( j+1 ) ) )
+              end do
+              z( i ) = sign( sqrt( abs( z( i ) ) ), q( i, 1 ) )
+           end do
+           ! compute left singular vectors of the modified diagonal matrix,
+           ! and store related information for the right singular vectors.
+           do i = 1, k
+              vt( 1, i ) = z( 1 ) / u( 1, i ) / vt( 1, i )
+              u( 1, i ) = negone
+              do j = 2, k
+                 vt( j, i ) = z( j ) / u( j, i ) / vt( j, i )
+                 u( j, i ) = dsigma( j )*vt( j, i )
+              end do
+              temp = stdlib_dnrm2( k, u( 1, i ), 1 )
+              q( 1, i ) = u( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( j, i ) = u( jc, i ) / temp
+              end do
+           end do
+           ! update the left singular vector matrix.
+           if( k==2 ) then
+              call stdlib_dgemm( 'N', 'N', n, k, k, one, u2, ldu2, q, ldq, zero, u,ldu )
+              go to 100
+           end if
+           if( ctot( 1 )>0 ) then
+              call stdlib_dgemm( 'N', 'N', nl, k, ctot( 1 ), one, u2( 1, 2 ), ldu2,q( 2, 1 ), ldq,&
+                         zero, u( 1, 1 ), ldu )
+              if( ctot( 3 )>0 ) then
+                 ktemp = 2 + ctot( 1 ) + ctot( 2 )
+                 call stdlib_dgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( &
+                           ktemp, 1 ), ldq, one, u( 1, 1 ), ldu )
+              end if
+           else if( ctot( 3 )>0 ) then
+              ktemp = 2 + ctot( 1 ) + ctot( 2 )
+              call stdlib_dgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( ktemp, &
+                        1 ), ldq, zero, u( 1, 1 ), ldu )
+           else
+              call stdlib_dlacpy( 'F', nl, k, u2, ldu2, u, ldu )
+           end if
+           call stdlib_dcopy( k, q( 1, 1 ), ldq, u( nlp1, 1 ), ldu )
+           ktemp = 2 + ctot( 1 )
+           ctemp = ctot( 2 ) + ctot( 3 )
+           call stdlib_dgemm( 'N', 'N', nr, k, ctemp, one, u2( nlp2, ktemp ), ldu2,q( ktemp, 1 ), &
+                     ldq, zero, u( nlp2, 1 ), ldu )
+           ! generate the right singular vectors.
+           100 continue
+           do i = 1, k
+              temp = stdlib_dnrm2( k, vt( 1, i ), 1 )
+              q( i, 1 ) = vt( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( i, j ) = vt( jc, i ) / temp
+              end do
+           end do
+           ! update the right singular vector matrix.
+           if( k==2 ) then
+              call stdlib_dgemm( 'N', 'N', k, m, k, one, q, ldq, vt2, ldvt2, zero,vt, ldvt )
+                        
+              return
+           end if
+           ktemp = 1 + ctot( 1 )
+           call stdlib_dgemm( 'N', 'N', k, nlp1, ktemp, one, q( 1, 1 ), ldq,vt2( 1, 1 ), ldvt2, &
+                     zero, vt( 1, 1 ), ldvt )
+           ktemp = 2 + ctot( 1 ) + ctot( 2 )
+           if( ktemp<=ldvt2 )call stdlib_dgemm( 'N', 'N', k, nlp1, ctot( 3 ), one, q( 1, ktemp ),&
+                     ldq, vt2( ktemp, 1 ), ldvt2, one, vt( 1, 1 ),ldvt )
+           ktemp = ctot( 1 ) + 1
+           nrp1 = nr + sqre
+           if( ktemp>1 ) then
+              do i = 1, k
+                 q( i, ktemp ) = q( i, 1 )
+              end do
+              do i = nlp2, m
+                 vt2( ktemp, i ) = vt2( 1, i )
+              end do
+           end if
+           ctemp = 1 + ctot( 2 ) + ctot( 3 )
+           call stdlib_dgemm( 'N', 'N', k, nrp1, ctemp, one, q( 1, ktemp ), ldq,vt2( ktemp, nlp2 )&
+                     , ldvt2, zero, vt( 1, nlp2 ), ldvt )
+           return
+     end subroutine stdlib_dlasd3
+
+     !> DLASD6: computes the SVD of an updated upper bidiagonal matrix B
+     !> obtained by merging two smaller ones by appending a row. This
+     !> routine is used only for the problem which requires all singular
+     !> values and optionally singular vector matrices in factored form.
+     !> B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+     !> A related subroutine, DLASD1, handles the case in which all singular
+     !> values and singular vectors of the bidiagonal matrix are desired.
+     !> DLASD6 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The singular values of B can be computed using D1, D2, the first
+     !> components of all the right singular vectors of the lower block, and
+     !> the last components of all the right singular vectors of the upper
+     !> block. These components are stored and updated in VF and VL,
+     !> respectively, in DLASD6. Hence U and VT are not explicitly
+     !> referenced.
+     !> The singular values are stored in D. The algorithm consists of two
+     !> stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or if there is a zero
+     !> in the Z vector. For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one. This stage is
+     !> performed by the routine DLASD7.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the roots of the
+     !> secular equation via the routine DLASD4 (as called by DLASD8).
+     !> This routine also updates VF and VL and computes the distances
+     !> between the updated singular values and the old singular
+     !> values.
+     !> DLASD6 is called from DLASDA.
+
+     pure subroutine stdlib_dlasd6( icompq, nl, nr, sqre, d, vf, vl, alpha, beta,idxq, perm, &
+     givptr, givcol, ldgcol, givnum,ldgnum, poles, difl, difr, z, k, c, s, work,iwork, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(dp), intent(inout) :: alpha, beta
+           real(dp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), iwork(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(dp), intent(inout) :: d(*), vf(*), vl(*)
+           real(dp), intent(out) :: difl(*), difr(*), givnum(ldgnum,*), poles(ldgnum,*), work(*), &
+                     z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idx, idxc, idxp, isigma, ivfw, ivlw, iw, m, n, n1, n2
+           real(dp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -14
+           else if( ldgnum<n ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD6', -info )
+              return
+           end if
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_dlasd7 and stdlib_dlasd8.
+           isigma = 1
+           iw = isigma + n
+           ivfw = iw + m
+           ivlw = ivfw + m
+           idx = 1
+           idxc = idx + n
+           idxp = idxc + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! sort and deflate singular values.
+           call stdlib_dlasd7( icompq, nl, nr, sqre, k, d, z, work( iw ), vf,work( ivfw ), vl, &
+           work( ivlw ), alpha, beta,work( isigma ), iwork( idx ), iwork( idxp ), idxq,perm, &
+                     givptr, givcol, ldgcol, givnum, ldgnum, c, s,info )
+           ! solve secular equation, compute difl, difr, and update vf, vl.
+           call stdlib_dlasd8( icompq, k, d, z, vf, vl, difl, difr, ldgnum,work( isigma ), work( &
+                     iw ), info )
+           ! report the possible convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! save the poles if icompq = 1.
+           if( icompq==1 ) then
+              call stdlib_dcopy( k, d, 1, poles( 1, 1 ), 1 )
+              call stdlib_dcopy( k, work( isigma ), 1, poles( 1, 2 ), 1 )
+           end if
+           ! unscale.
+           call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_dlamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_dlasd6
+
+     !> DOPGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> DSPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_dopgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           real(dp), intent(in) :: ap(*), tau(*)
+           real(dp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DOPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_dsptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = zero
+              end do
+              q( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_dorg2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_dsptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = one
+              do i = 2, n
+                 q( i, 1 ) = zero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = zero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_dorg2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_dopgtr
+
+     !> DOPMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by DSPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_dopmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DOPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_dsptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i)
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 call stdlib_dlarf( side, mi, ni, ap( ii-i+1 ), 1, tau( i ), c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_dsptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 if( left ) then
+                    ! h(i) is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i)
+                 call stdlib_dlarf( side, mi, ni, ap( ii ), 1, tau( i ),c( ic, jc ), ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dopmtr
+
+     !> DORBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines DORBDB2, DORBDB3, and DORBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           real(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( x21(i,i), x11(i,i) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = one
+              x21(i,i) = one
+              call stdlib_dlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+              call stdlib_dlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+              if( i < q ) then
+                 call stdlib_drot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c, s )
+                 call stdlib_dlarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = x21(i,i+1)
+                 x21(i,i+1) = one
+                 call stdlib_dlarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_dlarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 c = sqrt( stdlib_dnrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_dnrm2( m-p-i, x21(i+1,&
+                           i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_dorbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_dorbdb1
+
+     !> DORBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines DORBDB1, DORBDB3, and DORBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           real(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_drot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c, s )
+              end if
+              call stdlib_dlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = x11(i,i)
+              x11(i,i) = one
+              call stdlib_dlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_dlarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              s = sqrt( stdlib_dnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_dnrm2( m-p-i+1, x21(i,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_dorbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_dscal( p-i, negone, x11(i+1,i), 1 )
+              call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_dlarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( x11(i+1,i), x21(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = one
+                 call stdlib_dlarf( 'L', p-i, q-i, x11(i+1,i), 1, taup1(i),x11(i+1,i+1), ldx11, &
+                           work(ilarf) )
+              end if
+              x21(i,i) = one
+              call stdlib_dlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_dlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = one
+              call stdlib_dlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_dorbdb2
+
+     !> DORBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines DORBDB1, DORBDB2, and DORBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           real(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_drot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c, s )
+              end if
+              call stdlib_dlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = x21(i,i)
+              x21(i,i) = one
+              call stdlib_dlarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_dlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              c = sqrt( stdlib_dnrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_dnrm2( m-p-i, x21(i+1,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_dorbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_dlarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( x21(i+1,i), x11(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = one
+                 call stdlib_dlarf( 'L', m-p-i, q-i, x21(i+1,i), 1, taup2(i),x21(i+1,i+1), ldx21, &
+                           work(ilarf) )
+              end if
+              x11(i,i) = one
+              call stdlib_dlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_dlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = one
+              call stdlib_dlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_dorbdb3
+
+     !> DORBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines DORBDB1, DORBDB2, and DORBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           real(dp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = zero
+                 end do
+                 call stdlib_dorbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_dscal( p, negone, phantom(1), 1 )
+                 call stdlib_dlarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_dlarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( phantom(1), phantom(p+1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = one
+                 phantom(p+1) = one
+                 call stdlib_dlarf( 'L', p, q, phantom(1), 1, taup1(1), x11, ldx11,work(ilarf) )
+                           
+                 call stdlib_dlarf( 'L', m-p, q, phantom(p+1), 1, taup2(1), x21,ldx21, work(ilarf)&
+                            )
+              else
+                 call stdlib_dorbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_dscal( p-i+1, negone, x11(i,i-1), 1 )
+                 call stdlib_dlarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_dlarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( x11(i,i-1), x21(i,i-1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = one
+                 x21(i,i-1) = one
+                 call stdlib_dlarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1, taup1(i),x11(i,i), ldx11, &
+                           work(ilarf) )
+                 call stdlib_dlarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1, taup2(i),x21(i,i), ldx21, &
+                           work(ilarf) )
+              end if
+              call stdlib_drot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_dlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = x21(i,i)
+              x21(i,i) = one
+              call stdlib_dlarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_dlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_dnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_dnrm2( m-p-i, x21(i+1,i),&
+                            1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_dlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = one
+              call stdlib_dlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_dlarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_dlarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = one
+              call stdlib_dlarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_dorbdb4
+
+     !> DORCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_dorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
+        ! -- lapack computational routine (3.5.0_dp) --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           ! Array Arguments 
+           real(dp), intent(out) :: theta(*)
+           real(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           real(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(dp) :: dum1(1), dum2(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-------------------------------------------------------|
+           ! | lworkopt (1)                                          |
+           ! |-------------------------------------------------------|
+           ! | phi (max(1,r-1))                                      |
+           ! |-------------------------------------------------------|
+           ! | taup1 (max(1,p))                        | b11d (r)    |
+           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
+           ! | tauq1 (max(1,q))                        | b12d (r)    |
+           ! |-----------------------------------------| b12e (r-1)  |
+           ! | stdlib_dorbdb work | stdlib_dorgqr work | stdlib_dorglq work | b21d (r)    |
+           ! |             |             |             | b21e (r-1)  |
+           ! |             |             |             | b22d (r)    |
+           ! |             |             |             | b22e (r-1)  |
+           ! |             |             |             | stdlib_dbbcsd work |
+           ! |-------------------------------------------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = iphi + max( 1, r-1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, work,-1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_dorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_dorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_dorglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_dbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
+                 ldu1, u2, ldu2, v1t, ldv1t,dum2, 1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
+                            work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_dorgqr( p-1, p-1, p-1, u1(2,2), ldu1, dum1,work(1), -1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_dorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_dorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_dbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
+                 ldv1t, dum2, 1, u1, ldu1,u2, ldu2, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1, &
+                           work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_dorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_dorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,dum1, work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_dorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_dbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
+                 dum2, 1, v1t, ldv1t, u2,ldu2, u1, ldu1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,&
+                           dum1, work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else
+                 call stdlib_dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,dum1, work(1), -1, childinfo )
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_dorgqr( p, p, m-q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_dorgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_dorglq( q, q, q, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_dbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
+                 ldu2, u1, ldu1, dum2,1, v1t, ldv1t, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
+                            work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              end if
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
+                        1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
+                        1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_dorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_dlacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_dorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_dbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place zero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_dorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = one
+                 do j = 2, p
+                    u1(1,j) = zero
+                    u1(j,1) = zero
+                 end do
+                 call stdlib_dlacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_dorgqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_dlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_dorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_dlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_dorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_dbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
+              ldv1t, dum2, 1, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
+              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
+                        )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_dlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_dorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_dlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_dorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = one
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                    u2(j,1) = zero
+                 end do
+                 call stdlib_dlacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_dorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_dlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_dorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_dbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
+              dum2, 1, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_dlapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_dlapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_dorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
+                        
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_dcopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_dcopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = zero
+                 end do
+                 call stdlib_dlacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_dorgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                 end do
+                 call stdlib_dlacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_dorgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_dlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_dlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_dlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_dorglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_dbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
+              u2, ldu2, u1, ldu1, dum2,1, v1t, ldv1t, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_dlapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_dlapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dorcsd2by1
+
+     !> DORGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> DSYTRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_dorgtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 nb = stdlib_ilaenv( 1, 'DORGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_dsytrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = zero
+              end do
+              a( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_dorgql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_dsytrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = zero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = one
+              do i = 2, n
+                 a( i, 1 ) = zero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_dorgqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dorgtr
+
+     !> DORGTSQR: generates an M-by-N real matrix Q_out with orthonormal columns,
+     !> which are the first N columns of a product of real orthogonal
+     !> matrices of order M which are returned by DLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for DLATSQR.
+
+     pure subroutine stdlib_dorgtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: t(ldt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_dlamtsqr. see the documentation for stdlib_dlamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_dlamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_dlamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_dlamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_dlaset( 'F', m, n, zero, one, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_dlamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_dcopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = real( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_dorgtsqr
+
+     !> DORMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by DSYTRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_dormtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_dsytrd with uplo = 'u'
+              call stdlib_dormql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_dsytrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_dormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormtr
+
+     !> DPBTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_dpbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           real(dp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'DPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_dpbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_dpotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_dtrsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i2, one,&
+                                     ab( kd+1, i ),ldab-1, ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_dsyrk( 'UPPER', 'TRANSPOSE', i2, ib, -one,ab( kd+1-ib, i+ib &
+                                    ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_dtrsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i3, one,&
+                                     ab( kd+1, i ),ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', i2, i3,ib, -&
+                          one, ab( kd+1-ib, i+ib ),ldab-1, work, ldwork, one,ab( 1+ib, i+kd ), &
+                                    ldab-1 )
+                          ! update a33
+                          call stdlib_dsyrk( 'UPPER', 'TRANSPOSE', i3, ib, -one,work, ldwork, one,&
+                                     ab( kd+1, i+kd ),ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_dpotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_dtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i2, ib, &
+                                    one, ab( 1, i ),ldab-1, ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_dsyrk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_dtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i3, ib, &
+                                    one, ab( 1, i ),ldab-1, work, ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', i3, i2,ib, -&
+                          one, work, ldwork,ab( 1+ib, i ), ldab-1, one,ab( 1+kd-ib, i+ib ), ldab-&
+                                    1 )
+                          ! update a33
+                          call stdlib_dsyrk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_dpbtrf
+
+     !> DPFTRI: computes the inverse of a (real) symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPFTRF.
+
+     pure subroutine stdlib_dpftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_dtftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_dlauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_dsyrk( 'L', 'T', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_dtrmm( 'L', 'U', 'N', 'N', n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_dlauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_dlauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_dsyrk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_dtrmm( 'R', 'U', 'T', 'N', n1, n2, one, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_dlauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_dlauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_dsyrk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_dtrmm( 'R', 'L', 'N', 'N', n1, n2, one, a( 1 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_dlauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_dlauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_dsyrk( 'U', 'T', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_dtrmm( 'L', 'L', 'T', 'N', n2, n1, one, a( n1*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_dlauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_dlauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_dsyrk( 'L', 'T', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_dtrmm( 'L', 'U', 'N', 'N', k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_dlauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_dlauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_dsyrk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_dtrmm( 'R', 'U', 'T', 'N', k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_dlauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_dlauum( 'U', k, a( k ), k, info )
+                    call stdlib_dsyrk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_dtrmm( 'R', 'L', 'N', 'N', k, k, one, a( 0 ), k,a( k*( k+1 ) ), k &
+                              )
+                    call stdlib_dlauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_dlauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_dsyrk( 'U', 'T', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_dtrmm( 'L', 'L', 'T', 'N', k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_dlauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dpftri
+
+     !> DPOTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dpotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_dpotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**t*u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_dsyrk( 'UPPER', 'TRANSPOSE', jb, j-1, -one,a( 1, j ), lda, one, a(&
+                               j, j ), lda )
+                    call stdlib_dpotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_dgemm( 'TRANSPOSE', 'NO TRANSPOSE', jb, n-j-jb+1,j-1, -one, a( &
+                                 1, j ), lda, a( 1, j+jb ),lda, one, a( j, j+jb ), lda )
+                       call stdlib_dtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',jb, n-j-jb+1, &
+                                 one, a( j, j ), lda,a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**t.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_dsyrk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_dpotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,j-1, -one, a( &
+                                 j+jb, 1 ), lda, a( j, 1 ),lda, one, a( j+jb, j ), lda )
+                       call stdlib_dtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n-j-jb+1, jb, &
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_dpotrf
+
+     !> DPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_dptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: b(ldb,*), d(*), df(*), e(*), ef(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*)
+           real(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: count, i, ix, j, nz
+           real(dp) :: bi, cx, dx, eps, ex, lstres, s, safe1, safe2, safmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_90: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( n==1 ) then
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 work( n+1 ) = bi - dx
+                 work( 1 ) = abs( bi ) + abs( dx )
+              else
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 ex = e( 1 )*x( 2, j )
+                 work( n+1 ) = bi - dx - ex
+                 work( 1 ) = abs( bi ) + abs( dx ) + abs( ex )
+                 do i = 2, n - 1
+                    bi = b( i, j )
+                    cx = e( i-1 )*x( i-1, j )
+                    dx = d( i )*x( i, j )
+                    ex = e( i )*x( i+1, j )
+                    work( n+i ) = bi - cx - dx - ex
+                    work( i ) = abs( bi ) + abs( cx ) + abs( dx ) + abs( ex )
+                 end do
+                 bi = b( n, j )
+                 cx = e( n-1 )*x( n-1, j )
+                 dx = d( n )*x( n, j )
+                 work( n+n ) = bi - cx - dx
+                 work( n ) = abs( bi ) + abs( cx ) + abs( dx )
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_dpttrs( n, 1, df, ef, work( n+1 ), n, info )
+                 call stdlib_daxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              ix = stdlib_idamax( n, work, 1 )
+              ferr( j ) = work( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+              ! solve m(l) * x = e.
+              work( 1 ) = one
+              do i = 2, n
+                 work( i ) = one + work( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**t * x = b.
+              work( n ) = work( n ) / df( n )
+              do i = n - 1, 1, -1
+                 work( i ) = work( i ) / df( i ) + work( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_idamax( n, work, 1 )
+              ferr( j ) = ferr( j )*abs( work( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_90
+           return
+     end subroutine stdlib_dptrfs
+
+     !> DPTSV: computes the solution to a real system of linear equations
+     !> A*X = B, where A is an N-by-N symmetric positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**T, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_dptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: b(ldb,*), d(*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           call stdlib_dpttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dpttrs( n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_dptsv
+
+     !> DPTSVX: uses the factorization A = L*D*L**T to compute the solution
+     !> to a real system of linear equations A*X = B, where A is an N-by-N
+     !> symmetric positive definite tridiagonal matrix and X and B are
+     !> N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_dptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(in) :: b(ldb,*), d(*), e(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(dp), intent(inout) :: df(*), ef(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+              call stdlib_dcopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_dcopy( n-1, e, 1, ef, 1 )
+              call stdlib_dpttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlanst( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dptcon( n, df, ef, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dpttrs( n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_dptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr, berr,work, info )
+                     
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dptsvx
+
+     !> DSBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A.
+
+     subroutine stdlib_dsbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indwrk, iscale
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = ab( 1, 1 )
+              else
+                 w( 1 ) = ab( kd+1, 1 )
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_dlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_dlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_dsbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_dsbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dsteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_dsbev
+
+     !> DSBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric band matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_dsbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indwrk, &
+                     iscale, itmp1, j, jj, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 tmp1 = ab( 1, 1 )
+              else
+                 tmp1 = ab( kd+1, 1 )
+              end if
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = tmp1
+                 if( wantz )z( 1, 1 ) = one
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_dlansb( 'M', uplo, n, kd, ab, ldab, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_dlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_dlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_dsbtrd to reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           call stdlib_dsbtrd( jobz, uplo, n, kd, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_ssteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_dcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_dlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_dstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstein.
+              do j = 1, m
+                 call stdlib_dcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_dgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dsbevx
+
+     !> DSBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_dsbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_dpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_dsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work( indwrk ), &
+                     iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_dsbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dsteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           return
+     end subroutine stdlib_dsbgv
+
+     !> DSBGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_dsbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(dp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwo, indwrk, itmp1, j, &
+                     jj, nsplit
+           real(dp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_dpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_dsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, iinfo )
+                     
+           ! reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_dsbtrd( vect, uplo, n, ka, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_ssteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_dcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_dsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_dlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_dsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired,
+           ! call stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_dstebz( range, order, n, vl, vu, il, iu, abstol,work( indd ), work( inde ),&
+            m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info )
+                      
+           if( wantz ) then
+              call stdlib_dstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply transformation matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstein.
+              do j = 1, m
+                 call stdlib_dcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_dgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dsbgvx
+
+     !> DSGESV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> DSGESV first attempts to factorize the matrix in SINGLE PRECISION
+     !> and use this factorization within an iterative refinement procedure
+     !> to produce a solution with DOUBLE PRECISION normwise backward error
+     !> quality (see below). If the approach fails the method switches to a
+     !> DOUBLE PRECISION factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio SINGLE PRECISION performance over DOUBLE PRECISION
+     !> performance is too small. A reasonable strategy should take the
+     !> number of right-hand sides and the size of the matrix into account.
+     !> This might be done with a call to ILAENV in the future. Up to now, we
+     !> always try iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_dsgesv( n, nrhs, a, lda, ipiv, b, ldb, x, ldx, work,swork, iter, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(out) :: swork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: b(ldb,*)
+           real(dp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(dp), parameter :: bwdmax = 1.0e+00_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(dp) :: anrm, cte, eps, rnrm, xnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSGESV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip single precision iterative refinement if a priori slower
+           ! than double precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_dlange( 'I', n, n, a, lda, work )
+           eps = stdlib_dlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=dp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from double precision to single precision and store the
+           ! result in sx.
+           call stdlib_dlag2s( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from double precision to single precision and store the
+           ! result in sa.
+           call stdlib_dlag2s( n, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the lu factorization of sa.
+           call stdlib_sgetrf( n, n, swork( ptsa ), n, ipiv, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_sgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), n, &
+                     info )
+           ! convert sx back to double precision
+           call stdlib_slag2d( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, negone, a,lda, x, ldx, &
+                     one, work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = abs( x( stdlib_idamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = abs( work( stdlib_idamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from double precision to single precision
+              ! and store the result in sx.
+              call stdlib_dlag2s( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_sgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), &
+                        n, info )
+              ! convert sx back to double precision and update the current
+              ! iterate.
+              call stdlib_slag2d( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_daxpy( n, one, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, negone,a, lda, x, &
+                        ldx, one, work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = abs( x( stdlib_idamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = abs( work( stdlib_idamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow up
+           ! on double precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to double precision.
+           call stdlib_dgetrf( n, n, a, lda, ipiv, info )
+           if( info/=0 )return
+           call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, x, ldx,info )
+           return
+     end subroutine stdlib_dsgesv
+
+     !> DSPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A in packed storage.
+
+     subroutine stdlib_dspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_dsptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_dsptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_dopgtr to generate the orthogonal matrix, then call stdlib_dsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_dopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              call stdlib_dsteqr( jobz, n, w, work( inde ), z, ldz, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_dspev
+
+     !> DSPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage.  Eigenvalues/vectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_dspevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, iwork, ifail,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = ap( 1 )
+              else
+                 if( vl<ap( 1 ) .and. vu>=ap( 1 ) ) then
+                    m = 1
+                    w( 1 ) = ap( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_dlansp( 'M', uplo, n, ap, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_dsptrd to reduce symmetric packed matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           call stdlib_dsptrd( uplo, n, ap, work( indd ), work( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_dopgtr and stdlib_ssteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_dcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_dopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_dstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstein.
+              call stdlib_dopmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_dspevx
+
+     !> DSPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_dspgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: ap(*), bp(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_dpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_dspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_dtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_dtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dspgv
+
+     !> DSPGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric, stored in packed storage, and B
+     !> is also positive definite.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_dspgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: ap(*), bp(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_dpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_dspevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, m
+                    call stdlib_dtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_dtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_dspgvx
+
+     !> DSPOSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> DSPOSV first attempts to factorize the matrix in SINGLE PRECISION
+     !> and use this factorization within an iterative refinement procedure
+     !> to produce a solution with DOUBLE PRECISION normwise backward error
+     !> quality (see below). If the approach fails the method switches to a
+     !> DOUBLE PRECISION factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio SINGLE PRECISION performance over DOUBLE PRECISION
+     !> performance is too small. A reasonable strategy should take the
+     !> number of right-hand sides and the size of the matrix into account.
+     !> This might be done with a call to ILAENV in the future. Up to now, we
+     !> always try iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_dsposv( uplo, n, nrhs, a, lda, b, ldb, x, ldx, work,swork, iter, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(out) :: swork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: b(ldb,*)
+           real(dp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(dp), parameter :: bwdmax = 1.0e+00_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(dp) :: anrm, cte, eps, rnrm, xnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPOSV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip single precision iterative refinement if a priori slower
+           ! than double precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_dlansy( 'I', uplo, n, a, lda, work )
+           eps = stdlib_dlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=dp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from double precision to single precision and store the
+           ! result in sx.
+           call stdlib_dlag2s( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from double precision to single precision and store the
+           ! result in sa.
+           call stdlib_dlat2s( uplo, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the cholesky factorization of sa.
+           call stdlib_spotrf( uplo, n, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_spotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+           ! convert sx back to double precision
+           call stdlib_slag2d( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_dsymm( 'LEFT', uplo, n, nrhs, negone, a, lda, x, ldx, one,work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = abs( x( stdlib_idamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = abs( work( stdlib_idamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from double precision to single precision
+              ! and store the result in sx.
+              call stdlib_dlag2s( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_spotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+              ! convert sx back to double precision and update the current
+              ! iterate.
+              call stdlib_slag2d( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_daxpy( n, one, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_dsymm( 'L', uplo, n, nrhs, negone, a, lda, x, ldx, one,work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = abs( x( stdlib_idamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = abs( work( stdlib_idamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow
+           ! up on double precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to double precision.
+           call stdlib_dpotrf( uplo, n, a, lda, info )
+           if( info/=0 )return
+           call stdlib_dlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dpotrs( uplo, n, nrhs, a, lda, x, ldx, info )
+           return
+     end subroutine stdlib_dsposv
+
+     !> DSYEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A.
+
+     subroutine stdlib_dsyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 3*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              work( 1 ) = 2
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_dlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_dsytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_dsytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_dorgtr to generate the orthogonal matrix, then call stdlib_dsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dorgtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              call stdlib_dsteqr( jobz, n, w, work( inde ), a, lda, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsyev
+
+     !> DSYEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_dsyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+           indwkn, indwrk, iscale, itmp1, j, jj, llwork, llwrkn, lwkmin, lwkopt, nb, &
+                     nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 8*n
+                 nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'DORMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( lwkmin, ( nb + 3 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_dlansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_dscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_dscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_dsytrd to reduce symmetric matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           llwork = lwork - indwrk + 1
+           call stdlib_dsytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_dsterf or stdlib_dorgtr and stdlib_ssteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_dcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_dlacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_dorgtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_dstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_dormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsyevx
+
+     !> DSYGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric and B is also
+     !> positive definite.
+
+     subroutine stdlib_dsygv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 3*n - 1 )
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_dpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_dsyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_dtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_dtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsygv
+
+     !> DSYGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_dsygvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 8*n )
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 3 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_dpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_dsyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_dtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_dtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsygvx
+
+     !> DSYSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_dsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_dsytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_dsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_dsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_dsytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsysv
+
+     !> DSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a real system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dsysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(inout) :: af(ldaf,*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 3*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'DSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_dlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_dsytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlansy( 'I', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dsycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, iwork,info )
+           ! compute the solution vectors x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dsytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_dsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsysvx
+
+     !> DSYTRD_SY2SB: reduces a real symmetric matrix A to real symmetric
+     !> band-diagonal form AB by a orthogonal similarity transformation:
+     !> Q**T * A * Q = AB.
+
+     pure subroutine stdlib_dsytrd_sy2sb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: rone = 1.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'DSYTRD_SY2SB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD_SY2SB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_dcopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_dcopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to zero once such a
+           ! way every time t is generated the upper/lower portion will be always zero
+           call stdlib_dlaset( "A", ldt, kd, zero, zero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_dgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_dcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_dlaset( 'LOWER', pk, pk, zero, one,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_dlarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_dgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,one,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,zero, work( s2pos ), lds2 )
+                  call stdlib_dsymm( 'RIGHT', uplo, pk, pn,one,  a( i+kd, i+kd ), lda,work( s2pos &
+                            ), lds2,zero, work( wpos ), ldw )
+                  call stdlib_dgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,one,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,zero, work( s1pos ), lds1 )
+                  call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-half, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,one,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_dsyr2k( uplo, 'CONJUGATE', pn, pk,-one, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_dcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_dgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_dcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_dlaset( 'UPPER', pk, pk, zero, one,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_dlarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,one, a( i+kd, i ),&
+                             lda,work( tpos ), ldt,zero, work( s2pos ), lds2 )
+                  call stdlib_dsymm( 'LEFT', uplo, pn, pk,one, a( i+kd, i+kd ), lda,work( s2pos ),&
+                             lds2,zero, work( wpos ), ldw )
+                  call stdlib_dgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,one, work( s2pos ), &
+                            lds2,work( wpos ), ldw,zero, work( s1pos ), lds1 )
+                  call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-half, a( i+kd, i &
+                            ), lda,work( s1pos ), lds1,one, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_dsyr2k( uplo, 'NO TRANSPOSE', pn, pk,-one, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_dcopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_dcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_dsytrd_sy2sb
+
+     !> DTGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of real matrices (S,P), where S is a quasi-triangular matrix
+     !> and P is upper triangular.  Matrix pairs of this type are produced by
+     !> the generalized Schur factorization of a matrix pair (A,B):
+     !> A = Q*S*Z**T,  B = Q*P*Z**T
+     !> as computed by DGGHRD + DHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal blocks of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the orthogonal factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_dtgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(dp), intent(in) :: p(ldp,*), s(lds,*)
+           real(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: safety = 1.0e+2_dp
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, il2by2, ilabad, ilall, ilback, ilbbad, ilcomp, ilcplx, &
+                     lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, iinfo, im, iside, j, ja, jc, je, jr, jw, &
+                     na, nw
+           real(dp) :: acoef, acoefa, anorm, ascale, bcoefa, bcoefi, bcoefr, big, bignum, bnorm, &
+           bscale, cim2a, cim2b, cimaga, cimagb, cre2a, cre2b, creala, crealb, dmin, safmin, &
+                     salfar, sbeta, scale, small, temp, temp2, temp2i, temp2r, ulp, xmax, xscale
+           ! Local Arrays 
+           real(dp) :: bdiag(2), sum(2,2), sums(2,2), sump(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+              ilall = .true.
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors to be computed
+           if( .not.ilall ) then
+              im = 0
+              ilcplx = .false.
+              loop_10: do j = 1, n
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_10
+                 end if
+                 if( j<n ) then
+                    if( s( j+1, j )/=zero )ilcplx = .true.
+                 end if
+                 if( ilcplx ) then
+                    if( select( j ) .or. select( j+1 ) )im = im + 2
+                 else
+                    if( select( j ) )im = im + 1
+                 end if
+              end do loop_10
+           else
+              im = n
+           end if
+           ! check 2-by-2 diagonal blocks of a, b
+           ilabad = .false.
+           ilbbad = .false.
+           do j = 1, n - 1
+              if( s( j+1, j )/=zero ) then
+                 if( p( j, j )==zero .or. p( j+1, j+1 )==zero .or.p( j, j+1 )/=zero )ilbbad = &
+                           .true.
+                 if( j<n-1 ) then
+                    if( s( j+2, j+1 )/=zero )ilabad = .true.
+                 end if
+              end if
+           end do
+           if( ilabad ) then
+              info = -5
+           else if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_dlabad( safmin, big )
+           ulp = stdlib_dlamch( 'EPSILON' )*stdlib_dlamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part (i.e., excluding all elements belonging to the diagonal
+           ! blocks) of a and b to check for possible overflow in the
+           ! triangular solver.
+           anorm = abs( s( 1, 1 ) )
+           if( n>1 )anorm = anorm + abs( s( 2, 1 ) )
+           bnorm = abs( p( 1, 1 ) )
+           work( 1 ) = zero
+           work( n+1 ) = zero
+           do j = 2, n
+              temp = zero
+              temp2 = zero
+              if( s( j, j-1 )==zero ) then
+                 iend = j - 1
+              else
+                 iend = j - 2
+              end if
+              do i = 1, iend
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              work( j ) = temp
+              work( n+j ) = temp2
+              do i = iend + 1, min( j+1, n )
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              anorm = max( anorm, temp )
+              bnorm = max( bnorm, temp2 )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_220: do je = 1, n
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at.
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_220
+                 end if
+                 nw = 1
+                 if( je<n ) then
+                    if( s( je+1, je )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je+1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_220
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       ieig = ieig + 1
+                       do jr = 1, n
+                          vl( jr, ieig ) = zero
+                       end do
+                       vl( ieig, ieig ) = one
+                       cycle loop_220
+                    end if
+                 end if
+                 ! clear vector
+                 do jr = 1, nw*n
+                    work( 2*n+jr ) = zero
+                 end do
+                                                       ! t
+                 ! compute coefficients in  ( a a - b b )  y = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                 else
+                    ! complex eigenvalue
+                    call stdlib_dlag2( s( je, je ), lds, p( je, je ), ldp,safmin*safety, acoef, &
+                              temp, bcoefr, temp2,bcoefi )
+                    bcoefi = -bcoefi
+                    if( bcoefi==zero ) then
+                       info = je
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    temp = acoef*s( je+1, je )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je+1 ) = -temp2r / temp
+                       work( 3*n+je+1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je+1 ) = one
+                       work( 3*n+je+1 ) = zero
+                       temp = acoef*s( je, je+1 )
+                       work( 2*n+je ) = ( bcoefr*p( je+1, je+1 )-acoef*s( je+1, je+1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je+1, je+1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je+1 ) &
+                              )+abs( work( 3*n+je+1 ) ) )
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                 ! t
+                 ! triangular solve of  (a a - b b)  y = 0
+                                         ! t
+                 ! (rowwise in  (a a - b b) , or columnwise in (a a - b b) )
+                 il2by2 = .false.
+                 loop_160: do j = je + nw, n
+                    if( il2by2 ) then
+                       il2by2 = .false.
+                       cycle loop_160
+                    end if
+                    na = 1
+                    bdiag( 1 ) = p( j, j )
+                    if( j<n ) then
+                       if( s( j+1, j )/=zero ) then
+                          il2by2 = .true.
+                          bdiag( 2 ) = p( j+1, j+1 )
+                          na = 2
+                       end if
+                    end if
+                    ! check whether scaling is necessary for dot products
+                    xscale = one / max( one, xmax )
+                    temp = max( work( j ), work( n+j ),acoefa*work( j )+bcoefa*work( n+j ) )
+                              
+                    if( il2by2 )temp = max( temp, work( j+1 ), work( n+j+1 ),acoefa*work( j+1 )+&
+                              bcoefa*work( n+j+1 ) )
+                    if( temp>bignum*xscale ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = xmax*xscale
+                    end if
+                    ! compute dot products
+                          ! j-1
+                    ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                          ! k=je
+                    ! to reduce the op count, this is done as
+                    ! _        j-1                  _        j-1
+                    ! a*conjg( sum  s(k,j)*x(k) ) - b*conjg( sum  p(k,j)*x(k) )
+                             ! k=je                          k=je
+                    ! which may cause underflow problems if a or b are close
+                    ! to underflow.  (e.g., less than small.)
+                    do jw = 1, nw
+                       do ja = 1, na
+                          sums( ja, jw ) = zero
+                          sump( ja, jw ) = zero
+                          do jr = je, j - 1
+                             sums( ja, jw ) = sums( ja, jw ) +s( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                             sump( ja, jw ) = sump( ja, jw ) +p( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                          end do
+                       end do
+                    end do
+                    do ja = 1, na
+                       if( ilcplx ) then
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 ) -bcoefi*sump( &
+                                    ja, 2 )
+                          sum( ja, 2 ) = -acoef*sums( ja, 2 ) +bcoefr*sump( ja, 2 ) +bcoefi*sump( &
+                                    ja, 1 )
+                       else
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 )
+                       end if
+                    end do
+                                        ! t
+                    ! solve  ( a a - b b )  y = sum(,)
+                    ! with scaling and perturbation of the denominator
+                    call stdlib_dlaln2( .true., na, nw, dmin, acoef, s( j, j ), lds,bdiag( 1 ), &
+                    bdiag( 2 ), sum, 2, bcoefr,bcoefi, work( 2*n+j ), n, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = scale*xmax
+                    end if
+                    xmax = max( xmax, temp )
+                 end do loop_160
+                 ! copy eigenvector to vl, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig + 1
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       call stdlib_dgemv( 'N', n, n+1-je, one, vl( 1, je ), ldvl,work( ( jw+2 )*n+&
+                                 je ), 1, zero,work( ( jw+4 )*n+1 ), 1 )
+                    end do
+                    call stdlib_dlacpy( ' ', n, nw, work( 4*n+1 ), n, vl( 1, je ),ldvl )
+                    ibeg = 1
+                 else
+                    call stdlib_dlacpy( ' ', n, nw, work( 2*n+1 ), n, vl( 1, ieig ),ldvl )
+                    ibeg = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) )+abs( vl( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = ibeg, n
+                          vl( jr, ieig+jw ) = xscale*vl( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+                 ieig = ieig + nw - 1
+              end do loop_220
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_500: do je = n, 1, -1
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at -- if complex, select(je)
+                 ! or select(je-1).
+                 ! if this is a complex pair, the 2-by-2 diagonal block
+                 ! corresponding to the eigenvalue is in rows/columns je-1:je
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_500
+                 end if
+                 nw = 1
+                 if( je>1 ) then
+                    if( s( je, je-1 )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je-1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_500
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- unit eigenvector
+                       ieig = ieig - 1
+                       do jr = 1, n
+                          vr( jr, ieig ) = zero
+                       end do
+                       vr( ieig, ieig ) = one
+                       cycle loop_500
+                    end if
+                 end if
+                 ! clear vector
+                 do jw = 0, nw - 1
+                    do jr = 1, n
+                       work( ( jw+2 )*n+jr ) = zero
+                    end do
+                 end do
+                 ! compute coefficients in  ( a a - b b ) x = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                    ! compute contribution from column je of a and b to sum
+                    ! (see "further details", above.)
+                    do jr = 1, je - 1
+                       work( 2*n+jr ) = bcoefr*p( jr, je ) -acoef*s( jr, je )
+                    end do
+                 else
+                    ! complex eigenvalue
+                    call stdlib_dlag2( s( je-1, je-1 ), lds, p( je-1, je-1 ), ldp,safmin*safety, &
+                              acoef, temp, bcoefr, temp2,bcoefi )
+                    if( bcoefi==zero ) then
+                       info = je - 1
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    ! and contribution to sums
+                    temp = acoef*s( je, je-1 )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>=abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je-1 ) = -temp2r / temp
+                       work( 3*n+je-1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je-1 ) = one
+                       work( 3*n+je-1 ) = zero
+                       temp = acoef*s( je-1, je )
+                       work( 2*n+je ) = ( bcoefr*p( je-1, je-1 )-acoef*s( je-1, je-1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je-1, je-1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je-1 ) &
+                              )+abs( work( 3*n+je-1 ) ) )
+                    ! compute contribution from columns je and je-1
+                    ! of a and b to the sums.
+                    creala = acoef*work( 2*n+je-1 )
+                    cimaga = acoef*work( 3*n+je-1 )
+                    crealb = bcoefr*work( 2*n+je-1 ) -bcoefi*work( 3*n+je-1 )
+                    cimagb = bcoefi*work( 2*n+je-1 ) +bcoefr*work( 3*n+je-1 )
+                    cre2a = acoef*work( 2*n+je )
+                    cim2a = acoef*work( 3*n+je )
+                    cre2b = bcoefr*work( 2*n+je ) - bcoefi*work( 3*n+je )
+                    cim2b = bcoefi*work( 2*n+je ) + bcoefr*work( 3*n+je )
+                    do jr = 1, je - 2
+                       work( 2*n+jr ) = -creala*s( jr, je-1 ) +crealb*p( jr, je-1 ) -cre2a*s( jr, &
+                                 je ) + cre2b*p( jr, je )
+                       work( 3*n+jr ) = -cimaga*s( jr, je-1 ) +cimagb*p( jr, je-1 ) -cim2a*s( jr, &
+                                 je ) + cim2b*p( jr, je )
+                    end do
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                 ! columnwise triangular solve of  (a a - b b)  x = 0
+                 il2by2 = .false.
+                 loop_370: do j = je - nw, 1, -1
+                    ! if a 2-by-2 block, is in position j-1:j, wait until
+                    ! next iteration to process it (when it will be j:j+1)
+                    if( .not.il2by2 .and. j>1 ) then
+                       if( s( j, j-1 )/=zero ) then
+                          il2by2 = .true.
+                          cycle loop_370
+                       end if
+                    end if
+                    bdiag( 1 ) = p( j, j )
+                    if( il2by2 ) then
+                       na = 2
+                       bdiag( 2 ) = p( j+1, j+1 )
+                    else
+                       na = 1
+                    end if
+                    ! compute x(j) (and x(j+1), if 2-by-2 block)
+                    call stdlib_dlaln2( .false., na, nw, dmin, acoef, s( j, j ),lds, bdiag( 1 ), &
+                    bdiag( 2 ), work( 2*n+j ),n, bcoefr, bcoefi, sum, 2, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = 1, je
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                    end if
+                    xmax = max( scale*xmax, temp )
+                    do jw = 1, nw
+                       do ja = 1, na
+                          work( ( jw+1 )*n+j+ja-1 ) = sum( ja, jw )
+                       end do
+                    end do
+                    ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                    if( j>1 ) then
+                       ! check whether scaling is necessary for sum.
+                       xscale = one / max( one, xmax )
+                       temp = acoefa*work( j ) + bcoefa*work( n+j )
+                       if( il2by2 )temp = max( temp, acoefa*work( j+1 )+bcoefa*work( n+j+1 ) )
+                                 
+                       temp = max( temp, acoefa, bcoefa )
+                       if( temp>bignum*xscale ) then
+                          do jw = 0, nw - 1
+                             do jr = 1, je
+                                work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                             end do
+                          end do
+                          xmax = xmax*xscale
+                       end if
+                       ! compute the contributions of the off-diagonals of
+                       ! column j (and j+1, if 2-by-2 block) of a and b to the
+                       ! sums.
+                       do ja = 1, na
+                          if( ilcplx ) then
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             cimaga = acoef*work( 3*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 ) -bcoefi*work( 3*n+j+ja-1 )
+                             cimagb = bcoefi*work( 2*n+j+ja-1 ) +bcoefr*work( 3*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                                work( 3*n+jr ) = work( 3*n+jr ) -cimaga*s( jr, j+ja-1 ) +cimagb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          else
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          end if
+                       end do
+                    end if
+                    il2by2 = .false.
+                 end do loop_370
+                 ! copy eigenvector to vr, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig - nw
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          work( ( jw+4 )*n+jr ) = work( ( jw+2 )*n+1 )*vr( jr, 1 )
+                       end do
+                       ! a series of compiler directives to defeat
+                       ! vectorization for the next loop
+                       do jc = 2, je
+                          do jr = 1, n
+                             work( ( jw+4 )*n+jr ) = work( ( jw+4 )*n+jr ) +work( ( jw+2 )*n+jc )&
+                                       *vr( jr, jc )
+                          end do
+                       end do
+                    end do
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+4 )*n+jr )
+                       end do
+                    end do
+                    iend = n
+                 else
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+2 )*n+jr )
+                       end do
+                    end do
+                    iend = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) )+abs( vr( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = 1, iend
+                          vr( jr, ieig+jw ) = xscale*vr( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+              end do loop_500
+           end if
+           return
+     end subroutine stdlib_dtgevc
+
+     !> DTGEX2: swaps adjacent diagonal blocks (A11, B11) and (A22, B22)
+     !> of size 1-by-1 or 2-by-2 in an upper (quasi) triangular matrix pair
+     !> (A, B) by an orthogonal equivalence transformation.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by DGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, n1, n2, &
+               work, lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, lwork, n, n1, n2
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_dcopy by calls to stdlib_dlaset, or by do
+        ! loops. sven hammarling, 1/5/02.
+           ! Parameters 
+           real(dp), parameter :: twenty = 2.0e+01_dp
+           integer(ilp), parameter :: ldst = 4
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, idum, linfo, m
+           real(dp) :: bqra21, brqa21, ddum, dnorma, dnormb, dscale, dsum, eps, f, g, sa, sb, &
+                     scale, smlnum, thresha, threshb
+           ! Local Arrays 
+           integer(ilp) :: iwork(ldst)
+           real(dp) :: ai(2), ar(2), be(2), ir(ldst,ldst), ircop(ldst,ldst), li(ldst,ldst), licop(&
+           ldst,ldst), s(ldst,ldst), scpy(ldst,ldst), t(ldst,ldst), taul(ldst), taur(ldst), tcpy(&
+                     ldst,ldst)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 .or. n1<=0 .or. n2<=0 )return
+           if( n1>n .or. ( j1+n1 )>n )return
+           m = n1 + n2
+           if( lwork<max( 1, n*m, m*m*2 ) ) then
+              info = -16
+              work( 1 ) = max( 1, n*m, m*m*2 )
+              return
+           end if
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block
+           call stdlib_dlaset( 'FULL', ldst, ldst, zero, zero, li, ldst )
+           call stdlib_dlaset( 'FULL', ldst, ldst, zero, zero, ir, ldst )
+           call stdlib_dlacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_dlacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute threshold for testing acceptance of swapping.
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           dscale = zero
+           dsum = one
+           call stdlib_dlacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_dlassq( m*m, work, 1, dscale, dsum )
+           dnorma = dscale*sqrt( dsum )
+           dscale = zero
+           dsum = one
+           call stdlib_dlacpy( 'FULL', m, m, t, ldst, work, m )
+           call stdlib_dlassq( m*m, work, 1, dscale, dsum )
+           dnormb = dscale*sqrt( dsum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*dnorma, smlnum )
+           threshb = max( twenty*eps*dnormb, smlnum )
+           if( m==2 ) then
+              ! case 1: swap 1-by-1 and 1-by-1 blocks.
+              ! compute orthogonal ql and rq that swap 1-by-1 and 1-by-1 blocks
+              ! using givens rotations and perform the swap tentatively.
+              f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+              g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+              sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+              sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+              call stdlib_dlartg( f, g, ir( 1, 2 ), ir( 1, 1 ), ddum )
+              ir( 2, 1 ) = -ir( 1, 2 )
+              ir( 2, 2 ) = ir( 1, 1 )
+              call stdlib_drot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              call stdlib_drot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              if( sa>=sb ) then
+                 call stdlib_dlartg( s( 1, 1 ), s( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              else
+                 call stdlib_dlartg( t( 1, 1 ), t( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              end if
+              call stdlib_drot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              call stdlib_drot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              li( 2, 2 ) = li( 1, 1 )
+              li( 1, 2 ) = -li( 2, 1 )
+              ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                 ! and  |t21| <= o(eps f-norm((b)))
+              weak = abs( s( 2, 1 ) ) <= thresha .and.abs( t( 2, 1 ) ) <= threshb
+              if( .not.weak )go to 70
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_dlacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_dgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_dgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_dlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_dlacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_dgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_dgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_dlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                     ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              call stdlib_drot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_drot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_drot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              call stdlib_drot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              ! set  n1-by-n2 (2,1) - blocks to zero.
+              a( j1+1, j1 ) = zero
+              b( j1+1, j1 ) = zero
+              ! accumulate transformations into q and z if requested.
+              if( wantz )call stdlib_drot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 &
+                        ) )
+              if( wantq )call stdlib_drot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, li( 1, 1 ),li( 2, 1 &
+                        ) )
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           else
+              ! case 2: swap 1-by-1 and 2-by-2 blocks, or 2-by-2
+                      ! and 2-by-2 blocks.
+              ! solve the generalized sylvester equation
+                       ! s11 * r - l * s22 = scale * s12
+                       ! t11 * r - l * t22 = scale * t12
+              ! for r and l. solutions in li and ir.
+              call stdlib_dlacpy( 'FULL', n1, n2, t( 1, n1+1 ), ldst, li, ldst )
+              call stdlib_dlacpy( 'FULL', n1, n2, s( 1, n1+1 ), ldst,ir( n2+1, n1+1 ), ldst )
+                        
+              call stdlib_dtgsy2( 'N', 0, n1, n2, s, ldst, s( n1+1, n1+1 ), ldst,ir( n2+1, n1+1 ),&
+               ldst, t, ldst, t( n1+1, n1+1 ),ldst, li, ldst, scale, dsum, dscale, iwork, idum,&
+                         linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix ql:
+                          ! ql**t * li = [ tl ]
+                                       ! [ 0  ]
+              ! where
+                          ! li =  [      -l              ]
+                                ! [ scale * identity(n2) ]
+              do i = 1, n2
+                 call stdlib_dscal( n1, -one, li( 1, i ), 1 )
+                 li( n1+i, i ) = scale
+              end do
+              call stdlib_dgeqr2( m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_dorg2r( m, m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix rq:
+                          ! ir * rq**t =   [ 0  tr],
+               ! where ir = [ scale * identity(n1), r ]
+              do i = 1, n1
+                 ir( n2+i, i ) = scale
+              end do
+              call stdlib_dgerq2( n1, m, ir( n2+1, 1 ), ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_dorgr2( m, m, n1, ir, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              ! perform the swapping tentatively:
+              call stdlib_dgemm( 'T', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+              call stdlib_dgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, s,ldst )
+              call stdlib_dgemm( 'T', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+              call stdlib_dgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, t,ldst )
+              call stdlib_dlacpy( 'F', m, m, s, ldst, scpy, ldst )
+              call stdlib_dlacpy( 'F', m, m, t, ldst, tcpy, ldst )
+              call stdlib_dlacpy( 'F', m, m, ir, ldst, ircop, ldst )
+              call stdlib_dlacpy( 'F', m, m, li, ldst, licop, ldst )
+              ! triangularize the b-part by an rq factorization.
+              ! apply transformation (from left) to a-part, giving s.
+              call stdlib_dgerq2( m, m, t, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_dormr2( 'R', 'T', m, m, m, t, ldst, taur, s, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_dormr2( 'L', 'N', m, m, m, t, ldst, taur, ir, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in brqa21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_dlassq( n1, s( n2+1, i ), 1, dscale, dsum )
+              end do
+              brqa21 = dscale*sqrt( dsum )
+              ! triangularize the b-part by a qr factorization.
+              ! apply transformation (from right) to a-part, giving s.
+              call stdlib_dgeqr2( m, m, tcpy, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_dorm2r( 'L', 'T', m, m, m, tcpy, ldst, taul, scpy, ldst,work, info )
+                        
+              call stdlib_dorm2r( 'R', 'N', m, m, m, tcpy, ldst, taul, licop, ldst,work, info )
+                        
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in bqra21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_dlassq( n1, scpy( n2+1, i ), 1, dscale, dsum )
+              end do
+              bqra21 = dscale*sqrt( dsum )
+              ! decide which method to use.
+                ! weak stability test:
+                   ! f-norm(s21) <= o(eps * f-norm((s)))
+              if( bqra21<=brqa21 .and. bqra21<=thresha ) then
+                 call stdlib_dlacpy( 'F', m, m, scpy, ldst, s, ldst )
+                 call stdlib_dlacpy( 'F', m, m, tcpy, ldst, t, ldst )
+                 call stdlib_dlacpy( 'F', m, m, ircop, ldst, ir, ldst )
+                 call stdlib_dlacpy( 'F', m, m, licop, ldst, li, ldst )
+              else if( brqa21>=thresha ) then
+                 go to 70
+              end if
+              ! set lower triangle of b-part to zero
+              call stdlib_dlaset( 'LOWER', m-1, m-1, zero, zero, t(2,1), ldst )
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_dlacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_dgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_dgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_dlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_dlacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_dgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_dgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_dlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! if the swap is accepted ("weakly" and "strongly"), apply the
+              ! transformations and set n1-by-n2 (2,1)-block to zero.
+              call stdlib_dlaset( 'FULL', n1, n2, zero, zero, s(n2+1,1), ldst )
+              ! copy back m-by-m diagonal block starting at index j1 of (a, b)
+              call stdlib_dlacpy( 'F', m, m, s, ldst, a( j1, j1 ), lda )
+              call stdlib_dlacpy( 'F', m, m, t, ldst, b( j1, j1 ), ldb )
+              call stdlib_dlaset( 'FULL', ldst, ldst, zero, zero, t, ldst )
+              ! standardize existing 2-by-2 blocks.
+              call stdlib_dlaset( 'FULL', m, m, zero, zero, work, m )
+              work( 1 ) = one
+              t( 1, 1 ) = one
+              idum = lwork - m*m - 2
+              if( n2>1 ) then
+                 call stdlib_dlagv2( a( j1, j1 ), lda, b( j1, j1 ), ldb, ar, ai, be,work( 1 ), &
+                           work( 2 ), t( 1, 1 ), t( 2, 1 ) )
+                 work( m+1 ) = -work( 2 )
+                 work( m+2 ) = work( 1 )
+                 t( n2, n2 ) = t( 1, 1 )
+                 t( 1, 2 ) = -t( 2, 1 )
+              end if
+              work( m*m ) = one
+              t( m, m ) = one
+              if( n1>1 ) then
+                 call stdlib_dlagv2( a( j1+n2, j1+n2 ), lda, b( j1+n2, j1+n2 ), ldb,taur, taul, &
+                 work( m*m+1 ), work( n2*m+n2+1 ),work( n2*m+n2+2 ), t( n2+1, n2+1 ),t( m, m-1 ) )
+                           
+                 work( m*m ) = work( n2*m+n2+1 )
+                 work( m*m-1 ) = -work( n2*m+n2+2 )
+                 t( m, m ) = t( n2+1, n2+1 )
+                 t( m-1, m ) = -t( m, m-1 )
+              end if
+              call stdlib_dgemm( 'T', 'N', n2, n1, n2, one, work, m, a( j1, j1+n2 ),lda, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_dlacpy( 'FULL', n2, n1, work( m*m+1 ), n2, a( j1, j1+n2 ),lda )
+              call stdlib_dgemm( 'T', 'N', n2, n1, n2, one, work, m, b( j1, j1+n2 ),ldb, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_dlacpy( 'FULL', n2, n1, work( m*m+1 ), n2, b( j1, j1+n2 ),ldb )
+              call stdlib_dgemm( 'N', 'N', m, m, m, one, li, ldst, work, m, zero,work( m*m+1 ), m &
+                        )
+              call stdlib_dlacpy( 'FULL', m, m, work( m*m+1 ), m, li, ldst )
+              call stdlib_dgemm( 'N', 'N', n2, n1, n1, one, a( j1, j1+n2 ), lda,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_dlacpy( 'FULL', n2, n1, work, n2, a( j1, j1+n2 ), lda )
+              call stdlib_dgemm( 'N', 'N', n2, n1, n1, one, b( j1, j1+n2 ), ldb,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_dlacpy( 'FULL', n2, n1, work, n2, b( j1, j1+n2 ), ldb )
+              call stdlib_dgemm( 'T', 'N', m, m, m, one, ir, ldst, t, ldst, zero,work, m )
+              call stdlib_dlacpy( 'FULL', m, m, work, m, ir, ldst )
+              ! accumulate transformations into q and z if requested.
+              if( wantq ) then
+                 call stdlib_dgemm( 'N', 'N', n, m, m, one, q( 1, j1 ), ldq, li,ldst, zero, work, &
+                           n )
+                 call stdlib_dlacpy( 'FULL', n, m, work, n, q( 1, j1 ), ldq )
+              end if
+              if( wantz ) then
+                 call stdlib_dgemm( 'N', 'N', n, m, m, one, z( 1, j1 ), ldz, ir,ldst, zero, work, &
+                           n )
+                 call stdlib_dlacpy( 'FULL', n, m, work, n, z( 1, j1 ), ldz )
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                      ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              i = j1 + m
+              if( i<=n ) then
+                 call stdlib_dgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,a( j1, i ), lda, zero, &
+                           work, m )
+                 call stdlib_dlacpy( 'FULL', m, n-i+1, work, m, a( j1, i ), lda )
+                 call stdlib_dgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,b( j1, i ), ldb, zero, &
+                           work, m )
+                 call stdlib_dlacpy( 'FULL', m, n-i+1, work, m, b( j1, i ), ldb )
+              end if
+              i = j1 - 1
+              if( i>0 ) then
+                 call stdlib_dgemm( 'N', 'N', i, m, m, one, a( 1, j1 ), lda, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_dlacpy( 'FULL', i, m, work, i, a( 1, j1 ), lda )
+                 call stdlib_dgemm( 'N', 'N', i, m, m, one, b( 1, j1 ), ldb, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_dlacpy( 'FULL', i, m, work, i, b( 1, j1 ), ldb )
+              end if
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           end if
+           ! exit with info = 1 if swap was rejected.
+           70 continue
+           info = 1
+           return
+     end subroutine stdlib_dtgex2
+
+     !> DTGEXC: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A,B) using an orthogonal equivalence transformation
+     !> (A, B) = Q * (A, B) * Z**T,
+     !> so that the diagonal block of (A, B) with row index IFST is moved
+     !> to row ILST.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by DGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_dtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldq, ldz, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: here, lwmin, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+              else
+                 lwmin = 4*n + 16
+              end if
+              work(1) = lwmin
+              if (lwork<lwmin .and. .not.lquery) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEXC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of the specified block and find out
+           ! if it is 1-by-1 or 2-by-2.
+           if( ifst>1 ) then
+              if( a( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( a( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out if it is 1-by-1 or 2-by-2.
+           if( ilst>1 ) then
+              if( a( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( a( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst.
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( a( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, nbf, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( a( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here+1, 1, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, 1, &
+                              1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, nbnext, work, lwork,info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, nbf, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, 1, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, &
+                              nbnext, 1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here - 1
+                 else
+                   ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here-1,&
+                                  2, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                       call stdlib_dtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_dtgexc
+
+     !> DTGSEN: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A, B) (in terms of an orthonormal equivalence trans-
+     !> formation Q**T * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the upper quasi-triangular
+     !> matrix A and the upper triangular B. The leading columns of Q and
+     !> Z form orthonormal bases of the corresponding left and right eigen-
+     !> spaces (deflating subspaces). (A, B) must be in generalized real
+     !> Schur canonical form (as returned by DGGES), i.e. A is block upper
+     !> triangular with 1-by-1 and 2-by-2 diagonal blocks. B is upper
+     !> triangular.
+     !> DTGSEN also computes the generalized eigenvalues
+     !> w(j) = (ALPHAR(j) + i*ALPHAI(j))/BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, DTGSEN computes the estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_dtgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alphar, alphai, &
+               beta, q, ldq, z, ldz, m, pl,pr, dif, work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), dif(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, kk, ks, liwmin, lwmin, mn2, n1, n2
+           real(dp) :: dscale, dsum, eps, rdscal, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSEN', -info )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           pair = .false.
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )==zero ) then
+                       if( select( k ) )m = m + 1
+                    else
+                       pair = .true.
+                       if( select( k ) .or. select( k+1 ) )m = m + 2
+                    end if
+                 else
+                    if( select( n ) )m = m + 1
+                 end if
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 4*n+16, 2*m*( n-m ) )
+              liwmin = max( 1, n+6 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*n+16, 4*m*( n-m ) )
+              liwmin = max( 1, 2*m*( n-m ), n+6 )
+           else
+              lwmin = max( 1, 4*n+16 )
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -22
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_dlassq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_dlassq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 60
+           end if
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           pair = .false.
+           loop_30: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ! perform the reordering of diagonal blocks in (a, b)
+                    ! by orthogonal transformation matrices and update
+                    ! q and z accordingly (if requested):
+                    kk = k
+                    if( k/=ks )call stdlib_dtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz,&
+                               kk, ks, work, lwork, ierr )
+                    if( ierr>0 ) then
+                       ! swap is rejected: exit.
+                       info = 1
+                       if( wantp ) then
+                          pl = zero
+                          pr = zero
+                       end if
+                       if( wantd ) then
+                          dif( 1 ) = zero
+                          dif( 2 ) = zero
+                       end if
+                       go to 60
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_30
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l
+              ! and compute pl and pr.
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              ijb = 0
+              call stdlib_dlacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_dlacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              call stdlib_dtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto left
+              ! and right eigenspaces.
+              rdscal = zero
+              dsum = one
+              call stdlib_dlassq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_dlassq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates of difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu-estimate.
+                 call stdlib_dtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl-estimate.
+                 call stdlib_dtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_dlacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_dlacn2( mn2, work( mn2+1 ), work, iwork, dif( 1 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_dtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_dtgsyl( 'T', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_dlacn2( mn2, work( mn2+1 ), work, iwork, dif( 2 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_dtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_dtgsyl( 'T', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           60 continue
+           ! compute generalized eigenvalues of reordered pair (a, b) and
+           ! normalize the generalized schur form.
+           pair = .false.
+           loop_80: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                    end if
+                 end if
+                 if( pair ) then
+                   ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_dlag2( work, 2, work( 5 ), 2, smlnum*eps, beta( k ),beta( k+1 ), &
+                              alphar( k ), alphar( k+1 ),alphai( k ) )
+                    alphai( k+1 ) = -alphai( k )
+                 else
+                    if( sign( one, b( k, k ) )<zero ) then
+                       ! if b(k,k) is negative, make it positive
+                       do i = 1, n
+                          a( k, i ) = -a( k, i )
+                          b( k, i ) = -b( k, i )
+                          if( wantq ) q( i, k ) = -q( i, k )
+                       end do
+                    end if
+                    alphar( k ) = a( k, k )
+                    alphai( k ) = zero
+                    beta( k ) = b( k, k )
+                 end if
+              end if
+           end do loop_80
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dtgsen
+
+     !> DTGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two real upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine DGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**T *A*Q = D1*( 0 R ),    V**T *B*Q = D2*( 0 R ),
+     !> where U, V and Q are orthogonal matrices.
+     !> R is a nonsingular upper triangular matrix, and D1 and D2 are
+     !> ``diagonal'' matrices, which are of the following structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the orthogonal transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_dtgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(dp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           real(dp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(dp), parameter :: hugenum = huge(zero)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(dp) :: a1, a2, a3, b1, b2, b3, csq, csu, csv, error, gamma, rwk, snq, snu, snv, &
+                     ssmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_dlaset( 'FULL', m, m, zero, one, u, ldu )
+           if( initv )call stdlib_dlaset( 'FULL', p, p, zero, one, v, ldv )
+           if( initq )call stdlib_dlaset( 'FULL', n, n, zero, one, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = zero
+                    a3 = zero
+                    if( k+i<=m )a1 = a( k+i, n-l+i )
+                    if( k+j<=m )a3 = a( k+j, n-l+j )
+                    b1 = b( i, n-l+i )
+                    b3 = b( j, n-l+j )
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_dlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**t *a
+                    if( k+j<=m )call stdlib_drot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, snu )
+                    ! update i-th and j-th rows of matrix b: v**t *b
+                    call stdlib_drot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, snv )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_drot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_drot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = zero
+                       b( i, n-l+j ) = zero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = zero
+                       b( j, n-l+i ) = zero
+                    end if
+                    ! update orthogonal matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_drot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_drot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_drot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_dcopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_dcopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_dlapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = a( k+i, n-l+i )
+              b1 = b( i, n-l+i )
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 ! change sign if necessary
+                 if( gamma<zero ) then
+                    call stdlib_dscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_dscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_dlartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_dscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_dscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_dcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_dcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_dtgsja
+
+     !> DTGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B) in
+     !> generalized real Schur canonical form (or of any matrix pair
+     !> (Q*A*Z**T, Q*B*Z**T) with orthogonal matrices Q and Z, where
+     !> Z**T denotes the transpose of Z.
+     !> (A, B) must be in generalized real Schur form (as returned by DGGES),
+     !> i.e. A is block upper triangular with 1-by-1 and 2-by-2 diagonal
+     !> blocks. B is upper triangular.
+
+     pure subroutine stdlib_dtgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           real(dp), intent(out) :: dif(*), s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: difdri = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, iz, k, ks, lwmin, n1, n2
+           real(dp) :: alphai, alphar, alprqt, beta, c1, c2, cond, eps, lnrm, rnrm, root1, root2, &
+                     scale, smlnum, tmpii, tmpir, tmpri, tmprr, uhav, uhavi, uhbv, uhbvi
+           ! Local Arrays 
+           real(dp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( a( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*( n + 2 ) + 16
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              ! determine whether a(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_20
+              else
+                 if( k<n )pair = a( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_20
+                 else
+                    if( .not.select( k ) )cycle loop_20
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( pair ) then
+                    ! complex eigenvalue pair.
+                    rnrm = stdlib_dlapy2( stdlib_dnrm2( n, vr( 1, ks ), 1 ),stdlib_dnrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_dlapy2( stdlib_dnrm2( n, vl( 1, ks ), 1 ),stdlib_dnrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    call stdlib_dgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_ddot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_dgemv( 'N', n, n, one, a, lda, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_ddot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    uhav = tmprr + tmpii
+                    uhavi = tmpir - tmpri
+                    call stdlib_dgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_ddot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_dgemv( 'N', n, n, one, b, ldb, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_ddot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    uhbv = tmprr + tmpii
+                    uhbvi = tmpir - tmpri
+                    uhav = stdlib_dlapy2( uhav, uhavi )
+                    uhbv = stdlib_dlapy2( uhbv, uhbvi )
+                    cond = stdlib_dlapy2( uhav, uhbv )
+                    s( ks ) = cond / ( rnrm*lnrm )
+                    s( ks+1 ) = s( ks )
+                 else
+                    ! real eigenvalue.
+                    rnrm = stdlib_dnrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_dnrm2( n, vl( 1, ks ), 1 )
+                    call stdlib_dgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhav = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    call stdlib_dgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhbv = stdlib_ddot( n, work, 1, vl( 1, ks ), 1 )
+                    cond = stdlib_dlapy2( uhav, uhbv )
+                    if( cond==zero ) then
+                       s( ks ) = -one
+                    else
+                       s( ks ) = cond / ( rnrm*lnrm )
+                    end if
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_dlapy2( a( 1, 1 ), b( 1, 1 ) )
+                    cycle loop_20
+                 end if
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvectors.
+                 if( pair ) then
+                    ! copy the  2-by 2 pencil beginning at (a(k,k), b(k, k)).
+                    ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_dlag2( work, 2, work( 5 ), 2, smlnum*eps, beta,dummy1( 1 ), &
+                              alphar, dummy( 1 ), alphai )
+                    alprqt = one
+                    c1 = two*( alphar*alphar+alphai*alphai+beta*beta )
+                    c2 = four*beta*beta*alphai*alphai
+                    root1 = c1 + sqrt( c1*c1-4.0_dp*c2 )
+                    root2 = c2 / root1
+                    root1 = root1 / two
+                    cond = min( sqrt( root1 ), sqrt( root2 ) )
+                 end if
+                 ! copy the matrix (a, b) to the array work and swap the
+                 ! diagonal block beginning at a(k,k) to the (1,1) position.
+                 call stdlib_dlacpy( 'FULL', n, n, a, lda, work, n )
+                 call stdlib_dlacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_dtgexc( .false., .false., n, work, n, work( n*n+1 ), n,dummy, 1, &
+                           dummy1, 1, ifst, ilst,work( n*n*2+1 ), lwork-2*n*n, ierr )
+                 if( ierr>0 ) then
+                    ! ill-conditioned problem - swap rejected.
+                    dif( ks ) = zero
+                 else
+                    ! reordering successful, solve generalized sylvester
+                    ! equation for r and l,
+                               ! a22 * r - l * a11 = a12
+                               ! b22 * r - l * b11 = b12,
+                    ! and compute estimate of difl((a11,b11), (a22, b22)).
+                    n1 = 1
+                    if( work( 2 )/=zero )n1 = 2
+                    n2 = n - n1
+                    if( n2==0 ) then
+                       dif( ks ) = cond
+                    else
+                       i = n*n + 1
+                       iz = 2*n*n + 1
+                       call stdlib_dtgsyl( 'N', difdri, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ),work( iz+1 ), lwork-2*n*n, iwork, ierr )
+                       if( pair )dif( ks ) = min( max( one, alprqt )*dif( ks ),cond )
+                    end if
+                 end if
+                 if( pair )dif( ks+1 ) = dif( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_dtgsna
+
+     !> DTPLQT: computes a blocked LQ factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_dtplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_dtplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_dtprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_dtplqt
+
+     !> DTPQRT: computes a blocked QR factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_dtpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_dtpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_dtprfb( 'L', 'T', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_dtpqrt
+
+     !> DTREVC: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by DHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+
+     pure subroutine stdlib_dtrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(dp), intent(in) :: t(ldt,*)
+           real(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, n2
+           real(dp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(dp) :: x(2,2)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_dlabad( unfl, ovfl )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+           n2 = 2*n
+           if( rightv ) then
+              ! compute right eigenvectors.
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==1 )go to 130
+                 if( ki==1 )go to 40
+                 if( t( ki, ki-1 )==zero )go to 40
+                 ip = -1
+                 40 continue
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )go to 130
+                    else
+                       if( .not.select( ki-1 ) )go to 130
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real right eigenvector
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = 1, ki - 1
+                       work( k+n ) = -t( k, ki )
+                    end do
+                    ! solve the upper quasi-triangular system:
+                       ! (t(1:ki-1,1:ki-1) - wr)*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_dlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( ki, scale, work( 1+n ), 1 )
+                          work( j+n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_daxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_dlaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( ki, scale, work( 1+n ), 1 )
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_daxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_daxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_dcopy( ki, work( 1+n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_idamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_dscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>1 )call stdlib_dgemv( 'N', n, ki-1, one, vr, ldvr,work( 1+n ), 1, &
+                                 work( ki+n ),vr( 1, ki ), 1 )
+                       ii = stdlib_idamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_dscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex right eigenvector.
+                    ! initial solve
+                      ! [ (t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i* wi)]*x = 0.
+                      ! [ (t(ki,ki-1)   t(ki,ki)   )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1+n ) = one
+                       work( ki+n2 ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1+n ) = -wi / t( ki, ki-1 )
+                       work( ki+n2 ) = one
+                    end if
+                    work( ki+n ) = zero
+                    work( ki-1+n2 ) = zero
+                    ! form right-hand side
+                    do k = 1, ki - 2
+                       work( k+n ) = -work( ki-1+n )*t( k, ki-1 )
+                       work( k+n2 ) = -work( ki+n2 )*t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! (t(1:ki-2,1:ki-2) - (wr+i*wi))*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_dlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr, wi,x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_dscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_daxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_daxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_dlaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, wi, x, 2, scale,xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_dscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          work( j-1+n2 ) = x( 1, 2 )
+                          work( j+n2 ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_daxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_daxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_daxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+n2 ), 1 )
+                                    
+                          call stdlib_daxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_dcopy( ki, work( 1+n ), 1, vr( 1, is-1 ), 1 )
+                       call stdlib_dcopy( ki, work( 1+n2 ), 1, vr( 1, is ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_dscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>2 ) then
+                          call stdlib_dgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n ), 1, work( ki-&
+                                    1+n ),vr( 1, ki-1 ), 1 )
+                          call stdlib_dgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n2 ), 1, work( &
+                                    ki+n2 ),vr( 1, ki ), 1 )
+                       else
+                          call stdlib_dscal( n, work( ki-1+n ), vr( 1, ki-1 ), 1 )
+                          call stdlib_dscal( n, work( ki+n2 ), vr( 1, ki ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_dscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 end if
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+                 130 continue
+                 if( ip==1 )ip = 0
+                 if( ip==-1 )ip = 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==-1 )go to 250
+                 if( ki==n )go to 150
+                 if( t( ki+1, ki )==zero )go to 150
+                 ip = 1
+                 150 continue
+                 if( somev ) then
+                    if( .not.select( ki ) )go to 250
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real left eigenvector.
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = ki + 1, n
+                       work( k+n ) = -t( ki, k )
+                    end do
+                    ! solve the quasi-triangular system:
+                       ! (t(ki+1:n,ki+1:n) - wr)**t*x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_ddot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          ! solve (t(j,j)-wr)**t*x = work
+                          call stdlib_dlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_ddot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_ddot( j-ki-1, t( ki+1, j+1 ), 1,&
+                                    work( ki+1+n ), 1 )
+                          ! solve
+                            ! [t(j,j)-wr   t(j,j+1)     ]**t * x = scale*( work1 )
+                            ! [t(j+1,j)    t(j+1,j+1)-wr]                ( work2 )
+                          call stdlib_dlaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          work( j+1+n ) = x( 2, 1 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+1+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_dcopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       ii = stdlib_idamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else
+                       if( ki<n )call stdlib_dgemv( 'N', n, n-ki, one, vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1+n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                       ii = stdlib_idamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_dscal( n, remax, vl( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex left eigenvector.
+                     ! initial solve:
+                       ! ((t(ki,ki)    t(ki,ki+1) )**t - (wr - i* wi))*x = 0.
+                       ! ((t(ki+1,ki) t(ki+1,ki+1))                )
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki+n ) = wi / t( ki, ki+1 )
+                       work( ki+1+n2 ) = one
+                    else
+                       work( ki+n ) = one
+                       work( ki+1+n2 ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1+n ) = zero
+                    work( ki+n2 ) = zero
+                    ! form right-hand side
+                    do k = ki + 2, n
+                       work( k+n ) = -work( ki+n )*t( ki, k )
+                       work( k+n2 ) = -work( ki+1+n2 )*t( ki+1, k )
+                    end do
+                    ! solve complex quasi-triangular system:
+                    ! ( t(ki+2,n:ki+2,n) - (wr-i*wi) )*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_ddot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_ddot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          ! solve (t(j,j)-(wr-i*wi))*(x11+i*x12)= wk+i*wk2
+                          call stdlib_dlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_dscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+n2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_dscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_ddot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_ddot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_ddot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n ), 1 )
+                          work( j+1+n2 ) = work( j+1+n2 ) -stdlib_ddot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n2 ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                            ! ([t(j,j)   t(j,j+1)  ]**t-(wr-i*wi)*i)*x = scale*b
+                            ! ([t(j+1,j) t(j+1,j+1)]               )
+                          call stdlib_dlaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_dscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          work( j+1+n ) = x( 2, 1 )
+                          work( j+1+n2 ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_dcopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       call stdlib_dcopy( n-ki+1, work( ki+n2 ), 1, vl( ki, is+1 ),1 )
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else
+                       if( ki<n-1 ) then
+                          call stdlib_dgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                          call stdlib_dgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n2 ), 1,work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_dscal( n, work( ki+n ), vl( 1, ki ), 1 )
+                          call stdlib_dscal( n, work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n, remax, vl( 1, ki ), 1 )
+                       call stdlib_dscal( n, remax, vl( 1, ki+1 ), 1 )
+                    end if
+                 end if
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+                 250 continue
+                 if( ip==-1 )ip = 0
+                 if( ip==1 )ip = -1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_dtrevc
+
+     !> DTREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by DHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**T)*T = w*(y**T)
+     !> where y**T denotes the transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_dtrevc3( side, howmny, select, n, t, ldt, vl, ldvl,vr, ldvr, mm, m, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(dp), intent(in) :: t(ldt,*)
+           real(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, iv, maxwrk, nb, &
+                     ki2
+           real(dp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(dp) :: x(2,2)
+           integer(ilp) :: iscomplex(nbmax)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DTREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           lquery = ( lwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -14
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_dlaset( 'F', n, 1+2*nb, zero, zero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_dlabad( unfl, ovfl )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first  of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+             ! iscomplex array stores ip for each column in current block.
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex right vector, uses iv-1 for real part and iv for complex part.
+              ! non-blocked version always uses iv=2;
+              ! blocked     version starts with iv=nb, goes down to 1 or 2.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 2
+              if( nb>2 ) then
+                 iv = nb
+              end if
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==-1 ) then
+                    ! previous iteration (ki+1) was second of conjugate pair,
+                    ! so this ki is first of conjugate pair; skip to end of loop
+                    ip = 1
+                    cycle loop_140
+                 else if( ki==1 ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki, ki-1 )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is second of conjugate pair
+                    ip = -1
+                 end if
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )cycle loop_140
+                    else
+                       if( .not.select( ki-1 ) )cycle loop_140
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real right eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = -t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-1,1:ki-1) - wr ]*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_dlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_daxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_dlaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+iv*n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j-1+iv*n ) = x( 1, 1 )
+                          work( j  +iv*n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_daxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+iv*n ), 1 )
+                                    
+                          call stdlib_daxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_dcopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_idamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_dscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>1 )call stdlib_dgemv( 'N', n, ki-1, one, vr, ldvr,work( 1 + iv*n ), &
+                                 1, work( ki + iv*n ),vr( 1, ki ), 1 )
+                       ii = stdlib_idamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_dscal( n, remax, vr( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex right eigenvector.
+                    ! initial solve
+                    ! [ ( t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i*wi) ]*x = 0.
+                    ! [ ( t(ki,  ki-1) t(ki,  ki) )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1 + (iv-1)*n ) = one
+                       work( ki   + (iv  )*n ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1 + (iv-1)*n ) = -wi / t( ki, ki-1 )
+                       work( ki   + (iv  )*n ) = one
+                    end if
+                    work( ki   + (iv-1)*n ) = zero
+                    work( ki-1 + (iv  )*n ) = zero
+                    ! form right-hand side.
+                    do k = 1, ki - 2
+                       work( k+(iv-1)*n ) = -work( ki-1+(iv-1)*n )*t(k,ki-1)
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(k,ki  )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-2,1:ki-2) - (wr+i*wi) ]*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_dlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+(iv-1)*n ), n,wr, wi, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_dscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j+(iv-1)*n ) = x( 1, 1 )
+                          work( j+(iv  )*n ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_daxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n ), 1 )
+                                    
+                          call stdlib_daxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_dlaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+(iv-1)*n ), n, wr, wi, x, 2,scale, xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_dscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j-1+(iv-1)*n ) = x( 1, 1 )
+                          work( j  +(iv-1)*n ) = x( 2, 1 )
+                          work( j-1+(iv  )*n ) = x( 1, 2 )
+                          work( j  +(iv  )*n ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_daxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+(iv-1)*n   ),&
+                                     1 )
+                          call stdlib_daxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n   ), &
+                                    1 )
+                          call stdlib_daxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+(iv  )*n ), &
+                                    1 )
+                          call stdlib_daxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_dcopy( ki, work( 1+(iv-1)*n ), 1, vr(1,is-1), 1 )
+                       call stdlib_dcopy( ki, work( 1+(iv  )*n ), 1, vr(1,is  ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_dscal( ki, remax, vr( 1, is   ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is   ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>2 ) then
+                          call stdlib_dgemv( 'N', n, ki-2, one, vr, ldvr,work( 1    + (iv-1)*n ), &
+                                    1,work( ki-1 + (iv-1)*n ), vr(1,ki-1), 1)
+                          call stdlib_dgemv( 'N', n, ki-2, one, vr, ldvr,work( 1  + (iv)*n ), 1,&
+                                    work( ki + (iv)*n ), vr( 1, ki ), 1 )
+                       else
+                          call stdlib_dscal( n, work(ki-1+(iv-1)*n), vr(1,ki-1), 1)
+                          call stdlib_dscal( n, work(ki  +(iv  )*n), vr(1,ki  ), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_dscal( n, remax, vr( 1, ki   ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + (iv-1)*n ) = zero
+                          work( k + (iv  )*n ) = zero
+                       end do
+                       iscomplex( iv-1 ) = -ip
+                       iscomplex( iv   ) =  ip
+                       iv = iv - 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki-1 and ki)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki - 1
+                    end if
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv<=2) .or. (ki2==1) ) then
+                       call stdlib_dgemm( 'N', 'N', n, nb-iv+1, ki2+nb-iv, one,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,zero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          if( iscomplex(k)==0 ) then
+                             ! real eigenvector
+                             ii = stdlib_idamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1 ) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_dscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_dlacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki2 ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex left vector, uses iv for real part and iv+1 for complex part.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb-1 or nb.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 1
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==1 ) then
+                    ! previous iteration (ki-1) was first of conjugate pair,
+                    ! so this ki is second of conjugate pair; skip to end of loop
+                    ip = -1
+                    cycle loop_260
+                 else if( ki==n ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki+1, ki )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is first of conjugate pair
+                    ip = 1
+                 end if
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_260
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real left eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = ki + 1, n
+                       work( k + iv*n ) = -t( ki, k )
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+1:n,ki+1:n) - wr ]**t * x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_ddot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          ! solve [ t(j,j) - wr ]**t * x = work
+                          call stdlib_dlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+iv*n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_ddot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          work( j+1+iv*n ) = work( j+1+iv*n ) -stdlib_ddot( j-ki-1, t( ki+1, j+1 )&
+                                    , 1,work( ki+1+iv*n ), 1 )
+                          ! solve
+                          ! [ t(j,j)-wr   t(j,j+1)      ]**t * x = scale*( work1 )
+                          ! [ t(j+1,j)    t(j+1,j+1)-wr ]                ( work2 )
+                          call stdlib_dlaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_dscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j  +iv*n ) = x( 1, 1 )
+                          work( j+1+iv*n ) = x( 2, 1 )
+                          vmax = max( abs( work( j  +iv*n ) ),abs( work( j+1+iv*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_dcopy( n-ki+1, work( ki + iv*n ), 1,vl( ki, is ), 1 )
+                       ii = stdlib_idamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n )call stdlib_dgemv( 'N', n, n-ki, one,vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1 + iv*n ), 1,work( ki   + iv*n ), vl( 1, ki ), 1 )
+                       ii = stdlib_idamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_dscal( n, remax, vl( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex left eigenvector.
+                    ! initial solve:
+                    ! [ ( t(ki,ki)    t(ki,ki+1)  )**t - (wr - i* wi) ]*x = 0.
+                    ! [ ( t(ki+1,ki) t(ki+1,ki+1) )                   ]
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki   + (iv  )*n ) = wi / t( ki, ki+1 )
+                       work( ki+1 + (iv+1)*n ) = one
+                    else
+                       work( ki   + (iv  )*n ) = one
+                       work( ki+1 + (iv+1)*n ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1 + (iv  )*n ) = zero
+                    work( ki   + (iv+1)*n ) = zero
+                    ! form right-hand side.
+                    do k = ki + 2, n
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(ki,  k)
+                       work( k+(iv+1)*n ) = -work( ki+1+(iv+1)*n )*t(ki+1,k)
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+2:n,ki+2:n)**t - (wr-i*wi) ]*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_dscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+(iv  )*n ) = work( j+(iv)*n ) -stdlib_ddot( j-ki-2, t( ki+2, j )&
+                                    , 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_ddot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve [ t(j,j)-(wr-i*wi) ]*(x11+i*x12)= wk+i*wk2
+                          call stdlib_dlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_dscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j+(iv  )*n ) = x( 1, 1 )
+                          work( j+(iv+1)*n ) = x( 1, 2 )
+                          vmax = max( abs( work( j+(iv  )*n ) ),abs( work( j+(iv+1)*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_dscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_dscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j  +(iv  )*n ) = work( j+(iv)*n ) -stdlib_ddot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j  +(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_ddot( j-ki-2, t( ki+2,&
+                                     j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          work( j+1+(iv  )*n ) = work( j+1+(iv)*n ) -stdlib_ddot( j-ki-2, t( ki+2,&
+                                     j+1 ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+1+(iv+1)*n ) = work( j+1+(iv+1)*n ) -stdlib_ddot( j-ki-2, t( ki+&
+                                    2, j+1 ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                          ! [ (t(j,j)   t(j,j+1)  )**t - (wr-i*wi)*i ]*x = scale*b
+                          ! [ (t(j+1,j) t(j+1,j+1))                  ]
+                          call stdlib_dlaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_dscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_dscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j  +(iv  )*n ) = x( 1, 1 )
+                          work( j  +(iv+1)*n ) = x( 1, 2 )
+                          work( j+1+(iv  )*n ) = x( 2, 1 )
+                          work( j+1+(iv+1)*n ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ),vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_dcopy( n-ki+1, work( ki + (iv  )*n ), 1,vl( ki, is   ), 1 )
+                                 
+                       call stdlib_dcopy( n-ki+1, work( ki + (iv+1)*n ), 1,vl( ki, is+1 ), 1 )
+                                 
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is   ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is   ), 1 )
+                       call stdlib_dscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is   ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n-1 ) then
+                          call stdlib_dgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv)*n ), 1,work( ki   + (iv)*n ),vl( 1, ki ), 1 )
+                          call stdlib_dgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv+1)*n ), 1,work( ki+1 + (iv+1)*n ),vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_dscal( n, work(ki+  (iv  )*n), vl(1, ki  ), 1)
+                          call stdlib_dscal( n, work(ki+1+(iv+1)*n), vl(1, ki+1), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki   ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_dscal( n, remax, vl( 1, ki   ), 1 )
+                       call stdlib_dscal( n, remax, vl( 1, ki+1 ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + (iv  )*n ) = zero
+                          work( k + (iv+1)*n ) = zero
+                       end do
+                       iscomplex( iv   ) =  ip
+                       iscomplex( iv+1 ) = -ip
+                       iv = iv + 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki and ki+1)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki + 1
+                    end if
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv>=nb-1) .or. (ki2==n) ) then
+                       call stdlib_dgemm( 'N', 'N', n, iv, n-ki2+iv, one,vl( 1, ki2-iv+1 ), ldvl,&
+                                 work( ki2-iv+1 + (1)*n ), n,zero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          if( iscomplex(k)==0) then
+                             ! real eigenvector
+                             ii = stdlib_idamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_dscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_dlacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki2-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_dtrevc3
+
+     !> DTRSYL: solves the real Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**T, and  A and B are both upper quasi-
+     !> triangular. A is M-by-M and B is N-by-N; the right hand side C and
+     !> the solution X are M-by-N; and scale is an output scale factor, set
+     !> <= 1 to avoid overflow in X.
+     !> A and B must be in Schur canonical form (as returned by DHSEQR), that
+     !> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks;
+     !> each 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_dtrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), b(ldb,*)
+           real(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: ierr, j, k, k1, k2, knext, l, l1, l2, lnext
+           real(dp) :: a11, bignum, da11, db, eps, scaloc, sgn, smin, smlnum, suml, sumr, &
+                     xnorm
+           ! Local Arrays 
+           real(dp) :: dum(1), vec(2,2), x(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'T' ) .and. .not.stdlib_lsame( trana, &
+                     'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'T' ) .and. .not.stdlib_lsame( &
+                     tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=dp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_dlange( 'M', m, m, a, lda, dum ),eps*stdlib_dlange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+               ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                        ! m                         l-1
+              ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)].
+                      ! i=k+1                       j=1
+              ! start column loop (index = l)
+              ! l1 (l2) : column index of the first (first) row of x(k,l).
+              lnext = 1
+              loop_60: do l = 1, n
+                 if( l<lnext )cycle loop_60
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l).
+                 knext = m
+                 loop_50: do k = m, 1, -1
+                    if( k>knext )cycle loop_50
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_dlaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_dlaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_dlasy2( .false., .false., isgn, 2, 2,a( k1, k1 ), lda, b( l1, &
+                                 l1 ), ldb, vec,2, scaloc, x, 2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_50
+              end do loop_60
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**t *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1        t                    l-1
+                ! r(k,l) = sum [a(i,k)**t*x(i,l)] +isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                          j=1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = 1
+              loop_120: do l = 1, n
+                 if( l<lnext )cycle loop_120
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_110: do k = 1, m
+                    if( k<knext )cycle loop_110
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_dlaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_dlaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_dlasy2( .true., .false., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_110
+              end do loop_120
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**t*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! top-right corner column by column by
+                 ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                           ! k-1                            n
+                  ! r(k,l) = sum [a(i,k)**t*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                           ! i=1                          j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_180: do l = n, 1, -1
+                 if( l>lnext )cycle loop_180
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_170: do k = 1, m
+                    if( k<knext )cycle loop_170
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_dlaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_dlaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_dlasy2( .true., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, l1 &
+                                 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_170
+              end do loop_180
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-right corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                            ! m                          n
+                  ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                          ! i=k+1                      j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_240: do l = n, 1, -1
+                 if( l>lnext )cycle loop_240
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = m
+                 loop_230: do k = m, 1, -1
+                    if( k>knext )cycle loop_230
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_dlaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_ddot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_dlaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_ddot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_ddot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_dlasy2( .false., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_dscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_230
+              end do loop_240
+           end if
+           return
+     end subroutine stdlib_dtrsyl
+
+     !> DGEBRD: reduces a general real M-by-N matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_dgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'DGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=dp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'DGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'DGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'DGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+nb-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_dlabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+nb:m,i+nb:n), using an update
+              ! of the form  a := a - v*y**t - x*u**t
+              call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, a( i+&
+                        nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, one,a( i+nb, i+nb ), lda )
+              call stdlib_dgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,one, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_dgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_dgebrd
+
+     !> DGEHRD: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_dgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           real(dp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to zero
+           do i = 1, ilo - 1
+              tau( i ) = zero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = zero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin + tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**t
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_dlahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**t. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = one
+                 call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE',ihi, ihi-i-ib+1,ib, -one, work, &
+                           ldwork, a( i+ib, i ), lda, one,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_dtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE','UNIT', i, ib-1,one, a( i+1, i )&
+                           , lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_daxpy( i, -one, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_dlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, n-i-ib+1, &
+                           ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_dgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dgehrd
+
+     !> DGELQT: computes a blocked LQ factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_dgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. ( mb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_dgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_dlarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_dgelqt
+
+     !> DGELS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, or its transpose, using a QR or LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_dgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(dp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(dp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn+max( mn, nrhs ) ) .and. .not.lquery )then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMQR', 'LT', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMLQ', 'LT', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn+max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_dlaset( 'FULL', max( m, n ), nrhs, zero, zero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_dlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_dlange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_dgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+                 call stdlib_dormqr( 'LEFT', 'TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_dtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_dtrtrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_dormqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_dgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_dtrtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_dormlq( 'LEFT', 'TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_dormlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_dtrtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=dp)
+           return
+     end subroutine stdlib_dgels
+
+     !> DGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide LQ
+     !> factorization (DGELQ)
+
+     pure subroutine stdlib_dgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), t(*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_dgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_dlamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_dgemlq
+
+     !> DGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (DGEQR)
+
+     pure subroutine stdlib_dgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), t(*)
+           real(dp), intent(inout) :: c(ldc,*)
+           real(dp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_dgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_dlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_dgemqr
+
+     !> DGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_dgeqp3( m, n, a, lda, jpvt, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = 3*n + 1
+                 nb = stdlib_ilaenv( inb, 'DGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = 2*n + ( n + 1 )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_dswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_dgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_dgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_dorm2r( 'left', 'transpose', m, n-na, na, a, lda,
+      ! cc  $                   tau, a( 1, na+1 ), lda, work, info )
+                 call stdlib_dormqr( 'LEFT', 'TRANSPOSE', m, n-na, na, a, lda, tau,a( 1, na+1 ), &
+                           lda, work, lwork, info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'DGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'DGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = 2*sn + ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = ( lwork-2*sn ) / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'DGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 work( j ) = stdlib_dnrm2( sm, a( nfxd+1, j ), 1 )
+                 work( n+j ) = work( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_dlaqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , work( j ), work( n+j ),work( 2*n+1 ), work( 2*n+jb+1 ), n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_dlaqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         work( j ), work( n+j ),work( 2*n+1 ) )
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_dgeqp3
+
+     !> DGEQRT: computes a blocked QR factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_dgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_dgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_dgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the left
+                 call stdlib_dlarfb( 'L', 'T', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_dgeqrt
+
+     !> DGESV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_dgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_dgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_dgesv
+
+     !> DGESVJ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^t,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+     !> DGESVJ can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_dgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), v(ldv,*), work(lwork)
+           real(dp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           ! Local Scalars 
+           real(dp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           large, mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, &
+                     temp1, theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Local Arrays 
+           real(dp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( work( 1 )<=one ) ) then
+              info = -12
+           else if( lwork<max( m+n, 6 ) ) then
+              info = -13
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVJ', -info )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^t * a(:,j)|/(||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = work( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=dp) )
+              else
+                 ctol = real( m,KIND=dp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_dlamch() works properly on the target machine.)
+           epsln = stdlib_dlamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_dlamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+           big = stdlib_dlamch( 'OVERFLOW' )
+           ! big         = one    / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=dp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=dp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'DGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_dlaset( 'A', mvl, n, zero, one, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl= one / sqrt( real( m,KIND=dp)*real( n,KIND=dp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_dlassq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_dlassq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_dlassq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl= one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_dlaset( 'G', m, n, zero, one, a, lda )
+              work( 1 ) = one
+              work( 2 ) = zero
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_dlascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              work( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 work( 2 ) = one
+              else
+                 work( 2 ) = zero
+              end if
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=dp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( aapp*sqrt( real( n,KIND=dp) ) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=dp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_dlascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl= temp1*skl
+           if( skl/=one ) then
+              call stdlib_dlascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl= one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! a is represented in factored form a = a * diag(work), where diag(work)
+           ! is initialized to identity. work is updated during fast scaled
+           ! rotations.
+           do q = 1, n
+              work( q ) = one
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_dgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_dgesvj. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_dgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,work( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, work( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_dgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,work( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_dgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,work( n2+1 ), sva(&
+                  n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                            ierr )
+                 call stdlib_dgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,work( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_dgsvj0( jobv, m, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, work( n+1 ), lwork-n,ierr )
+                 call stdlib_dgsvj1( jobv, m, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, sfmin,&
+                            tol, 1, work( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_dgsvj0( jobv, n4, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, work( n+1 ), lwork-n,ierr )
+                 call stdlib_dgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, work( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, work( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_dgsvj1( jobv, n2, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, work( n+1 ),lwork-n, ierr )
+                 call stdlib_dgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,work( n2+1 ), sva( n2+1 ),&
+                  mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, ierr )
+                            
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_dswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_dswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = work( p )
+                          work( p ) = work( q )
+                          work( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute stdlib_dnrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_ddot(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_dnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_dnrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_dnrm2( m, a(1,p), 1 ) * work(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_dnrm2( m, a( 1, p ), 1 )*work( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_dlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*work( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_ddot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_ddot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq)/aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_daxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_daxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_dcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp, one, m,1, work( n+1 ), &
+                                                lda,ierr )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*work( p ) / work( q )
+                                      call stdlib_daxpy( m, temp1, work( n+1 ), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_dlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dnrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_dlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dnrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_dlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_ddot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_ddot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_dcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_dlascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_ddot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq)/aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_drotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_drotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_daxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_daxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_daxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_daxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_daxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_daxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_daxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_daxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_daxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_dcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( p ) / work( q )
+                                         call stdlib_daxpy( m, temp1, work( n+1 ),1, a( 1, q ), 1 &
+                                                   )
+                                         call stdlib_dlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_dcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_dlascl( 'G', 0, 0, aaqq, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_dlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( q ) / work( p )
+                                         call stdlib_daxpy( m, temp1, work( n+1 ),1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_dlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dnrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_dlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dnrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_dlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dnrm2( m, a( 1, n ), 1 )*work( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_dlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*work( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=dp) )*tol ) .and. ( real( n,&
+                        KIND=dp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = work( p )
+                 work( p ) = work( q )
+                 work( q ) = temp1
+                 call stdlib_dswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_dswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n2
+                 call stdlib_dscal( m, work( p ) / sva( p ), a( 1, p ), 1 )
+              end do
+           end if
+           ! scale the product of jacobi rotations (assemble the fast rotations).
+           if( rsvec ) then
+              if( applv ) then
+                 do p = 1, n
+                    call stdlib_dscal( mvl, work( p ), v( 1, p ), 1 )
+                 end do
+              else
+                 do p = 1, n
+                    temp1 = one / stdlib_dnrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_dscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+              end if
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl) ) ).or. ( ( skl<one ) .and. ( sva( max(&
+                      n2, 1 ) ) >( sfmin / skl) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl= one
+           end if
+           work( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           work( 2 ) = real( n4,KIND=dp)
+           ! n4 is the number of computed nonzero singular values of a.
+           work( 3 ) = real( n2,KIND=dp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           work( 4 ) = real( i,KIND=dp)
+           ! i is the index of the last sweep before declaring convergence.
+           work( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           work( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_dgesvj
+
+     !> DGESVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), c(*), r(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_dgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_dlaqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_dlacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_dgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_dlantr( 'M', 'U', 'N', info, info, af, ldaf,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_dlange( 'M', n, info, a, lda, work ) / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_dlange( norm, n, n, a, lda, work )
+           rpvgrw = stdlib_dlantr( 'M', 'U', 'N', n, n, af, ldaf, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_dlange( 'M', n, n, a, lda, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dgecon( norm, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_dgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           work( 1 ) = rpvgrw
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dgesvx
+
+     !> DGGES: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> DGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_dgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alphar, &
+               alphai, beta, vsl, ldvsl, vsr,ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_d) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, maxwrk, minwrk
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0 )then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+              else
+                 minwrk = 1
+                 maxwrk = 1
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n space for storing balancing factors)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_dggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_dlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_dgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_dhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 50
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: need 4*n+16 )
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_dtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_dggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_dggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_dlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           50 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgges
+
+     !> DGGESX: computes for a pair of N-by-N real nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the real Schur form (S,T), and,
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**T, (VSL) T (VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_dggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, &
+     alphar, alphai, beta, vsl, ldvsl,vsr, ldvsr, rconde, rcondv, work, lwork, iwork,liwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), rconde(2), rcondv(2), vsl(&
+                     ldvsl,*), vsr(ldvsr,*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_d) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, wantsb, &
+                     wantse, wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, ip, iright, &
+                     irows, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -16
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -18
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 6
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -22
+              else if( liwork<liwmin  .and. .not.lquery ) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n for permutation parameters)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_dggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_dlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_dgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_dhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 60
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           ! (workspace: if ijob >= 1, need max( 8*(n+1), 2*sdim*(n-sdim) )
+                       ! otherwise, need 8*(n+1) )
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              call stdlib_dtgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+              beta, vsl, ldvsl, vsr, ldvsr,sdim, pl, pr, dif, work( iwrk ), lwork-iwrk+1,iwork, &
+                        liwork, ierr )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-22 ) then
+                  ! not enough real workspace
+                 info = -22
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_dggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_dggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_dlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           60 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dggesx
+
+     !> DGGEV: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_dggev( jobvl, jobvr, n, a, lda, b, ldb, alphar, alphai,beta, vl, ldvl, vr, &
+               ldvr, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, maxwrk, minwrk
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              minwrk = max( 1, 8*n )
+              maxwrk = max( 1, n*( 7 +stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 ) ) )
+              maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, 0 ) ) )
+                        
+              if( ilvl ) then
+                 maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, -1 ) ) )
+                           
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_dggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_dlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_dgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_dgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_dhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           ! (workspace: need 6*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_dtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_dggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_dggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dggev
+
+     !> DGGEVX: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_dggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alphar, alphai, &
+     beta, vl, ldvl, vr, ldvr, ilo,ihi, lscale, rscale, abnrm, bbnrm, rconde,rcondv, work, lwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(dp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), lscale(*), rconde(*), rcondv(*)&
+                     , rscale(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, pair, wantsb, wantse, &
+                     wantsn, wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk, mm
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc,'S' ) .or. &
+                     stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) )then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -14
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 if( noscl .and. .not.ilv ) then
+                    minwrk = 2*n
+                 else
+                    minwrk = 6*n
+                 end if
+                 if( wantse .or. wantsb ) then
+                    minwrk = 10*n
+                 end if
+                 if( wantsv .or. wantsb ) then
+                    minwrk = max( minwrk, 2*n*( n + 4 ) + 16 )
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -26
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_dggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,work, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_dlange( '1', n, n, a, lda, work( 1 ) )
+           if( ilascl ) then
+              work( 1 ) = abnrm
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, 1, 1, work( 1 ), 1,ierr )
+              abnrm = work( 1 )
+           end if
+           bbnrm = stdlib_dlange( '1', n, n, b, ldb, work( 1 ) )
+           if( ilbscl ) then
+              work( 1 ) = bbnrm
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, work( 1 ), 1,ierr )
+              bbnrm = work( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_dlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_dgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_dgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_dhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr, work,lwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 130
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! (workspace: stdlib_dtgevc: need 6*n
+                       ! stdlib_dtgsna: need 2*n*(n+2)+16 if sense = 'v' or 'b',
+                               ! need n otherwise )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_dtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work, ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 130
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_dtgevc) and estimate condition
+                 ! numbers (stdlib_dtgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to recalculate
+                 ! eigenvectors and estimate one condition numbers at a time.
+                 pair = .false.
+                 loop_20: do i = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       cycle loop_20
+                    end if
+                    mm = 1
+                    if( i<n ) then
+                       if( a( i+1, i )/=zero ) then
+                          pair = .true.
+                          mm = 2
+                       end if
+                    end if
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    if( mm==1 ) then
+                       bwork( i ) = .true.
+                    else if( mm==2 ) then
+                       bwork( i ) = .true.
+                       bwork( i+1 ) = .true.
+                    end if
+                    iwrk = mm*n + 1
+                    iwrk1 = iwrk + mm*n
+                    ! compute a pair of left and right eigenvectors.
+                    ! (compute workspace: need up to 4*n + 6*n)
+                    if( wantse .or. wantsb ) then
+                       call stdlib_dtgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, mm, m,work( iwrk1 ), ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 130
+                       end if
+                    end if
+                    call stdlib_dtgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), mm, m, work( iwrk1 ),lwork-iwrk1+1, iwork,&
+                               ierr )
+                 end do loop_20
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_dggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_70: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_70
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_70
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                       vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_70
+           end if
+           if( ilvr ) then
+              call stdlib_dggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_120: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_120
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_120
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                       vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_120
+           end if
+           ! undo scaling if necessary
+           130 continue
+           if( ilascl ) then
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dggevx
+
+     !> DGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_dggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           real(dp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = zero
+              end do
+              do i = 1, p
+                 y(i) = zero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**t*a = ( r11 ) m,    q**t*b*z**t = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_dggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = work( m+np+1 )
+           ! update left-hand-side vector d = q**t*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_dormqr( 'LEFT', 'TRANSPOSE', n, 1, m, a, lda, work, d,max( 1, n ), work( m+&
+                     np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_dtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_dcopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = zero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_dgemv( 'NO TRANSPOSE', m, n-m, -one, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1, &
+                     one, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_dtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_dcopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**t *y
+           call stdlib_dormrq( 'LEFT', 'TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), ldb, work( &
+                     m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_dggglm
+
+     !> DGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_dgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           real(dp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**t = (  0  t12 ) p   z**t*a*q**t = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_dggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = work( p+mn+1 )
+           ! update c = z**t *c = ( c1 ) n-p
+                                ! ( c2 ) m+p-n
+           call stdlib_dormqr( 'LEFT', 'TRANSPOSE', m, 1, mn, a, lda, work( p+1 ),c, max( 1, m ), &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_dtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+              call stdlib_dcopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_dgemv( 'NO TRANSPOSE', n-p, p, -one, a( 1, n-p+1 ), lda,d, 1, one, c, 1 &
+                        )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_dtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_dcopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_dgemv( 'NO TRANSPOSE', nr, n-m, -one, a( n-p+1, m+1 ),lda, d( &
+                        nr+1 ), 1, one, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_dtrmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_daxpy( nr, -one, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**t*x
+           call stdlib_dormrq( 'LEFT', 'TRANSPOSE', n, 1, p, b, ldb, work( 1 ), x,n, work( p+mn+1 &
+                     ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_dgglse
+
+     !> DHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a real upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_dhsein( side, eigsrc, initv, select, n, h, ldh, wr, wi,vl, ldvl, vr, ldvr, &
+               mm, m, work, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(dp), intent(in) :: h(ldh,*), wi(*)
+           real(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), wr(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, pair, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ksi, ksr, ldwork
+           real(dp) :: bignum, eps3, hnorm, smlnum, ulp, unfl, wki, wkr
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors, and standardize the array select.
+           m = 0
+           pair = .false.
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+                 select( k ) = .false.
+              else
+                 if( wi( k )==zero ) then
+                    if( select( k ) )m = m + 1
+                 else
+                    pair = .true.
+                    if( select( k ) .or. select( k+1 ) ) then
+                       select( k ) = .true.
+                       m = m + 2
+                    end if
+                 end if
+              end if
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -13
+           else if( mm<m ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ldwork = n + 1
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ksr = 1
+           loop_120: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are zero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==zero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==zero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_dlanhs( 'I', kr-kl+1, h( kl, kl ), ldh, work )
+                    if( stdlib_disnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( hnorm>zero ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wkr = wr( k )
+                 wki = wi( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. abs( wr( i )-wkr )+abs( wi( i )-wki )<eps3 ) &
+                              then
+                       wkr = wkr + eps3
+                       go to 60
+                    end if
+                 end do
+                 wr( k ) = wkr
+                 pair = wki/=zero
+                 if( pair ) then
+                    ksi = ksr + 1
+                 else
+                    ksi = ksr
+                 end if
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_dlaein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wkr, wki, vl( &
+                    kl, ksr ), vl( kl, ksi ),work, ldwork, work( n*n+n+1 ), eps3, smlnum,bignum, &
+                              iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifaill( ksr ) = k
+                       ifaill( ksi ) = k
+                    else
+                       ifaill( ksr ) = 0
+                       ifaill( ksi ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = 1, kl - 1
+                          vl( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_dlaein( .true., noinit, kr, h, ldh, wkr, wki,vr( 1, ksr ), vr( 1, &
+                              ksi ), work, ldwork,work( n*n+n+1 ), eps3, smlnum, bignum,iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifailr( ksr ) = k
+                       ifailr( ksi ) = k
+                    else
+                       ifailr( ksr ) = 0
+                       ifailr( ksi ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = kr + 1, n
+                          vr( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( pair ) then
+                    ksr = ksr + 2
+                 else
+                    ksr = ksr + 1
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_dhsein
+
+     !> DLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(dp) function stdlib_dla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_dpotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_dla_porpvgrw = rpvgrw
+     end function stdlib_dla_porpvgrw
+
+     !> DLAED3: finds the roots of the secular equation, as defined by the
+     !> values in D, W, and RHO, between 1 and K.  It makes the
+     !> appropriate calls to DLAED4 and then updates the eigenvectors by
+     !> multiplying the matrix of eigenvectors of the pair of eigensystems
+     !> being combined by the matrix of eigenvectors of the K-by-K system
+     !> which is solved here.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_dlaed3( k, n, n1, d, q, ldq, rho, dlamda, q2, indx,ctot, w, s, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, n, n1
+           real(dp), intent(in) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), indx(*)
+           real(dp), intent(out) :: d(*), q(ldq,*), s(*)
+           real(dp), intent(inout) :: dlamda(*), w(*)
+           real(dp), intent(in) :: q2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, iq2, j, n12, n2, n23
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( n<k ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dlamda( i ) = stdlib_dlamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = 1, k
+              call stdlib_dlaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 )go to 110
+           if( k==2 ) then
+              do j = 1, k
+                 w( 1 ) = q( 1, j )
+                 w( 2 ) = q( 2, j )
+                 ii = indx( 1 )
+                 q( 1, j ) = w( ii )
+                 ii = indx( 2 )
+                 q( 2, j ) = w( ii )
+              end do
+              go to 110
+           end if
+           ! compute updated w.
+           call stdlib_dcopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_dcopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 s( i ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_dnrm2( k, s, 1 )
+              do i = 1, k
+                 ii = indx( i )
+                 q( i, j ) = s( ii ) / temp
+              end do
+           end do
+           ! compute the updated eigenvectors.
+           110 continue
+           n2 = n - n1
+           n12 = ctot( 1 ) + ctot( 2 )
+           n23 = ctot( 2 ) + ctot( 3 )
+           call stdlib_dlacpy( 'A', n23, k, q( ctot( 1 )+1, 1 ), ldq, s, n23 )
+           iq2 = n1*n12 + 1
+           if( n23/=0 ) then
+              call stdlib_dgemm( 'N', 'N', n2, k, n23, one, q2( iq2 ), n2, s, n23,zero, q( n1+1, &
+                        1 ), ldq )
+           else
+              call stdlib_dlaset( 'A', n2, k, zero, zero, q( n1+1, 1 ), ldq )
+           end if
+           call stdlib_dlacpy( 'A', n12, k, q, ldq, s, n12 )
+           if( n12/=0 ) then
+              call stdlib_dgemm( 'N', 'N', n1, k, n12, one, q2, n1, s, n12, zero, q,ldq )
+           else
+              call stdlib_dlaset( 'A', n1, k, zero, zero, q( 1, 1 ), ldq )
+           end if
+           120 continue
+           return
+     end subroutine stdlib_dlaed3
+
+     !> DLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense symmetric matrix
+     !> that has been reduced to tridiagonal form.  DLAED1 handles
+     !> the case in which all eigenvalues and eigenvectors of a symmetric
+     !> tridiagonal matrix are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**Tu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED8.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by DLAED9).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_dlaed7( icompq, n, qsiz, tlvls, curlvl, curpbm, d, q,ldq, indxq, rho, &
+               cutpnt, qstore, qptr, prmptr,perm, givptr, givcol, givnum, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, icompq, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(dp), intent(inout) :: d(*), givnum(2,*), q(ldq,*), qstore(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, ldq2, &
+                     n1, n2, ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_dlaed8 and stdlib_dlaed9.
+           if( icompq==1 ) then
+              ldq2 = qsiz
+           else
+              ldq2 = n
+           end if
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           is = iq2 + n*ldq2
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_dlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, work( iz ),work( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_dlaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho, cutpnt,work( iz ), work(&
+            idlmda ), work( iq2 ), ldq2,work( iw ), perm( prmptr( curr ) ), givptr( curr+1 ),&
+            givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), iwork( indxp ),iwork( indx ),&
+                       info )
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_dlaed9( k, 1, k, n, d, work( is ), k, rho, work( idlmda ),work( iw ), &
+                        qstore( qptr( curr ) ), k, info )
+              if( info/=0 )go to 30
+              if( icompq==1 ) then
+                 call stdlib_dgemm( 'N', 'N', qsiz, k, k, one, work( iq2 ), ldq2,qstore( qptr( &
+                           curr ) ), k, zero, q, ldq )
+              end if
+              qptr( curr+1 ) = qptr( curr ) + k**2
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_dlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           30 continue
+           return
+     end subroutine stdlib_dlaed7
+
+     !> DLAEXC: swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+     !> an upper quasi-triangular matrix T by an orthogonal similarity
+     !> transformation.
+     !> T must be in Schur canonical form, that is, block upper triangular
+     !> with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+     !> has its diagonal elements equal and its off-diagonal elements of
+     !> opposite sign.
+
+     subroutine stdlib_dlaexc( wantq, n, t, ldt, q, ldq, j1, n1, n2, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, ldq, ldt, n, n1, n2
+           ! Array Arguments 
+           real(dp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldd = 4
+           integer(ilp), parameter :: ldx = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, j2, j3, j4, k, nd
+           real(dp) :: cs, dnorm, eps, scale, smlnum, sn, t11, t22, t33, tau, tau1, tau2, temp, &
+                     thresh, wi1, wi2, wr1, wr2, xnorm
+           ! Local Arrays 
+           real(dp) :: d(ldd,4), u(3), u1(3), u2(3), x(ldx,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 .or. n1==0 .or. n2==0 )return
+           if( j1+n1>n )return
+           j2 = j1 + 1
+           j3 = j1 + 2
+           j4 = j1 + 3
+           if( n1==1 .and. n2==1 ) then
+              ! swap two 1-by-1 blocks.
+              t11 = t( j1, j1 )
+              t22 = t( j2, j2 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_dlartg( t( j1, j2 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( j3<=n )call stdlib_drot( n-j1-1, t( j1, j3 ), ldt, t( j2, j3 ), ldt, cs,sn )
+                        
+              call stdlib_drot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+              t( j1, j1 ) = t22
+              t( j2, j2 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_drot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+           else
+              ! swapping involves at least one 2-by-2 block.
+              ! copy the diagonal block of order n1+n2 to the local array d
+              ! and compute its norm.
+              nd = n1 + n2
+              call stdlib_dlacpy( 'FULL', nd, nd, t( j1, j1 ), ldt, d, ldd )
+              dnorm = stdlib_dlange( 'MAX', nd, nd, d, ldd, work )
+              ! compute machine-dependent threshold for test for accepting
+              ! swap.
+              eps = stdlib_dlamch( 'P' )
+              smlnum = stdlib_dlamch( 'S' ) / eps
+              thresh = max( ten*eps*dnorm, smlnum )
+              ! solve t11*x - x*t22 = scale*t12 for x.
+              call stdlib_dlasy2( .false., .false., -1, n1, n2, d, ldd,d( n1+1, n1+1 ), ldd, d( 1,&
+                         n1+1 ), ldd, scale, x,ldx, xnorm, ierr )
+              ! swap the adjacent diagonal blocks.
+              k = n1 + n1 + n2 - 3
+              go to ( 10, 20, 30 )k
+              10 continue
+              ! n1 = 1, n2 = 2: generate elementary reflector h so that:
+              ! ( scale, x11, x12 ) h = ( 0, 0, * )
+              u( 1 ) = scale
+              u( 2 ) = x( 1, 1 )
+              u( 3 ) = x( 1, 2 )
+              call stdlib_dlarfg( 3, u( 3 ), u, 1, tau )
+              u( 3 ) = one
+              t11 = t( j1, j1 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_dlarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_dlarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 3,3 )-t11 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_dlarfx( 'L', 3, n-j1+1, u, tau, t( j1, j1 ), ldt, work )
+              call stdlib_dlarfx( 'R', j2, 3, u, tau, t( 1, j1 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j3, j3 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_dlarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              20 continue
+              ! n1 = 2, n2 = 1: generate elementary reflector h so that:
+              ! h (  -x11 ) = ( * )
+                ! (  -x21 ) = ( 0 )
+                ! ( scale ) = ( 0 )
+              u( 1 ) = -x( 1, 1 )
+              u( 2 ) = -x( 2, 1 )
+              u( 3 ) = scale
+              call stdlib_dlarfg( 3, u( 1 ), u( 2 ), 1, tau )
+              u( 1 ) = one
+              t33 = t( j3, j3 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_dlarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_dlarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 2, 1 ) ), abs( d( 3, 1 ) ), abs( d( 1,1 )-t33 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_dlarfx( 'R', j3, 3, u, tau, t( 1, j1 ), ldt, work )
+              call stdlib_dlarfx( 'L', 3, n-j1, u, tau, t( j1, j2 ), ldt, work )
+              t( j1, j1 ) = t33
+              t( j2, j1 ) = zero
+              t( j3, j1 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_dlarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              30 continue
+              ! n1 = 2, n2 = 2: generate elementary reflectors h(1) and h(2) so
+              ! that:
+              ! h(2) h(1) (  -x11  -x12 ) = (  *  * )
+                        ! (  -x21  -x22 )   (  0  * )
+                        ! ( scale    0  )   (  0  0 )
+                        ! (    0  scale )   (  0  0 )
+              u1( 1 ) = -x( 1, 1 )
+              u1( 2 ) = -x( 2, 1 )
+              u1( 3 ) = scale
+              call stdlib_dlarfg( 3, u1( 1 ), u1( 2 ), 1, tau1 )
+              u1( 1 ) = one
+              temp = -tau1*( x( 1, 2 )+u1( 2 )*x( 2, 2 ) )
+              u2( 1 ) = -temp*u1( 2 ) - x( 2, 2 )
+              u2( 2 ) = -temp*u1( 3 )
+              u2( 3 ) = scale
+              call stdlib_dlarfg( 3, u2( 1 ), u2( 2 ), 1, tau2 )
+              u2( 1 ) = one
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_dlarfx( 'L', 3, 4, u1, tau1, d, ldd, work )
+              call stdlib_dlarfx( 'R', 4, 3, u1, tau1, d, ldd, work )
+              call stdlib_dlarfx( 'L', 3, 4, u2, tau2, d( 2, 1 ), ldd, work )
+              call stdlib_dlarfx( 'R', 4, 3, u2, tau2, d( 1, 2 ), ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 4, 1 ) ),abs( d( 4, 2 ) ) )&
+                        >thresh )go to 50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_dlarfx( 'L', 3, n-j1+1, u1, tau1, t( j1, j1 ), ldt, work )
+              call stdlib_dlarfx( 'R', j4, 3, u1, tau1, t( 1, j1 ), ldt, work )
+              call stdlib_dlarfx( 'L', 3, n-j1+1, u2, tau2, t( j2, j1 ), ldt, work )
+              call stdlib_dlarfx( 'R', j4, 3, u2, tau2, t( 1, j2 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j4, j1 ) = zero
+              t( j4, j2 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_dlarfx( 'R', n, 3, u1, tau1, q( 1, j1 ), ldq, work )
+                 call stdlib_dlarfx( 'R', n, 3, u2, tau2, q( 1, j2 ), ldq, work )
+              end if
+              40 continue
+              if( n2==2 ) then
+                 ! standardize new 2-by-2 block t11
+                 call stdlib_dlanv2( t( j1, j1 ), t( j1, j2 ), t( j2, j1 ),t( j2, j2 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 call stdlib_drot( n-j1-1, t( j1, j1+2 ), ldt, t( j2, j1+2 ), ldt,cs, sn )
+                 call stdlib_drot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+                 if( wantq )call stdlib_drot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+              if( n1==2 ) then
+                 ! standardize new 2-by-2 block t22
+                 j3 = j1 + n2
+                 j4 = j3 + 1
+                 call stdlib_dlanv2( t( j3, j3 ), t( j3, j4 ), t( j4, j3 ),t( j4, j4 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 if( j3+2<=n )call stdlib_drot( n-j3-1, t( j3, j3+2 ), ldt, t( j4, j3+2 ),ldt, cs,&
+                            sn )
+                 call stdlib_drot( j3-1, t( 1, j3 ), 1, t( 1, j4 ), 1, cs, sn )
+                 if( wantq )call stdlib_drot( n, q( 1, j3 ), 1, q( 1, j4 ), 1, cs, sn )
+              end if
+           end if
+           return
+           ! exit with info = 1 if swap was rejected.
+           50 continue
+           info = 1
+           return
+     end subroutine stdlib_dlaexc
+
+     !> DLAHQR: is an auxiliary routine called by DHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by DHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_dlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: wi(*), wr(*)
+        ! =========================================================
+           ! Parameters 
+           real(dp), parameter :: dat1 = 3.0_dp/4.0_dp
+           real(dp), parameter :: dat2 = -0.4375_dp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: aa, ab, ba, bb, cs, det, h11, h12, h21, h21s, h22, rt1i, rt1r, rt2i, rt2r, &
+                     rtdisc, s, safmax, safmin, smlnum, sn, sum, t1, t2, t3, tr, tst, ulp, v2, v3
+           integer(ilp) :: i, i1, i2, its, itmax, j, k, l, m, nh, nr, nz, kdefl
+           ! Local Arrays 
+           real(dp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              wr( ilo ) = h( ilo, ilo )
+              wi( ilo ) = zero
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = zero
+              h( j+3, j ) = zero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = zero
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=dp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1 or 2. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           20 continue
+           l = ilo
+           if( i<ilo )go to 160
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 or 2 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           loop_140: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( abs( h( k, k-1 ) )<=smlnum )go to 40
+                 tst = abs( h( k-1, k-1 ) ) + abs( h( k, k ) )
+                 if( tst==zero ) then
+                    if( k-2>=ilo )tst = tst + abs( h( k-1, k-2 ) )
+                    if( k+1<=ihi )tst = tst + abs( h( k+1, k ) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation  criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some cases.  ====
+                 if( abs( h( k, k-1 ) )<=ulp*tst ) then
+                    ab = max( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    ba = min( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    aa = max( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 40
+                 end if
+              end do
+              40 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = zero
+              end if
+              ! exit from loop if a submatrix of order 1 or 2 has split off.
+              if( l>=i-1 )go to 150
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                 h11 = dat1*s + h( i, i )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( l+1, l ) ) + abs( h( l+2, l+1 ) )
+                 h11 = dat1*s + h( l, l )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else
+                 ! prepare to use francis' double shift
+                 ! (i.e. 2nd degree generalized rayleigh quotient)
+                 h11 = h( i-1, i-1 )
+                 h21 = h( i, i-1 )
+                 h12 = h( i-1, i )
+                 h22 = h( i, i )
+              end if
+              s = abs( h11 ) + abs( h12 ) + abs( h21 ) + abs( h22 )
+              if( s==zero ) then
+                 rt1r = zero
+                 rt1i = zero
+                 rt2r = zero
+                 rt2i = zero
+              else
+                 h11 = h11 / s
+                 h21 = h21 / s
+                 h12 = h12 / s
+                 h22 = h22 / s
+                 tr = ( h11+h22 ) / two
+                 det = ( h11-tr )*( h22-tr ) - h12*h21
+                 rtdisc = sqrt( abs( det ) )
+                 if( det>=zero ) then
+                    ! ==== complex conjugate shifts ====
+                    rt1r = tr*s
+                    rt2r = rt1r
+                    rt1i = rtdisc*s
+                    rt2i = -rt1i
+                 else
+                    ! ==== realshifts (use only one of them,KIND=dp)  ====
+                    rt1r = tr + rtdisc
+                    rt2r = tr - rtdisc
+                    if( abs( rt1r-h22 )<=abs( rt2r-h22 ) ) then
+                       rt1r = rt1r*s
+                       rt2r = rt1r
+                    else
+                       rt2r = rt2r*s
+                       rt1r = rt2r
+                    end if
+                    rt1i = zero
+                    rt2i = zero
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 2, l, -1
+                 ! determine the effect of starting the double-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.  (the following uses scaling to avoid
+                 ! overflows and most underflows.)
+                 h21s = h( m+1, m )
+                 s = abs( h( m, m )-rt2r ) + abs( rt2i ) + abs( h21s )
+                 h21s = h( m+1, m ) / s
+                 v( 1 ) = h21s*h( m, m+1 ) + ( h( m, m )-rt1r )*( ( h( m, m )-rt2r ) / s ) - &
+                           rt1i*( rt2i / s )
+                 v( 2 ) = h21s*( h( m, m )+h( m+1, m+1 )-rt1r-rt2r )
+                 v( 3 ) = h21s*h( m+2, m+1 )
+                 s = abs( v( 1 ) ) + abs( v( 2 ) ) + abs( v( 3 ) )
+                 v( 1 ) = v( 1 ) / s
+                 v( 2 ) = v( 2 ) / s
+                 v( 3 ) = v( 3 ) / s
+                 if( m==l )go to 60
+                 if( abs( h( m, m-1 ) )*( abs( v( 2 ) )+abs( v( 3 ) ) )<=ulp*abs( v( 1 ) )*( abs( &
+                           h( m-1, m-1 ) )+abs( h( m,m ) )+abs( h( m+1, m+1 ) ) ) )go to 60
+              end do
+              60 continue
+              ! double-shift qr step
+              loop_130: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge one step toward the bottom of the active
+                 ! submatrix. nr is the order of g.
+                 nr = min( 3, i-k+1 )
+                 if( k>m )call stdlib_dcopy( nr, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_dlarfg( nr, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = zero
+                    if( k<i-1 )h( k+2, k-1 ) = zero
+                 else if( m>l ) then
+                     ! ==== use the following instead of
+                     ! .    h( k, k-1 ) = -h( k, k-1 ) to
+                     ! .    avoid a bug when v(2) and v(3)
+                     ! .    underflow. ====
+                    h( k, k-1 ) = h( k, k-1 )*( one-t1 )
+                 end if
+                 v2 = v( 2 )
+                 t2 = t1*v2
+                 if( nr==3 ) then
+                    v3 = v( 3 )
+                    t3 = t1*v3
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j ) + v3*h( k+2, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                       h( k+2, j ) = h( k+2, j ) - sum*t3
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, min( k+3, i )
+                       sum = h( j, k ) + v2*h( j, k+1 ) + v3*h( j, k+2 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                       h( j, k+2 ) = h( j, k+2 ) - sum*t3
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 ) + v3*z( j, k+2 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                          z( j, k+2 ) = z( j, k+2 ) - sum*t3
+                       end do
+                    end if
+                 else if( nr==2 ) then
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, i
+                       sum = h( j, k ) + v2*h( j, k+1 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                       end do
+                    end if
+                 end if
+              end do loop_130
+           end do loop_140
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           150 continue
+           if( l==i ) then
+              ! h(i,i-1) is negligible: one eigenvalue has converged.
+              wr( i ) = h( i, i )
+              wi( i ) = zero
+           else if( l==i-1 ) then
+              ! h(i-1,i-2) is negligible: a pair of eigenvalues have converged.
+              ! transform the 2-by-2 submatrix to standard schur form,
+              ! and compute and store the eigenvalues.
+              call stdlib_dlanv2( h( i-1, i-1 ), h( i-1, i ), h( i, i-1 ),h( i, i ), wr( i-1 ), &
+                        wi( i-1 ), wr( i ), wi( i ),cs, sn )
+              if( wantt ) then
+                 ! apply the transformation to the rest of h.
+                 if( i2>i )call stdlib_drot( i2-i, h( i-1, i+1 ), ldh, h( i, i+1 ), ldh,cs, sn )
+                           
+                 call stdlib_drot( i-i1-1, h( i1, i-1 ), 1, h( i1, i ), 1, cs, sn )
+              end if
+              if( wantz ) then
+                 ! apply the transformation to z.
+                 call stdlib_drot( nz, z( iloz, i-1 ), 1, z( iloz, i ), 1, cs, sn )
+              end if
+           end if
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 20
+           160 continue
+           return
+     end subroutine stdlib_dlahqr
+
+     !> DLASD2: merges the two sets of singular values together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> singular values are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+     !> DLASD2 is called from DLASD1.
+
+     pure subroutine stdlib_dlasd2( nl, nr, sqre, k, d, z, alpha, beta, u, ldu, vt,ldvt, dsigma, &
+               u2, ldu2, vt2, ldvt2, idxp, idx,idxc, idxq, coltyp, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           real(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), idx(*), idxc(*), idxp(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(dp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(dp), intent(out) :: dsigma(*), u2(ldu2,*), vt2(ldvt2,*), z(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(dp) :: c, eps, hlftol, s, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           if( ldu<n ) then
+              info = -10
+           else if( ldvt<m ) then
+              info = -12
+           else if( ldu2<n ) then
+              info = -15
+           else if( ldvt2<m ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD2', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           ! generate the first part of the vector z; and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vt( nlp1, nlp1 )
+           z( 1 ) = z1
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vt( i, nlp1 )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vt( i, nlp2 )
+           end do
+           ! initialize some reference arrays.
+           do i = 2, nlp1
+              coltyp( i ) = 1
+           end do
+           do i = nlp2, n
+              coltyp( i ) = 2
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and the first column of u2
+           ! are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              u2( i, 1 ) = z( idxq( i ) )
+              idxc( i ) = coltyp( idxq( i ) )
+           end do
+           call stdlib_dlamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = u2( idxi, 1 )
+              coltyp( i ) = idxc( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_dlamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 coltyp( j ) = 4
+                 if( j==n )go to 120
+              else
+                 jprev = j
+                 go to 90
+              end if
+           end do
+           90 continue
+           j = jprev
+           100 continue
+           j = j + 1
+           if( j>n )go to 110
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+              coltyp( j ) = 4
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_dlapy2( c, s )
+                 c = c / tau
+                 s = -s / tau
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 ! apply back the givens rotation to the left and right
+                 ! singular vector matrices.
+                 idxjp = idxq( idx( jprev )+1 )
+                 idxj = idxq( idx( j )+1 )
+                 if( idxjp<=nlp1 ) then
+                    idxjp = idxjp - 1
+                 end if
+                 if( idxj<=nlp1 ) then
+                    idxj = idxj - 1
+                 end if
+                 call stdlib_drot( n, u( 1, idxjp ), 1, u( 1, idxj ), 1, c, s )
+                 call stdlib_drot( m, vt( idxjp, 1 ), ldvt, vt( idxj, 1 ), ldvt, c,s )
+                 if( coltyp( j )/=coltyp( jprev ) ) then
+                    coltyp( j ) = 3
+                 end if
+                 coltyp( jprev ) = 4
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 u2( k, 1 ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 100
+           110 continue
+           ! record the last singular value.
+           k = k + 1
+           u2( k, 1 ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           120 continue
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four groups of uniform structure (although one or more of these
+           ! groups may be empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 2, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 2
+           psm( 2 ) = 2 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           ! fill out the idxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's, starting from the
+           ! second column. this applies similarly to the rows of vt.
+           do j = 2, n
+              jp = idxp( j )
+              ct = coltyp( jp )
+              idxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the singular values and corresponding singular vectors into
+           ! dsigma, u2, and vt2 respectively.  the singular values/vectors
+           ! which were not deflated go into the first k slots of dsigma, u2,
+           ! and vt2 respectively, while those which were deflated go into the
+           ! last n - k slots, except that the first column/row will be treated
+           ! separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              idxj = idxq( idx( idxp( idxc( j ) ) )+1 )
+              if( idxj<=nlp1 ) then
+                 idxj = idxj - 1
+              end if
+              call stdlib_dcopy( n, u( 1, idxj ), 1, u2( 1, j ), 1 )
+              call stdlib_dcopy( m, vt( idxj, 1 ), ldvt, vt2( j, 1 ), ldvt2 )
+           end do
+           ! determine dsigma(1), dsigma(2) and z(1)
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_dlapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = z( m ) / z( 1 )
+              end if
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! move the rest of the updating row to z.
+           call stdlib_dcopy( k-1, u2( 2, 1 ), 1, z( 2 ), 1 )
+           ! determine the first column of u2, the first row of vt2 and the
+           ! last row of vt.
+           call stdlib_dlaset( 'A', n, 1, zero, zero, u2, ldu2 )
+           u2( nlp1, 1 ) = one
+           if( m>n ) then
+              do i = 1, nlp1
+                 vt( m, i ) = -s*vt( nlp1, i )
+                 vt2( 1, i ) = c*vt( nlp1, i )
+              end do
+              do i = nlp2, m
+                 vt2( 1, i ) = s*vt( m, i )
+                 vt( m, i ) = c*vt( m, i )
+              end do
+           else
+              call stdlib_dcopy( m, vt( nlp1, 1 ), ldvt, vt2( 1, 1 ), ldvt2 )
+           end if
+           if( m>n ) then
+              call stdlib_dcopy( m, vt( m, 1 ), ldvt, vt2( m, 1 ), ldvt2 )
+           end if
+           ! the deflated singular values and their corresponding vectors go
+           ! into the back of d, u, and v respectively.
+           if( n>k ) then
+              call stdlib_dcopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_dlacpy( 'A', n, n-k, u2( 1, k+1 ), ldu2, u( 1, k+1 ),ldu )
+              call stdlib_dlacpy( 'A', n-k, m, vt2( k+1, 1 ), ldvt2, vt( k+1, 1 ),ldvt )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_dlasd3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           return
+     end subroutine stdlib_dlasd2
+
+     !> DLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a real M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_dlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_dgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_dgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_dtplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1, ctr * m + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_dtplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1, ctr * m + 1), &
+                       ldt,work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_dlaswlq
+
+     !> DLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a real M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_dlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_dgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_dgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_dtpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1, ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_dtpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1, ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_dlatsqr
+
+     !> DORGBR: generates one of the real orthogonal matrices Q or P**T
+     !> determined by DGEBRD when reducing a real matrix A to bidiagonal
+     !> form: A = Q * B * P**T.  Q and P**T are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and DORGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and DORGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+     !> is of order N:
+     !> if k < n, P**T = G(k) . . . G(2) G(1) and DORGBR returns the first m
+     !> rows of P**T, where n >= m >= k;
+     !> if k >= n, P**T = G(n-1) . . . G(2) G(1) and DORGBR returns P**T as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_dorgbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_dorgqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_dorgqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_dorglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_dorglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = work( 1 )
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_dgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_dorgqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = zero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = one
+                 do i = 2, m
+                    a( i, 1 ) = zero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_dorgqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**t, determined by a call to stdlib_dgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_dorglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! row downward, and set the first row and column of p**t to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = one
+                 do i = 2, n
+                    a( i, 1 ) = zero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = zero
+                 end do
+                 if( n>1 ) then
+                    ! form p**t(2:n,2:n)
+                    call stdlib_dorglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dorgbr
+
+     !> If VECT = 'Q', DORMBR: overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> If VECT = 'P', DORMBR overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'T':      P**T * C       C * P**T
+     !> Here Q and P**T are the orthogonal matrices determined by DGEBRD when
+     !> reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+     !> P**T are defined as products of elementary reflectors H(i) and G(i)
+     !> respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the orthogonal matrix Q or P**T that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_dormbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(dp), intent(in) :: tau(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( applyq ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMLQ', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMLQ', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           work( 1 ) = 1
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_dgebrd with nq >= k
+                 call stdlib_dormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_dgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_dormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_dgebrd with nq > k
+                 call stdlib_dormlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_dgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_dormlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dormbr
+
+     !> DPBSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_dpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_dpbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_dpbsv
+
+     !> DPBSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dpbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), s(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_dpbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_dlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_dcopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_dcopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_dpbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlansb( '1', uplo, n, kd, ab, ldab, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dpbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, iwork,info )
+           ! compute the solution matrix x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dpbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_dpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dpbsvx
+
+     !> DPFTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dpftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_dpotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'R', 'L', 'T', 'N', n2, n1, one, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_dsyrk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_dpotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_dpotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'L', 'L', 'N', 'N', n1, n2, one, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_dsyrk( 'U', 'T', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_dpotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_dpotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'L', 'U', 'T', 'N', n1, n2, one, a( 0 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_dsyrk( 'L', 'T', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_dpotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_dpotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'R', 'U', 'N', 'N', n2, n1, one, a( n2*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_dsyrk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_dpotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_dpotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'R', 'L', 'T', 'N', k, k, one, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_dsyrk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_dpotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_dpotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'L', 'L', 'N', 'N', k, k, one, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_dsyrk( 'U', 'T', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_dpotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_dpotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'L', 'U', 'T', 'N', k, k, one, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_dsyrk( 'L', 'T', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_dpotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_dpotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_dtrsm( 'R', 'U', 'N', 'N', k, k, one,a( k*( k+1 ) ), k, a( 0 ), k &
+                              )
+                    call stdlib_dsyrk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_dpotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_dpftrf
+
+     !> DPOSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_dposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_dpotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_dposv
+
+     !> DPOSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_dposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), s(*)
+           real(dp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_dpoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_dlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              call stdlib_dlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_dpotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_dlansy( '1', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_dpocon( uplo, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_dlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_dpotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_dporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_dposvx
+
+     !> DTREXC: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that the diagonal block of T with row index IFST is
+     !> moved to row ILST.
+     !> The real Schur form T is reordered by an orthogonal similarity
+     !> transformation Z**T*T*Z, and optionally the matrix Q of Schur vectors
+     !> is updated by postmultiplying it with Z.
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_dtrexc( compq, n, t, ldt, q, ldq, ifst, ilst, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldt, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: here, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test the input arguments.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.wantq .and. .not.stdlib_lsame( compq, 'N' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of specified block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ifst>1 ) then
+              if( t( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( t( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ilst>1 ) then
+              if( t( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( t( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap block with next one below
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( t( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, nbf, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( t( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here+1, 1, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, 1, nbnext,work, info )
+                              
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, 1,nbnext, work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here+1, 1, 1,work, info )
+                                 
+                       here = here + 2
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap block with next one above
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,nbf, work, &
+                           info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,1, work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, nbnext, 1,work, info )
+                              
+                    here = here - 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here-1, 2, 1,work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_dlaexc( wantq, n, t, ldt, q, ldq, here-1, 1, 1,work, info )
+                                 
+                       here = here - 2
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_dtrexc
+
+     !> DTRSEN: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that a selected cluster of eigenvalues appears in
+     !> the leading diagonal blocks of the upper quasi-triangular matrix T,
+     !> and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_dtrsen( job, compq, select, n, t, ldt, q, ldq, wr, wi,m, s, sep, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, liwork, lwork, n
+           real(dp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(dp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, kk, ks, liwmin, lwmin, n1, n2, nn
+           real(dp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else
+              ! set m to the dimension of the specified invariant subspace,
+              ! and test lwork and liwork.
+              m = 0
+              pair = .false.
+              do k = 1, n
+                 if( pair ) then
+                    pair = .false.
+                 else
+                    if( k<n ) then
+                       if( t( k+1, k )==zero ) then
+                          if( select( k ) )m = m + 1
+                       else
+                          pair = .true.
+                          if( select( k ) .or. select( k+1 ) )m = m + 2
+                       end if
+                    else
+                       if( select( n ) )m = m + 1
+                    end if
+                 end if
+              end do
+              n1 = m
+              n2 = n - m
+              nn = n1*n2
+              if( wantsp ) then
+                 lwmin = max( 1, 2*nn )
+                 liwmin = max( 1, nn )
+              else if( stdlib_lsame( job, 'N' ) ) then
+                 lwmin = max( 1, n )
+                 liwmin = 1
+              else if( stdlib_lsame( job, 'E' ) ) then
+                 lwmin = max( 1, nn )
+                 liwmin = 1
+              end if
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -15
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -17
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_dlange( '1', n, n, t, ldt, work )
+              go to 40
+           end if
+           ! collect the selected blocks at the top-left corner of t.
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( t( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ierr = 0
+                    kk = k
+                    if( k/=ks )call stdlib_dtrexc( compq, n, t, ldt, q, ldq, kk, ks, work,ierr )
+                              
+                    if( ierr==1 .or. ierr==2 ) then
+                       ! blocks too close to swap: exit.
+                       info = 1
+                       if( wants )s = zero
+                       if( wantsp )sep = zero
+                       go to 40
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_20
+           if( wants ) then
+              ! solve sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_dlacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_dtrsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_dlange( 'F', n1, n2, work, n1, work )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_dlacn2( nn, work( nn+1 ), work, iwork, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve  t11*r - r*t22 = scale*x.
+                    call stdlib_dtrsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**t*r - r*t22**t = scale*x.
+                    call stdlib_dtrsyl( 'T', 'T', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! store the output eigenvalues in wr and wi.
+           do k = 1, n
+              wr( k ) = t( k, k )
+              wi( k ) = zero
+           end do
+           do k = 1, n - 1
+              if( t( k+1, k )/=zero ) then
+                 wi( k ) = sqrt( abs( t( k, k+1 ) ) )*sqrt( abs( t( k+1, k ) ) )
+                 wi( k+1 ) = -wi( k )
+              end if
+           end do
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dtrsen
+
+     !> DTRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a real upper
+     !> quasi-triangular matrix T (or of any matrix Q*T*Q**T with Q
+     !> orthogonal).
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_dtrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm, m, &
+               work, ldwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: s(*), sep(*), work(ldwork,*)
+           real(dp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: pair, somcon, wantbh, wants, wantsp
+           integer(ilp) :: i, ierr, ifst, ilst, j, k, kase, ks, n2, nn
+           real(dp) :: bignum, cond, cs, delta, dumm, eps, est, lnrm, mu, prod, prod1, prod2, &
+                     rnrm, scale, smlnum, sn
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(dp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( t( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -13
+              else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ks = 0
+           pair = .false.
+           loop_60: do k = 1, n
+              ! determine whether t(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_60
+              else
+                 if( k<n )pair = t( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_60
+                 else
+                    if( .not.select( k ) )cycle loop_60
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( .not.pair ) then
+                    ! real eigenvalue.
+                    prod = stdlib_ddot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    rnrm = stdlib_dnrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_dnrm2( n, vl( 1, ks ), 1 )
+                    s( ks ) = abs( prod ) / ( rnrm*lnrm )
+                 else
+                    ! complex eigenvalue.
+                    prod1 = stdlib_ddot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    prod1 = prod1 + stdlib_ddot( n, vr( 1, ks+1 ), 1, vl( 1, ks+1 ),1 )
+                    prod2 = stdlib_ddot( n, vl( 1, ks ), 1, vr( 1, ks+1 ), 1 )
+                    prod2 = prod2 - stdlib_ddot( n, vl( 1, ks+1 ), 1, vr( 1, ks ),1 )
+                    rnrm = stdlib_dlapy2( stdlib_dnrm2( n, vr( 1, ks ), 1 ),stdlib_dnrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_dlapy2( stdlib_dnrm2( n, vl( 1, ks ), 1 ),stdlib_dnrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    cond = stdlib_dlapy2( prod1, prod2 ) / ( rnrm*lnrm )
+                    s( ks ) = cond
+                    s( ks+1 ) = cond
+                 end if
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the diagonal
+                 ! block beginning at t(k,k) to the (1,1) position.
+                 call stdlib_dlacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_dtrexc( 'NO Q', n, work, ldwork, dummy, 1, ifst, ilst,work( 1, n+1 ),&
+                            ierr )
+                 if( ierr==1 .or. ierr==2 ) then
+                    ! could not swap because blocks not well separated
+                    scale = one
+                    est = bignum
+                 else
+                    ! reordering successful
+                    if( work( 2, 1 )==zero ) then
+                       ! form c = t22 - lambda*i in work(2:n,2:n).
+                       do i = 2, n
+                          work( i, i ) = work( i, i ) - work( 1, 1 )
+                       end do
+                       n2 = 1
+                       nn = n - 1
+                    else
+                       ! triangularize the 2 by 2 block by unitary
+                       ! transformation u = [  cs   i*ss ]
+                                          ! [ i*ss   cs  ].
+                       ! such that the (1,1) position of work is complex
+                       ! eigenvalue lambda with positive imaginary part. (2,2)
+                       ! position of work is the complex eigenvalue lambda
+                       ! with negative imaginary  part.
+                       mu = sqrt( abs( work( 1, 2 ) ) )*sqrt( abs( work( 2, 1 ) ) )
+                       delta = stdlib_dlapy2( mu, work( 2, 1 ) )
+                       cs = mu / delta
+                       sn = -work( 2, 1 ) / delta
+                       ! form
+                       ! c**t = work(2:n,2:n) + i*[rwork(1) ..... rwork(n-1) ]
+                                                ! [   mu                     ]
+                                                ! [         ..               ]
+                                                ! [             ..           ]
+                                                ! [                  mu      ]
+                       ! where c**t is transpose of matrix c,
+                       ! and rwork is stored starting in the n+1-st column of
+                       ! work.
+                       do j = 3, n
+                          work( 2, j ) = cs*work( 2, j )
+                          work( j, j ) = work( j, j ) - work( 1, 1 )
+                       end do
+                       work( 2, 2 ) = zero
+                       work( 1, n+1 ) = two*mu
+                       do i = 2, n - 1
+                          work( i, n+1 ) = sn*work( 1, i+1 )
+                       end do
+                       n2 = 2
+                       nn = 2*( n-1 )
+                    end if
+                    ! estimate norm(inv(c**t))
+                    est = zero
+                    kase = 0
+                    50 continue
+                    call stdlib_dlacn2( nn, work( 1, n+2 ), work( 1, n+4 ), iwork,est, kase, &
+                              isave )
+                    if( kase/=0 ) then
+                       if( kase==1 ) then
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c**t*x = scale*c.
+                             call stdlib_dlaqtr( .true., .true., n-1, work( 2, 2 ),ldwork, dummy, &
+                                       dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c**t*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_dlaqtr( .true., .false., n-1, work( 2, 2 ),ldwork, work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       else
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c*x = scale*c.
+                             call stdlib_dlaqtr( .false., .true., n-1, work( 2, 2 ),ldwork, dummy,&
+                                        dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_dlaqtr( .false., .false., n-1,work( 2, 2 ), ldwork,work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       end if
+                       go to 50
+                    end if
+                 end if
+                 sep( ks ) = scale / max( est, smlnum )
+                 if( pair )sep( ks+1 ) = sep( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_60
+           return
+     end subroutine stdlib_dtrsna
+
+     !> DGEJSV: computes the singular value decomposition (SVD) of a real M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^t,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+     !> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+     !> DGEJSV can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_dgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: sva(n), u(ldu,*), v(ldv,*), work(lwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           ! Local Scalars 
+           real(dp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lsvec, l2aber, l2kill, &
+                     l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           ! Intrinsic Functions 
+           intrinsic :: abs,log,max,min,real,idnint,sign,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec  .or. stdlib_lsame( jobu, 'N' ) .or.stdlib_lsame( jobu, 'W' )) )&
+                      then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.stdlib_lsame( jobv, 'W' )) &
+                     .or. ( jracc .and. (.not.lsvec) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( l2tran .or. stdlib_lsame( jobt, 'N' ) ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else if ( (.not.(lsvec .or. rsvec .or. errest).and.(lwork < max(7,4*n+1,2*m+n))) .or.(&
+           .not.(lsvec .or. rsvec) .and. errest .and.(lwork < max(7,4*n+n*n,2*m+n))) .or.(lsvec &
+           .and. (.not.rsvec) .and. (lwork < max(7,2*m+n,4*n+1))).or.(rsvec .and. (.not.lsvec) &
+           .and. (lwork < max(7,2*m+n,4*n+1))).or.(lsvec .and. rsvec .and. (.not.jracc) .and.(&
+           lwork<max(2*m+n,6*n+2*n*n))).or. (lsvec .and. rsvec .and. jracc .and.lwork<max(2*m+n,&
+                     4*n+n*n,2*n+n*n+6)))then
+              info = - 17
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'DGEJSV', - info )
+              return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:3) = 0
+              work(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_dlamch() does not fail on the target architecture.
+           epsln = stdlib_dlamch('EPSILON')
+           sfmin = stdlib_dlamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_dlamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=dp)*real(n,KIND=dp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_dlassq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'DGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_dscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_dlaset( 'G', m, n1, zero, one, u, ldu )
+              if ( rsvec ) call stdlib_dlaset( 'G', n, n,  zero, one, v, ldv )
+              work(1) = one
+              work(2) = one
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_dlascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_dlacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                    call stdlib_dgeqrf( m, n, u,ldu, work, work(n+1),lwork-n,ierr )
+                    call stdlib_dorgqr( m,n1,1, u,ldu,work,work(n+1),lwork-n,ierr )
+                    call stdlib_dcopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = one
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              work(1) = one / scalem
+              work(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           l2tran = l2tran .and. ( m == n )
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^t and a^t * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_dlassq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_dlassq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    work(m+n+p)  = xsc * scalem
+                    work(n+p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, work(n+p) )
+                    if (work(n+p) /= zero) aatmin = min(aatmin,work(n+p))
+                 end do
+              else
+                 do p = 1, m
+                    work(m+n+p) = scalem*abs( a(p,stdlib_idamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, work(m+n+p) )
+                    aatmin = min( aatmin, work(m+n+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^t  would be  better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_dlassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=dp))
+              ! now, sva().^2/trace(a^t * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^t * a.  do the same with the
+              ! diagonal of a * a^t, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^t and a^t * a have the
+              ! same trace.
+              entrat = zero
+              do p = n+1, n+m
+                 big1 = ( ( work(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=dp))
+              ! analyze the entropies and decide a or a^t. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^t is better than a, transpose a.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    do q = p + 1, n
+                        temp1 = a(q,p)
+                       a(q,p) = a(p,q)
+                       a(p,q) = temp1
+                    end do
+                 end do
+                 do p = 1, n
+                    work(m+n+p) = sva(p)
+                    sva(p)      = work(n+p)
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_dgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_dgesvj will compute them. so, in that case,
+           ! one should use stdlib_dgesvj instead of stdlib_dgejsv.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=dp) )
+           call stdlib_dlascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_dlascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_dgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_dlaset( 'A', m, 1, zero, zero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              do p = 1, m - 1
+                 q = stdlib_idamax( m-p+1, work(m+n+p), 1 ) + p - 1
+                 iwork(2*n+p) = q
+                 if ( p /= q ) then
+                    temp1       = work(m+n+p)
+                    work(m+n+p) = work(m+n+q)
+                    work(m+n+q) = temp1
+                 end if
+              end do
+              call stdlib_dlaswp( n, a, lda, 1, m-1, iwork(2*n+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_dgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in sgeqpx from toms # 782). good results will be obtained using
+           ! sgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_dgeqp3 improves overall performance of stdlib_dgejsv.
+           ! a * p1 = q1 * [ r1^t 0]^t:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_dgeqp3( m,n,a,lda, iwork,work, work(n+1),lwork-n, ierr )
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_dgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=dp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=dp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_dlacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_dscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    call stdlib_dpocon( 'U', n, v, ldv, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_dlacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_dscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_dpocon( 'U', n, u, ldu, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else
+                    call stdlib_dlacpy( 'U', n, n, a, lda, work(n+1), n )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_dscal( p, one/temp1, work(n+(p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+                    call stdlib_dpocon( 'U', n, work(n+1), n, one, temp1,work(n+n*n+1), iwork(2*n+&
+                              m+1), ierr )
+                 end if
+                 sconda = one / sqrt(temp1)
+                 ! sconda is an estimate of sqrt(||(r^t * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_dcopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+              end do
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=dp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'U', nr-1,nr-1, zero,zero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_dgeqrf( n,nr, a,lda, work, work(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_dcopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 end do
+              end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=dp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'U', nr-1, nr-1, zero, zero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_dgesvj( 'L', 'NOU', 'NOV', nr, nr, a, lda, sva,n, v, ldv, work, &
+                           lwork, info )
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_dcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 call stdlib_dlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_dgesvj( 'L','U','N', n, nr, v,ldv, sva, nr, a,lda,work, lwork, info )
+                           
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_dlaset( 'LOWER', nr-1, nr-1, zero, zero, a(2,1), lda )
+                 call stdlib_dgelqf( nr, n, a, lda, work, work(n+1), lwork-n, ierr)
+                 call stdlib_dlacpy( 'LOWER', nr, nr, a, lda, v, ldv )
+                 call stdlib_dlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_dgeqrf( nr, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_dcopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                 end do
+                 call stdlib_dlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_dgesvj( 'LOWER', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, work(n+1), &
+                           lwork, info )
+                 scalem  = work(n+1)
+                 numrank = nint(work(n+2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_dlaset( 'A',n-nr, nr, zero,zero, v(nr+1,1),   ldv )
+                    call stdlib_dlaset( 'A',nr, n-nr, zero,zero, v(1,nr+1),   ldv )
+                    call stdlib_dlaset( 'A',n-nr,n-nr,zero,one, v(nr+1,nr+1), ldv )
+                 end if
+              call stdlib_dormlq( 'LEFT', 'TRANSPOSE', n, n, nr, a, lda, work,v, ldv, work(n+1), &
+                        lwork-n, ierr )
+              end if
+              do p = 1, n
+                 call stdlib_dcopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+              end do
+              call stdlib_dlacpy( 'ALL', n, n, a, lda, v, ldv )
+              if ( transp ) then
+                 call stdlib_dlacpy( 'ALL', n, n, v, ldv, u, ldu )
+              end if
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_dcopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+              end do
+              call stdlib_dlaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_dgeqrf( n, nr, u, ldu, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              do p = 1, nr - 1
+                 call stdlib_dcopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+              end do
+              call stdlib_dlaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_dgesvj( 'LOWER', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, work(n+1), &
+                        lwork-n, info )
+              scalem  = work(n+1)
+              numrank = nint(work(n+2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_dlaset( 'A',  m-nr, nr,zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_dlaset( 'A',nr, n1-nr, zero, zero, u(1,nr+1), ldu )
+                    call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                 end if
+              end if
+              call stdlib_dormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+              if ( rowpiv )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_dnrm2( m, u(1,p), 1 )
+                 call stdlib_dscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_dlacpy( 'ALL', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_dgejsv.
+                 do p = 1, nr
+                    call stdlib_dcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       temp1 = xsc*abs( v(q,q) )
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    sign( temp1, v(p,q) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_dlacpy( 'L', nr, nr, v, ldv, work(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_dnrm2(nr-p+1,work(2*n+(p-1)*nr+p),1)
+                    call stdlib_dscal(nr-p+1,one/temp1,work(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_dpocon('LOWER',nr,work(2*n+1),nr,one,temp1,work(2*n+nr*nr+1),iwork(m+&
+                           2*n+1),ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=dp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=dp))
+                 cond_ok = sqrt(real(nr,KIND=dp))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^t = q2 * r2
+                    call stdlib_dgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_dlacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_dcopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                    end do
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_dgeqp3
+                    ! should be replaced with eg. call sgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^t * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_dgeqp3( n, nr, v, ldv, iwork(n+1), work(n+1),work(2*n+1), lwork-&
+                              2*n, ierr )
+      ! *               call stdlib_dgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_dlacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             v(p,q) = - sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    else
+                       call stdlib_dlaset( 'L',nr-1,nr-1,zero,zero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_dgelqf( nr, nr, v, ldv, work(2*n+n*nr+1),work(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_dlacpy( 'L',nr,nr,v,ldv,work(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_dnrm2( p, work(2*n+n*nr+nr+p), nr )
+                       call stdlib_dscal( p, one/temp1, work(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_dpocon( 'L',nr,work(2*n+n*nr+nr+1),nr,one,temp1,work(2*n+n*nr+nr+&
+                              nr*nr+1),iwork(m+2*n+1),ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_dlacpy( 'U', nr, nr, v, ldv, work(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       temp1 = xsc * v(q,q)
+                       do p = 1, q - 1
+                          ! v(p,q) = - sign( temp1, v(q,p) )
+                          v(p,q) = - sign( temp1, v(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_dgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u,ldu,work(2*n+n*nr+nr+1),&
+                              lwork-2*n-n*nr-nr,info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_dcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_dscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_dgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_dtrsm( 'L','U','N','N', nr,nr,one, a,lda, v,ldv )
+                    else
+                       ! .. r1 is well conditioned, but non-square. transpose(r2)
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_dgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_dtrsm('L','U','T','N',nr,nr,one,work(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                         call stdlib_dlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                         call stdlib_dlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                         call stdlib_dlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_dormqr('L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                                 n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+       ! :)           .. the input matrix a is very likely a relative of
+                    ! the kahan matrix :)
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^t*v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_dgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_dcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_dscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_dtrsm('L','U','N','N',nr,nr,one,work(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_dlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_dlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_dlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_dormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_dgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_dgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_dgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_dlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_dlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_dlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_dormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_dormlq( 'L', 'T', nr, nr, nr, work(2*n+1), n,work(2*n+n*nr+1), u, &
+                              ldu, work(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=dp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_dnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_dscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_dlaset( 'A', m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                    if ( nr < n1 ) then
+                       call stdlib_dlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                       call stdlib_dlaset('A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_dormqr( 'LEFT', 'NO_TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=dp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_dnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_dscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_dlacpy( 'UPPER', n, n, a, lda, work(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       temp1 = xsc * work( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                          work(n+(q-1)*n+p)=-sign(temp1,work(n+(p-1)*n+q))
+                       end do
+                    end do
+                 else
+                    call stdlib_dlaset( 'LOWER',n-1,n-1,zero,zero,work(n+2),n )
+                 end if
+                 call stdlib_dgesvj( 'UPPER', 'U', 'N', n, n, work(n+1), n, sva,n, u, ldu, work(n+&
+                           n*n+1), lwork-n-n*n, info )
+                 scalem  = work(n+n*n+1)
+                 numrank = nint(work(n+n*n+2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_dcopy( n, work(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_dscal( n, sva(p), work(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_dtrsm( 'LEFT', 'UPPER', 'NOTRANS', 'NO UD', n, n,one, a, lda, work(n+&
+                           1), n )
+                 do p = 1, n
+                    call stdlib_dcopy( n, work(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=dp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_dnrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_dscal( n, xsc, &
+                              v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_dlaset( 'A',  m-n, n, zero, zero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_dlaset( 'A',n,  n1-n, zero, zero,  u(1,n+1),ldu )
+                       call stdlib_dlaset( 'A',m-n,n1-n, zero, one,u(n+1,n+1),ldu )
+                    end if
+                 end if
+                 call stdlib_dormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 temp1 = sqrt(real(m,KIND=dp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_dnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_dscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values. since that is not always the case, ...
+              do p = 1, nr
+                 call stdlib_dcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    temp1 = xsc*abs( v(q,q) )
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = sign(&
+                                  temp1, v(p,q) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_dlaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+              end if
+              call stdlib_dgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              call stdlib_dlacpy( 'L', n, nr, v, ldv, work(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_dcopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       temp1 = xsc * min(abs(u(p,p)),abs(u(q,q)))
+                       u(p,q) = - sign( temp1, u(q,p) )
+                    end do
+                 end do
+              else
+                 call stdlib_dlaset('U', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              end if
+              call stdlib_dgesvj( 'G', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, work(2*n+n*nr+1), &
+                        lwork-2*n-n*nr, info )
+              scalem  = work(2*n+n*nr+1)
+              numrank = nint(work(2*n+n*nr+2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_dlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                 call stdlib_dlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                 call stdlib_dlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_dormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+n*nr+nr+1)&
+                        ,lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=dp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_dnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_dscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_dlaset( 'A',  m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_dlaset( 'A',nr,  n1-nr, zero, zero,  u(1,nr+1),ldu )
+                    call stdlib_dlaset( 'A',m-nr,n1-nr, zero, one,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_dormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+                 if ( rowpiv )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^t
+                 do p = 1, n
+                    call stdlib_dswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_dlascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           work(1) = uscal2 * scalem
+           work(2) = uscal1
+           if ( errest ) work(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              work(4) = condr1
+              work(5) = condr2
+           end if
+           if ( l2tran ) then
+              work(6) = entra
+              work(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           return
+     end subroutine stdlib_dgejsv
+
+     !> DGELQ: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_dgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'DGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'DGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if ( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+                 lminws = .true.
+                 mb = 1
+                 nb = n
+             end if
+             if( lwork<lwopt ) then
+                 lminws = .true.
+                 mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_dgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_dlaswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_dgelq
+
+     !> DGELSY: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by orthogonal transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**T [ inv(T11)*Q1**T*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+
+     subroutine stdlib_dgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkmin, lwkopt, mn, nb, nb1, nb2, &
+                     nb3, nb4
+           real(dp) :: anrm, bignum, bnrm, c1, c2, s1, s2, smax, smaxpr, smin, sminpr, smlnum, &
+                     wsize
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           end if
+           ! figure out optimal block size
+           if( info==0 ) then
+              if( mn==0 .or. nrhs==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', m, n, nrhs, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, nrhs, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = mn + max( 2*mn, n + 1, mn + nrhs )
+                 lwkopt = max( lwkmin,mn + 2*n + nb*( n + 1 ), 2*mn + nb*nrhs )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( mn==0 .or. nrhs==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_dlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_dlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_dgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, info )
+                     
+           wsize = mn + work( mn+1 )
+           ! workspace: mn+2*n+nb*(n+1).
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = one
+           work( ismax ) = one
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_dlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_dlaic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_dlaic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_dtzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+           call stdlib_dormqr( 'LEFT', 'TRANSPOSE', m, nrhs, mn, a, lda, work( 1 ),b, ldb, work( &
+                     2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+work( 2*mn+1 ) )
+           ! workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_dtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, one, a, lda,&
+                      b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = zero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**t * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_dormrz( 'LEFT', 'TRANSPOSE', n, nrhs, rank, n-rank, a,lda, work( mn+1 ),&
+                         b, ldb, work( 2*mn+1 ),lwork-2*mn, info )
+           end if
+           ! workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_dcopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dgelsy
+
+     !> DGEQR: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_dgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'DGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'DGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_dgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_dlatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_dgeqr
+
+     !> DGETSLS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_dgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(dp) :: anrm, bignum, bnrm, smlnum, tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'T' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_dgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_dgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_dgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_dgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_dgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_dgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_dgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_dgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=dp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=dp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_dlaset( 'FULL', max( m, n ), nrhs, zero, zero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_dlaset( 'F', maxmn, nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_dlange( 'M', brow, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_dgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_dgemqr( 'L' , 'T', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_dtrtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_dtrtrs( 'U', 'T', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_dgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_dgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_dtrtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_dgemlq( 'L', 'T', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_dgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_dtrtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_dlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_dlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_dlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=dp)
+           return
+     end subroutine stdlib_dgetsls
+
+     !> DGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a real M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in DGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of DGEQRT for more details on the format.
+
+     pure subroutine stdlib_dgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,real,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_dlatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_dorgtsqr_row;
+              ! d) diagonal d for stdlib_dorhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=dp) / real( mb1 - n,&
+                           KIND=dp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_dorgtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=dp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_dlatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_dcopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_dorgtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_dorhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_dorhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-one ) then
+                 do j = i, n
+                    a( i, j ) = -one * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_dcopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_dgetsqrhrt
+
+     !> DLAED2: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_dlaed2( k, n, n1, d, q, ldq, indxq, rho, z, dlamda, w,q2, indx, indxc,&
+                indxp, coltyp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldq, n, n1
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), indx(*), indxc(*), indxp(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(dp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(dp), intent(out) :: dlamda(*), q2(*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: mone = -1.0_dp
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, imax, iq1, iq2, j, jmax, js, k2, n1p1, n2, nj, pj
+           real(dp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( min( 1, ( n / 2 ) )>n1 .or. ( n / 2 )<n1 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_dscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1.  since z is the concatenation of
+           ! two normalized vectors, norm2(z) = sqrt(2).
+           t = one / sqrt( two )
+           call stdlib_dscal( n, t, z, 1 )
+           ! rho = abs( norm(z)**2 * rho )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = n1p1, n
+              indxq( i ) = indxq( i ) + n1
+           end do
+           ! re-integrate the deflated parts from the last pass
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+           end do
+           call stdlib_dlamrg( n1, n2, dlamda, 1, 1, indxc )
+           do i = 1, n
+              indx( i ) = indxq( indxc( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_idamax( n, z, 1 )
+           jmax = stdlib_idamax( n, d, 1 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           tol = eight*eps*max( abs( d( jmax ) ), abs( z( imax ) ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              iq2 = 1
+              do j = 1, n
+                 i = indx( j )
+                 call stdlib_dcopy( n, q( 1, i ), 1, q2( iq2 ), 1 )
+                 dlamda( j ) = d( i )
+                 iq2 = iq2 + n
+              end do
+              call stdlib_dlacpy( 'A', n, n, q2, n, q, ldq )
+              call stdlib_dcopy( n, dlamda, 1, d, 1 )
+              go to 190
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           do i = 1, n1
+              coltyp( i ) = 1
+           end do
+           do i = n1p1, n
+              coltyp( i ) = 3
+           end do
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              nj = indx( j )
+              if( rho*abs( z( nj ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 coltyp( nj ) = 4
+                 indxp( k2 ) = nj
+                 if( j==n )go to 100
+              else
+                 pj = nj
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           nj = indx( j )
+           if( j>n )go to 100
+           if( rho*abs( z( nj ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              coltyp( nj ) = 4
+              indxp( k2 ) = nj
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( pj )
+              c = z( nj )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_dlapy2( c, s )
+              t = d( nj ) - d( pj )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( nj ) = tau
+                 z( pj ) = zero
+                 if( coltyp( nj )/=coltyp( pj ) )coltyp( nj ) = 2
+                 coltyp( pj ) = 4
+                 call stdlib_drot( n, q( 1, pj ), 1, q( 1, nj ), 1, c, s )
+                 t = d( pj )*c**2 + d( nj )*s**2
+                 d( nj ) = d( pj )*s**2 + d( nj )*c**2
+                 d( pj ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( pj )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = pj
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = pj
+                    end if
+                 else
+                    indxp( k2+i-1 ) = pj
+                 end if
+                 pj = nj
+              else
+                 k = k + 1
+                 dlamda( k ) = d( pj )
+                 w( k ) = z( pj )
+                 indxp( k ) = pj
+                 pj = nj
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           dlamda( k ) = d( pj )
+           w( k ) = z( pj )
+           indxp( k ) = pj
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four uniform groups (although one or more of these groups may be
+           ! empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 1, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 1
+           psm( 2 ) = 1 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           k = n - ctot( 4 )
+           ! fill out the indxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's.
+           do j = 1, n
+              js = indxp( j )
+              ct = coltyp( js )
+              indx( psm( ct ) ) = js
+              indxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           i = 1
+           iq1 = 1
+           iq2 = 1 + ( ctot( 1 )+ctot( 2 ) )*n1
+           do j = 1, ctot( 1 )
+              js = indx( i )
+              call stdlib_dcopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+           end do
+           do j = 1, ctot( 2 )
+              js = indx( i )
+              call stdlib_dcopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              call stdlib_dcopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+              iq2 = iq2 + n2
+           end do
+           do j = 1, ctot( 3 )
+              js = indx( i )
+              call stdlib_dcopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq2 = iq2 + n2
+           end do
+           iq1 = iq2
+           do j = 1, ctot( 4 )
+              js = indx( i )
+              call stdlib_dcopy( n, q( 1, js ), 1, q2( iq2 ), 1 )
+              iq2 = iq2 + n
+              z( i ) = d( js )
+              i = i + 1
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_dlacpy( 'A', n, ctot( 4 ), q2( iq1 ), n,q( 1, k+1 ), ldq )
+              call stdlib_dcopy( n-k, z( k+1 ), 1, d( k+1 ), 1 )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_dlaed3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           190 continue
+           return
+     end subroutine stdlib_dlaed2
+
+     !> DLAQR2: is identical to DLAQR3 except that it avoids
+     !> recursion by calling DLAHQR instead of DLAQR4.
+     !> Aggressive early deflation:
+     !> This subroutine accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_dlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwkopt
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_dgehrd ====
+              call stdlib_dgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_dormhr ====
+              call stdlib_dormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=dp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_dlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_dcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_dlaset( 'A', jw, jw, zero, one, v, ldv )
+           call stdlib_dlahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, jw, &
+                     v, ldv, infqr )
+           ! ==== stdlib_dtrexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not.bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_dtrexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_dtrexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_dlanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_dcopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_dlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_dlaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_dlarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_dlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_dlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_dgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_dlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_dcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_dormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_dgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_dlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_dgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_dlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_dgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_dlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=dp)
+     end subroutine stdlib_dlaqr2
+
+     !> DLASD1: computes the SVD of an upper bidiagonal N-by-M matrix B,
+     !> where N = NL + NR + 1 and M = N + SQRE. DLASD1 is called from DLASD0.
+     !> A related subroutine DLASD7 handles the case in which the singular
+     !> values (and the singular vectors in factored form) are desired.
+     !> DLASD1 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The left singular vectors of the original matrix are stored in U, and
+     !> the transpose of the right singular vectors are stored in VT, and the
+     !> singular values are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or when there are zeros in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLASD2.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the square roots of the
+     !> roots of the secular equation via the routine DLASD4 (as called
+     !> by DLASD3). This routine also calculates the singular vectors of
+     !> the current problem.
+     !> The final stage consists of computing the updated singular vectors
+     !> directly using the updated singular values.  The singular vectors
+     !> for the current problem are multiplied with the singular vectors
+     !> from the overall problem.
+
+     pure subroutine stdlib_dlasd1( nl, nr, sqre, d, alpha, beta, u, ldu, vt, ldvt,idxq, iwork, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, nl, nr, sqre
+           real(dp), intent(inout) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: idxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, i, idx, idxc, idxp, iq, isigma, iu2, ivt2, iz, k, ldq, ldu2, &
+                     ldvt2, m, n, n1, n2
+           real(dp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD1', -info )
+              return
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_dlasd2 and stdlib_dlasd3.
+           ldu2 = n
+           ldvt2 = m
+           iz = 1
+           isigma = iz + m
+           iu2 = isigma + n
+           ivt2 = iu2 + ldu2*n
+           iq = ivt2 + ldvt2*m
+           idx = 1
+           idxc = idx + n
+           coltyp = idxc + n
+           idxp = coltyp + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! deflate singular values.
+           call stdlib_dlasd2( nl, nr, sqre, k, d, work( iz ), alpha, beta, u, ldu,vt, ldvt, work(&
+            isigma ), work( iu2 ), ldu2,work( ivt2 ), ldvt2, iwork( idxp ), iwork( idx ),iwork( &
+                      idxc ), idxq, iwork( coltyp ), info )
+           ! solve secular equation and update singular vectors.
+           ldq = k
+           call stdlib_dlasd3( nl, nr, sqre, k, d, work( iq ), ldq, work( isigma ),u, ldu, work( &
+           iu2 ), ldu2, vt, ldvt, work( ivt2 ),ldvt2, iwork( idxc ), iwork( coltyp ), work( iz ),&
+                     info )
+           ! report the convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! unscale.
+           call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_dlamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_dlasd1
+
+     !> DLAED1: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix.  This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and eigenvectors of a tridiagonal matrix.  DLAED7 handles
+     !> the case in which eigenvalues only or eigenvalues and eigenvectors
+     !> of a full symmetric matrix (which was reduced to tridiagonal form)
+     !> are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**T*u, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by DLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_dlaed1( n, d, q, ldq, indxq, rho, cutpnt, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: indxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), q(ldq,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, n1, n2, &
+                     zpp1
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ldq<max( 1, n ) ) then
+              info = -4
+           else if( min( 1, n / 2 )>cutpnt .or. ( n / 2 )<cutpnt ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED1', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are integer pointers which indicate
+           ! the portion of the workspace
+           ! used by a particular array in stdlib_dlaed2 and stdlib_dlaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           call stdlib_dcopy( cutpnt, q( cutpnt, 1 ), ldq, work( iz ), 1 )
+           zpp1 = cutpnt + 1
+           call stdlib_dcopy( n-cutpnt, q( zpp1, zpp1 ), ldq, work( iz+cutpnt ), 1 )
+           ! deflate eigenvalues.
+           call stdlib_dlaed2( k, n, cutpnt, d, q, ldq, indxq, rho, work( iz ),work( idlmda ), &
+           work( iw ), work( iq2 ),iwork( indx ), iwork( indxc ), iwork( indxp ),iwork( coltyp ), &
+                     info )
+           if( info/=0 )go to 20
+           ! solve secular equation.
+           if( k/=0 ) then
+              is = ( iwork( coltyp )+iwork( coltyp+1 ) )*cutpnt +( iwork( coltyp+1 )+iwork( &
+                        coltyp+2 ) )*( n-cutpnt ) + iq2
+              call stdlib_dlaed3( k, n, cutpnt, d, q, ldq, rho, work( idlmda ),work( iq2 ), iwork(&
+                         indxc ), iwork( coltyp ),work( iw ), work( is ), info )
+              if( info/=0 )go to 20
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_dlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           20 continue
+           return
+     end subroutine stdlib_dlaed1
+
+     !> DLAED0: computes all eigenvalues and corresponding eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+
+     pure subroutine stdlib_dlaed0( icompq, qsiz, n, d, e, q, ldq, qstore, ldqs,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldq, ldqs, n, qsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*), q(ldq,*)
+           real(dp), intent(out) :: qstore(ldqs,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, subpbs, &
+                     tlvls
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>2 ) then
+              info = -1
+           else if( ( icompq==1 ) .and. ( qsiz<max( 0, n ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( ldqs<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'DLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           if( icompq/=2 ) then
+              ! set up workspaces for eigenvalues only/accumulate new vectors
+              ! routine
+              temp = log( real( n,KIND=dp) ) / log( two )
+              lgn = int( temp,KIND=ilp)
+              if( 2**lgn<n )lgn = lgn + 1
+              if( 2**lgn<n )lgn = lgn + 1
+              iprmpt = indxq + n + 1
+              iperm = iprmpt + n*lgn
+              iqptr = iperm + n*lgn
+              igivpt = iqptr + n + 2
+              igivcl = igivpt + n*lgn
+              igivnm = 1
+              iq = igivnm + 2*n*lgn
+              iwrem = iq + n**2 + 1
+              ! initialize pointers
+              do i = 0, subpbs
+                 iwork( iprmpt+i ) = 1
+                 iwork( igivpt+i ) = 1
+              end do
+              iwork( iqptr ) = 1
+           end if
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           loop_70: do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              if( icompq==2 ) then
+                 call stdlib_dsteqr( 'I', matsiz, d( submat ), e( submat ),q( submat, submat ), &
+                           ldq, work, info )
+                 if( info/=0 )go to 130
+              else
+                 call stdlib_dsteqr( 'I', matsiz, d( submat ), e( submat ),work( iq-1+iwork( &
+                           iqptr+curr ) ), matsiz, work,info )
+                 if( info/=0 )go to 130
+                 if( icompq==1 ) then
+                    call stdlib_dgemm( 'N', 'N', qsiz, matsiz, matsiz, one,q( 1, submat ), ldq, &
+                    work( iq-1+iwork( iqptr+curr ) ), matsiz, zero, qstore( 1, submat ),ldqs )
+                              
+                 end if
+                 iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+                 curr = curr + 1
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do loop_70
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              loop_90: do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.
+           ! stdlib_dlaed1 is used only for the full eigensystem of a tridiagonal
+           ! matrix.
+           ! stdlib_dlaed7 handles the cases in which eigenvalues only or eigenvalues
+           ! and eigenvectors of a full symmetric matrix (which was reduced to
+           ! tridiagonal form) are desired.
+                 if( icompq==2 ) then
+                    call stdlib_dlaed1( matsiz, d( submat ), q( submat, submat ),ldq, iwork( &
+                    indxq+submat ),e( submat+msd2-1 ), msd2, work,iwork( subpbs+1 ), info )
+                              
+                 else
+                    call stdlib_dlaed7( icompq, matsiz, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                    qstore( 1, submat ), ldqs,iwork( indxq+submat ), e( submat+msd2-1 ),msd2, &
+                    work( iq ), iwork( iqptr ),iwork( iprmpt ), iwork( iperm ),iwork( igivpt ), &
+                    iwork( igivcl ),work( igivnm ), work( iwrem ),iwork( subpbs+1 ), info )
+                              
+                 end if
+                 if( info/=0 )go to 130
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do loop_90
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           if( icompq==1 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_dcopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+              end do
+              call stdlib_dcopy( n, work, 1, d, 1 )
+           else if( icompq==2 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_dcopy( n, q( 1, j ), 1, work( n*i+1 ), 1 )
+              end do
+              call stdlib_dcopy( n, work, 1, d, 1 )
+              call stdlib_dlacpy( 'A', n, n, work( n+1 ), n, q, ldq )
+           else
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+              end do
+              call stdlib_dcopy( n, work, 1, d, 1 )
+           end if
+           go to 140
+           130 continue
+           info = submat*( n+1 ) + submat + matsiz - 1
+           140 continue
+           return
+     end subroutine stdlib_dlaed0
+
+     !> DSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band real symmetric matrix can also be
+     !> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLAED3 for details.
+
+     pure subroutine stdlib_dstedc( compz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, lwmin, m, smlsiz, start, &
+                     storez, strtrw
+           real(dp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max,mod,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'DSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else if( n<=smlsiz ) then
+                 liwmin = 1
+                 lwmin = 2*( n - 1 )
+              else
+                 lgn = int( log( real( n,KIND=dp) )/log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( icompz==1 ) then
+                    lwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                    liwmin = 6 + 6*n + 5*n*lgn
+                 else if( icompz==2 ) then
+                    lwmin = 1 + 4*n + n**2
+                    liwmin = 3 + 5*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not. lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not. lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEDC', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_dsterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_dsterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_dsterf( n, d, e, info )
+              go to 50
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_dsteqr( compz, n, d, e, z, ldz, work, info )
+           else
+              ! if compz = 'v', the z matrix must be stored elsewhere for later
+              ! use.
+              if( icompz==1 ) then
+                 storez = 1 + n*n
+              else
+                 storez = 1
+              end if
+              if( icompz==2 ) then
+                 call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+              end if
+              ! scale.
+              orgnrm = stdlib_dlanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 50
+              eps = stdlib_dlamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              10 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 20 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 20
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m==1 ) then
+                    start = finish + 1
+                    go to 10
+                 end if
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_dlanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_dlascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_dlascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    if( icompz==1 ) then
+                       strtrw = 1
+                    else
+                       strtrw = start
+                    end if
+                    call stdlib_dlaed0( icompz, n, m, d( start ), e( start ),z( strtrw, start ), &
+                              ldz, work( 1 ), n,work( storez ), iwork, info )
+                    if( info/=0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 50
+                    end if
+                    ! scale back.
+                    call stdlib_dlascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    if( icompz==1 ) then
+                       ! since qr won't update a z matrix which is larger than
+                       ! the length of d, we must solve the sub-problem in a
+                       ! workspace and then multiply back into z.
+                       call stdlib_dsteqr( 'I', m, d( start ), e( start ), work, m,work( m*m+1 ), &
+                                 info )
+                       call stdlib_dlacpy( 'A', n, m, z( 1, start ), ldz,work( storez ), n )
+                                 
+                       call stdlib_dgemm( 'N', 'N', n, m, m, one,work( storez ), n, work, m, zero,&
+                                 z( 1, start ), ldz )
+                    else if( icompz==2 ) then
+                       call stdlib_dsteqr( 'I', m, d( start ), e( start ),z( start, start ), ldz, &
+                                 work, info )
+                    else
+                       call stdlib_dsterf( m, d( start ), e( start ), info )
+                    end if
+                    if( info/=0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 50
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 10
+              end if
+              ! endwhile
+              if( icompz==0 ) then
+                ! use quick sort
+                call stdlib_dlasrt( 'I', n, d, info )
+              else
+                ! use selection sort to minimize swaps of eigenvectors
+                do ii = 2, n
+                   i = ii - 1
+                   k = i
+                   p = d( i )
+                   do j = ii, n
+                      if( d( j )<p ) then
+                         k = j
+                         p = d( j )
+                      end if
+                   end do
+                   if( k/=i ) then
+                      d( k ) = d( i )
+                      d( i ) = p
+                      call stdlib_dswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                   end if
+                end do
+              end if
+           end if
+           50 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dstedc
+
+     !> DSTEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix. If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_dstevd( jobz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iscale, liwmin, lwmin
+           real(dp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           liwmin = 1
+           lwmin = 1
+           if( n>1 .and. wantz ) then
+              lwmin = 1 + 4*n + n**2
+              liwmin = 3 + 5*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_dlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( n, sigma, d, 1 )
+              call stdlib_dscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_dstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, d, e, info )
+           else
+              call stdlib_dstedc( 'I', n, d, e, z, ldz, work, lwork, iwork, liwork,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_dscal( n, one / sigma, d, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dstevd
+
+     !> DSYEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A. If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> Because of large use of BLAS of level 3, DSYEVD needs N**2 more
+     !> workspace than DSYEVX.
+
+     subroutine stdlib_dsyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwk2, indwrk, iscale, liopt, liwmin, llwork, &
+                     llwrk2, lopt, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+                 lopt = lwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n + 1
+                 end if
+                 lopt = max( lwmin, 2*n +stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 ) )
+                           
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_dlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_dsytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_dsytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_dstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_dormtr to multiply it by the
+           ! householder transformations stored in a.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_dormtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_dlacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_dscal( n, one / sigma, w, 1 )
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_dsyevd
+
+     !> DSYGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dsygvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 6*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n + 1
+           end if
+           lopt = lwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_dpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_dsyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork,info )
+           lopt = max( real( lopt,KIND=dp), real( work( 1 ),KIND=dp) )
+           liopt = max( real( liopt,KIND=dp), real( iwork( 1 ),KIND=dp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_dtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_dtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_dsygvd
+
+     !> DSBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A. If eigenvectors are desired, it uses
+     !> a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dsbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, iwork, liwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, iscale, liwmin, llwrk2, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else
+              if( wantz ) then
+                 liwmin = 3 + 5*n
+                 lwmin = 1 + 5*n + 2*n**2
+              else
+                 liwmin = 1
+                 lwmin = 2*n
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ab( 1, 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_dlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_dlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_dsbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_dsbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_sstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_dgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_dlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_dscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dsbevd
+
+     !> DSBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of the
+     !> form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric and
+     !> banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_dsbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llwrk2, lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 5*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_dpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_dsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, iinfo )
+                     
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_dsbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_dsterf. for eigenvectors, call stdlib_sstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              call stdlib_dstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_dgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_dlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dsbgvd
+
+     !> DSPEVD: computes all the eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage. If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,iwork, liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ap(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwrk, iscale, liwmin, llwork, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              iwork( 1 ) = liwmin
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_dlansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_dsptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_dsptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_dstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_dopmtr to multiply it by the
+           ! householder transformations represented in ap.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              llwork = lwork - indwrk + 1
+              call stdlib_dstedc( 'I', n, w, work( inde ), z, ldz, work( indwrk ),llwork, iwork, &
+                        liwork, info )
+              call stdlib_dopmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_dscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dspevd
+
+     !> DSPGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dspgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: ap(*), bp(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of bp.
+           call stdlib_dpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_dspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_dspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, iwork,liwork, info )
+                     
+           lwmin = max( real( lwmin,KIND=dp), real( work( 1 ),KIND=dp) )
+           liwmin = max( real( liwmin,KIND=dp), real( iwork( 1 ),KIND=dp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_dtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t *y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_dtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dspgvd
+
+     !> DBDSDC: computes the singular value decomposition (SVD) of a real
+     !> N-by-N (upper or lower) bidiagonal matrix B:  B = U * S * VT,
+     !> using a divide and conquer method, where S is a diagonal matrix
+     !> with non-negative diagonal elements (the singular values of B), and
+     !> U and VT are orthogonal matrices of left and right singular vectors,
+     !> respectively. DBDSDC can be used to compute all singular values,
+     !> and optionally, singular vectors or singular vectors in compact form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLASD3 for details.
+     !> The code currently calls DLASDQ if singular values only are desired.
+     !> However, it can be slightly modified to compute singular values
+     !> using the divide and conquer method.
+
+     pure subroutine stdlib_dbdsdc( uplo, compq, n, d, e, u, ldu, vt, ldvt, q, iq,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iq(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: q(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+        ! changed dimension statement in comment describing e from (n) to
+        ! (n-1).  sven, 17 feb 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: difl, difr, givcol, givnum, givptr, i, ic, icompq, ierr, ii, is, iu, &
+           iuplo, ivt, j, k, kk, mlvl, nm1, nsize, perm, poles, qstart, smlsiz, smlszp, sqre, &
+                     start, wstart, z
+           real(dp) :: cs, eps, orgnrm, p, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( stdlib_lsame( compq, 'N' ) ) then
+              icompq = 0
+           else if( stdlib_lsame( compq, 'P' ) ) then
+              icompq = 1
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              icompq = 2
+           else
+              icompq = -1
+           end if
+           if( iuplo==0 ) then
+              info = -1
+           else if( icompq<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( ldu<1 ) .or. ( ( icompq==2 ) .and. ( ldu<n ) ) ) then
+              info = -7
+           else if( ( ldvt<1 ) .or. ( ( icompq==2 ) .and. ( ldvt<n ) ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DBDSDC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'DBDSDC', ' ', 0, 0, 0, 0 )
+           if( n==1 ) then
+              if( icompq==1 ) then
+                 q( 1 ) = sign( one, d( 1 ) )
+                 q( 1+smlsiz*n ) = one
+              else if( icompq==2 ) then
+                 u( 1, 1 ) = sign( one, d( 1 ) )
+                 vt( 1, 1 ) = one
+              end if
+              d( 1 ) = abs( d( 1 ) )
+              return
+           end if
+           nm1 = n - 1
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           wstart = 1
+           qstart = 3
+           if( icompq==1 ) then
+              call stdlib_dcopy( n, d, 1, q( 1 ), 1 )
+              call stdlib_dcopy( n-1, e, 1, q( n+1 ), 1 )
+           end if
+           if( iuplo==2 ) then
+              qstart = 5
+              if( icompq == 2 ) wstart = 2*n - 1
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( icompq==1 ) then
+                    q( i+2*n ) = cs
+                    q( i+3*n ) = sn
+                 else if( icompq==2 ) then
+                    work( i ) = cs
+                    work( nm1+i ) = -sn
+                 end if
+              end do
+           end if
+           ! if icompq = 0, use stdlib_dlasdq to compute the singular values.
+           if( icompq==0 ) then
+              ! ignore wstart, instead using work( 1 ), since the two vectors
+              ! for cs and -sn above are added only if icompq == 2,
+              ! and adding them exceeds documented work size of 4*n.
+              call stdlib_dlasdq( 'U', 0, n, 0, 0, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( 1 ), &
+                        info )
+              go to 40
+           end if
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              if( icompq==2 ) then
+                 call stdlib_dlaset( 'A', n, n, zero, one, u, ldu )
+                 call stdlib_dlaset( 'A', n, n, zero, one, vt, ldvt )
+                 call stdlib_dlasdq( 'U', 0, n, n, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( &
+                           wstart ), info )
+              else if( icompq==1 ) then
+                 iu = 1
+                 ivt = iu + n
+                 call stdlib_dlaset( 'A', n, n, zero, one, q( iu+( qstart-1 )*n ),n )
+                 call stdlib_dlaset( 'A', n, n, zero, one, q( ivt+( qstart-1 )*n ),n )
+                 call stdlib_dlasdq( 'U', 0, n, n, n, 0, d, e,q( ivt+( qstart-1 )*n ), n,q( iu+( &
+                           qstart-1 )*n ), n,q( iu+( qstart-1 )*n ), n, work( wstart ),info )
+              end if
+              go to 40
+           end if
+           if( icompq==2 ) then
+              call stdlib_dlaset( 'A', n, n, zero, one, u, ldu )
+              call stdlib_dlaset( 'A', n, n, zero, one, vt, ldvt )
+           end if
+           ! scale.
+           orgnrm = stdlib_dlanst( 'M', n, d, e )
+           if( orgnrm==zero )return
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, ierr )
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, ierr )
+           eps = (0.9e+0_dp)*stdlib_dlamch( 'EPSILON' )
+           mlvl = int( log( real( n,KIND=dp) / real( smlsiz+1,KIND=dp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           if( icompq==1 ) then
+              iu = 1
+              ivt = 1 + smlsiz
+              difl = ivt + smlszp
+              difr = difl + mlvl
+              z = difr + mlvl*2
+              ic = z + mlvl
+              is = ic + 1
+              poles = is + 1
+              givnum = poles + 2*mlvl
+              k = 1
+              givptr = 2
+              perm = 3
+              givcol = perm + mlvl
+           end if
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           start = 1
+           sqre = 0
+           loop_30: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - start + 1
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - start + 1
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n). solve this 1-by-1 problem
+                    ! first.
+                    nsize = i - start + 1
+                    if( icompq==2 ) then
+                       u( n, n ) = sign( one, d( n ) )
+                       vt( n, n ) = one
+                    else if( icompq==1 ) then
+                       q( n+( qstart-1 )*n ) = sign( one, d( n ) )
+                       q( n+( smlsiz+qstart-1 )*n ) = one
+                    end if
+                    d( n ) = abs( d( n ) )
+                 end if
+                 if( icompq==2 ) then
+                    call stdlib_dlasd0( nsize, sqre, d( start ), e( start ),u( start, start ), &
+                              ldu, vt( start, start ),ldvt, smlsiz, iwork, work( wstart ), info )
+                 else
+                    call stdlib_dlasda( icompq, smlsiz, nsize, sqre, d( start ),e( start ), q( &
+                    start+( iu+qstart-2 )*n ), n,q( start+( ivt+qstart-2 )*n ),iq( start+k*n ), q(&
+                     start+( difl+qstart-2 )*n ), q( start+( difr+qstart-2 )*n ),q( start+( z+&
+                     qstart-2 )*n ),q( start+( poles+qstart-2 )*n ),iq( start+givptr*n ), iq( &
+                     start+givcol*n ),n, iq( start+perm*n ),q( start+( givnum+qstart-2 )*n ),q( &
+                     start+( ic+qstart-2 )*n ),q( start+( is+qstart-2 )*n ),work( wstart ), iwork,&
+                                info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+                 start = i + 1
+              end if
+           end do loop_30
+           ! unscale
+           call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, ierr )
+           40 continue
+           ! use selection sort to minimize swaps of singular vectors
+           do ii = 2, n
+              i = ii - 1
+              kk = i
+              p = d( i )
+              do j = ii, n
+                 if( d( j )>p ) then
+                    kk = j
+                    p = d( j )
+                 end if
+              end do
+              if( kk/=i ) then
+                 d( kk ) = d( i )
+                 d( i ) = p
+                 if( icompq==1 ) then
+                    iq( i ) = kk
+                 else if( icompq==2 ) then
+                    call stdlib_dswap( n, u( 1, i ), 1, u( 1, kk ), 1 )
+                    call stdlib_dswap( n, vt( i, 1 ), ldvt, vt( kk, 1 ), ldvt )
+                 end if
+              else if( icompq==1 ) then
+                 iq( i ) = i
+              end if
+           end do
+           ! if icompq = 1, use iq(n,1) as the indicator for uplo
+           if( icompq==1 ) then
+              if( iuplo==1 ) then
+                 iq( n ) = 1
+              else
+                 iq( n ) = 0
+              end if
+           end if
+           ! if b is lower bidiagonal, update u by those givens rotations
+           ! which rotated b to be upper bidiagonal
+           if( ( iuplo==2 ) .and. ( icompq==2 ) )call stdlib_dlasr( 'L', 'V', 'B', n, n, work( 1 )&
+                     , work( n ), u, ldu )
+           return
+     end subroutine stdlib_dbdsdc
+
+     !> DBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**T
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**T*VT instead of
+     !> P**T, for given real input matrices U and VT.  When U and VT are the
+     !> orthogonal matrices that reduce a general matrix A to bidiagonal
+     !> form:  A = U*B*VT, as computed by DGEBRD, then
+     !> A = (U*Q) * S * (P**T*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**T*C
+     !> for a given real input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_dbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, work, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: hndrth = 0.01_dp
+           real(dp), parameter :: hndrd = 100.0_dp
+           real(dp), parameter :: meigth = -0.125_dp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, iterdivn, j, ll, lll, m, maxitdivn, nm1, nm12, &
+                     nm13, oldll, oldm
+           real(dp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_dlasq1( n, d, e, work, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_dlamch( 'EPSILON' )
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 work( i ) = cs
+                 work( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_dlasr( 'R', 'V', 'F', nru, n, work( 1 ), work( n ), u,ldu )
+                        
+              if( ncc>0 )call stdlib_dlasr( 'L', 'V', 'F', n, ncc, work( 1 ), work( n ), c,ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=dp) )
+              thresh = max( tol*sminoa, maxitr*(n*(n*unfl)) )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*(n*(n*unfl)) )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxitdivn = maxitr*n
+           iterdivn = 0
+           iter = -1
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>=n ) then
+              iter = iter - n
+              iterdivn = iterdivn + 1
+              if( iterdivn>=maxitdivn )go to 200
+           end if
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_dlasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_drot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt, cosr,sinr &
+                        )
+              if( nru>0 )call stdlib_drot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+              if( ncc>0 )call stdlib_drot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_dlas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_dlas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_dlartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_dlartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll+1 ) = cs
+                    work( i-ll+1+nm1 ) = sn
+                    work( i-ll+1+nm12 ) = oldcs
+                    work( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_dlasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_dlasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_dlasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_dlartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_dlartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll ) = cs
+                    work( i-ll+nm1 ) = -sn
+                    work( i-ll+nm12 ) = oldcs
+                    work( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_dlasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_dlasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_dlasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_dlartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_dlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    work( i-ll+1 ) = cosr
+                    work( i-ll+1+nm1 ) = sinr
+                    work( i-ll+1+nm12 ) = cosl
+                    work( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_dlasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_dlasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_dlasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_dlartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_dlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    work( i-ll ) = cosr
+                    work( i-ll+nm1 ) = -sinr
+                    work( i-ll+nm12 ) = cosl
+                    work( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_dlasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_dlasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_dlasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_dscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_dswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_dswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_dswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_dbdsqr
+
+     !> DGEES: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A matrix is in real Schur form if it is upper quasi-triangular with
+     !> 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in the
+     !> form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_dgees( jobvs, sort, select, n, a, lda, sdim, wr, wi,vs, ldvs, work, lwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_d) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, maxwrk, minwrk
+           real(dp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_dhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_dhseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_dlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_dgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_dgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_dlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (workspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_dhseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (workspace: none needed)
+              call stdlib_dtrsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, s, sep, &
+                        work( iwrk ), lwork-iwrk+1, idum, 1,icond )
+              if( icond>0 )info = n + icond
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (workspace: need n)
+              call stdlib_dgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_dlascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_dcopy( n, a, lda+1, wr, 1 )
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi,max( ilo-1, 1 ), &
+                              ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_dswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_dswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                             call stdlib_dswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              ! undo scaling for the imaginary part of the eigenvalues
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgees
+
+     !> DGEESX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_dp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A real matrix is in real Schur form if it is upper quasi-triangular
+     !> with 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in
+     !> the form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_dgeesx( jobvs, sort, select, sense, n, a, lda, sdim,wr, wi, vs, ldvs, &
+               rconde, rcondv, work, lwork,iwork, liwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, liwork, lwork, n
+           real(dp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_d) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantsb, wantse, wantsn, wantst, &
+                     wantsv, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, liwrk, lwrk, maxwrk, minwrk
+           real(dp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "rworkspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! iworkspace refers to integer workspace.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_dhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_dtrsen later
+             ! in the code.)
+           if( info==0 ) then
+              liwrk = 1
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_dhseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, n + ( n*n )/2 )
+                 if( wantsv .or. wantsb )liwrk = ( n*n )/4
+              end if
+              iwork( 1 ) = liwrk
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<1 .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_dlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_dgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (rworkspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_dgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_dlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (rworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (rworkspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_dhseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (rworkspace: if sense is not 'n', need n+2*sdim*(n-sdim)
+                           ! otherwise, need n )
+              ! (iworkspace: if sense is 'v' or 'b', need sdim*(n-sdim)
+                           ! otherwise, need 0 )
+              call stdlib_dtrsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,iwork, liwork, icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, n+2*sdim*( n-sdim ) )
+              if( icond==-15 ) then
+                 ! not enough real workspace
+                 info = -16
+              else if( icond==-17 ) then
+                 ! not enough integer workspace
+                 info = -18
+              else if( icond>0 ) then
+                 ! stdlib_dtrsen failed to reorder or to restore standard schur form
+                 info = icond + n
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (rworkspace: need n)
+              call stdlib_dgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_dlascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_dcopy( n, a, lda+1, wr, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_dswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_dswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                            call stdlib_dswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           if( wantsv .or. wantsb ) then
+              iwork( 1 ) = max( 1, sdim*( n-sdim ) )
+           else
+              iwork( 1 ) = 1
+           end if
+           return
+     end subroutine stdlib_dgeesx
+
+     !> DGEEV: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_dgeev( jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr,ldvr, work, lwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: vl(ldvl,*), vr(ldvr,*), wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, itau, iwrk, k, lwork_trevc, maxwrk, &
+                     minwrk, nout
+           real(dp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -9
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_dhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_dhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_dtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else if( wantvr ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_dhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_dtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else
+                    minwrk = 3*n
+                    call stdlib_dhseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_dlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_dgebal( 'B', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = ibal + n
+           iwrk = itau + n
+           call stdlib_dgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_dlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_dlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_dlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( 'E', 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_dhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 4*n, prefer n + n + 2*n*nb)
+              call stdlib_dtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (workspace: need n)
+              call stdlib_dgebak( 'B', 'L', n, ilo, ihi, work( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_dnrm2( n, vl( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_dlapy2( stdlib_dnrm2( n, vl( 1, i ), 1 ),stdlib_dnrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_dscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_idamax( n, work( iwrk ), 1 )
+                    call stdlib_dlartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_drot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (workspace: need n)
+              call stdlib_dgebak( 'B', 'R', n, ilo, ihi, work( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_dnrm2( n, vr( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_dlapy2( stdlib_dnrm2( n, vr( 1, i ), 1 ),stdlib_dnrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_dscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_idamax( n, work( iwrk ), 1 )
+                    call stdlib_dlartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_drot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info>0 ) then
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgeev
+
+     !> DGEEVX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_dp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_dgeevx( balanc, jobvl, jobvr, sense, n, a, lda, wr, wi,vl, ldvl, vr, ldvr, &
+               ilo, ihi, scale, abnrm,rconde, rcondv, work, lwork, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(dp), intent(out) :: abnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: rconde(*), rcondv(*), scale(*), vl(ldvl,*), vr(ldvr,*), wi(*),&
+                      work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(dp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ).or. &
+                     stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) )then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_dhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_dtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_dhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                 else if( wantvr ) then
+                    call stdlib_dtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_dhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                 else
+                    if( wntsnn ) then
+                       call stdlib_dhseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    else
+                       call stdlib_dhseqr( 'S', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.wntsnn )minwrk = max( minwrk, n*n+6*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.wntsnn )maxwrk = max( maxwrk, n*n + 6*n )
+                 else
+                    minwrk = 3*n
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )minwrk = max( minwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'DORGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )maxwrk = max( maxwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, 3*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_dlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_dlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_dgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_dlange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (workspace: need 2*n, prefer n+n*nb)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_dgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_dlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_dlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_dlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_dorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_dhseqr( job, 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_dhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 3*n, prefer n + 2*n*nb)
+              call stdlib_dtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (workspace: need n*n+6*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_dtrsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, iwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_dgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_dnrm2( n, vl( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_dlapy2( stdlib_dnrm2( n, vl( 1, i ), 1 ),stdlib_dnrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_dscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_idamax( n, work, 1 )
+                    call stdlib_dlartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_drot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_dgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_dnrm2( n, vr( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_dlapy2( stdlib_dnrm2( n, vr( 1, i ), 1 ),stdlib_dnrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_dscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_dscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_idamax( n, work, 1 )
+                    call stdlib_dlartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_drot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_dlascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgeevx
+
+     !> DGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, maxmn, &
+                     maxwrk, minmn, minwrk, mm, mnthr, nlvl, nwork, smlsiz, wlalsd
+           real(dp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: real,int,log,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           mnthr = stdlib_ilaenv( 6, 'DGELSD', ' ', m, n, nrhs, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           smlsiz = stdlib_ilaenv( 9, 'DGELSD', ' ', 0, 0, 0, 0 )
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           minwrk = 1
+           liwork = 1
+           minmn = max( 1, minmn )
+           nlvl = max( int( log( real( minmn,KIND=dp) / real( smlsiz+1,KIND=dp) ) /log( two ),&
+                     KIND=ilp) + 1, 0 )
+           if( info==0 ) then
+              maxwrk = 0
+              liwork = 3*minmn*nlvl + 11*minmn
+              mm = m
+              if( m>=n .and. m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns.
+                 mm = n
+                 maxwrk = max( maxwrk, n+n*stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n,-1, -1 ) )
+                           
+                 maxwrk = max( maxwrk, n+nrhs*stdlib_ilaenv( 1, 'DORMQR', 'LT', m, nrhs, n, -1 ) )
+                           
+              end if
+              if( m>=n ) then
+                 ! path 1 - overdetermined or exactly determined.
+                 maxwrk = max( maxwrk, 3*n+( mm+n )*stdlib_ilaenv( 1, 'DGEBRD', ' ', mm, n, -1, -&
+                           1 ) )
+                 maxwrk = max( maxwrk, 3*n+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', mm, nrhs, n, -&
+                           1 ) )
+                 maxwrk = max( maxwrk, 3*n+( n-1 )*stdlib_ilaenv( 1, 'DORMBR', 'PLN', n, nrhs, n, &
+                           -1 ) )
+                 wlalsd = 9*n+2*n*smlsiz+8*n*nlvl+n*nrhs+(smlsiz+1)**2
+                 maxwrk = max( maxwrk, 3*n+wlalsd )
+                 minwrk = max( 3*n+mm, 3*n+nrhs, 3*n+wlalsd )
+              end if
+              if( n>m ) then
+                 wlalsd = 9*m+2*m*smlsiz+8*m*nlvl+m*nrhs+(smlsiz+1)**2
+                 if( n>=mnthr ) then
+                    ! path 2a - underdetermined, with many more columns
+                    ! than rows.
+                    maxwrk = m + m*stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+                    maxwrk = max( maxwrk, m*m+4*m+2*m*stdlib_ilaenv( 1, 'DGEBRD', ' ', m, m, -1, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, m*m+4*m+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', m, nrhs,&
+                               m, -1 ) )
+                    maxwrk = max( maxwrk, m*m+4*m+( m-1 )*stdlib_ilaenv( 1, 'DORMBR', 'PLN', m, &
+                              nrhs, m, -1 ) )
+                    if( nrhs>1 ) then
+                       maxwrk = max( maxwrk, m*m+m+m*nrhs )
+                    else
+                       maxwrk = max( maxwrk, m*m+2*m )
+                    end if
+                    maxwrk = max( maxwrk, m+nrhs*stdlib_ilaenv( 1, 'DORMLQ', 'LT', n, nrhs, m, -1 &
+                              ) )
+                    maxwrk = max( maxwrk, m*m+4*m+wlalsd )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                    maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                 else
+                    ! path 2 - remaining underdetermined cases.
+                    maxwrk = 3*m + ( n+m )*stdlib_ilaenv( 1, 'DGEBRD', ' ', m, n,-1, -1 )
+                    maxwrk = max( maxwrk, 3*m+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', m, nrhs, n, &
+                              -1 ) )
+                    maxwrk = max( maxwrk, 3*m+m*stdlib_ilaenv( 1, 'DORMBR', 'PLN', n, nrhs, m, -1 &
+                              ) )
+                    maxwrk = max( maxwrk, 3*m+wlalsd )
+                 end if
+                 minwrk = max( 3*m+nrhs, 3*m+m, 3*m+wlalsd )
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSD', -info )
+              return
+           else if( lquery ) then
+              go to 10
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_dlamch( 'P' )
+           sfmin = stdlib_dlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_dlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_dlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_dlaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_dlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_dlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_dlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure certain entries of b are zero.
+           if( m<n )call stdlib_dlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns.
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_dormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_dlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                 end if
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              nwork = itaup + n
+              ! bidiagonalize r in a.
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_dgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_dlalsd( 'U', smlsiz, n, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_dormbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m, wlalsd ) ) &
+                     then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs, 4*m+m*lda+&
+                        wlalsd ) )ldwork = lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_dlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_dlaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize l in work(il).
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_dgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_dlalsd( 'U', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_dormbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_dlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_dormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize a.
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_dlalsd( 'L', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_dormbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           return
+     end subroutine stdlib_dgelsd
+
+     !> DGELSS: computes the minimum norm solution to a real linear least
+     !> squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_dgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bdspac, bl, chunk, i, iascl, ibscl, ie, il, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_dgeqrf, lwork_dormqr, lwork_dgebrd, lwork_dormbr, lwork_dorgbr, &
+                     lwork_dormlq, lwork_dgelqf
+           real(dp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'DGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_dgeqrf
+                    call stdlib_dgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_dgeqrf=dum(1)
+                    ! compute space needed for stdlib_dormqr
+                    call stdlib_dormqr( 'L', 'T', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_dormqr=dum(1)
+                    mm = n
+                    maxwrk = max( maxwrk, n + lwork_dgeqrf )
+                    maxwrk = max( maxwrk, n + lwork_dormqr )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute workspace needed for stdlib_dbdsqr
+                    bdspac = max( 1, 5*n )
+                    ! compute space needed for stdlib_dgebrd
+                    call stdlib_dgebrd( mm, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, info &
+                              )
+                    lwork_dgebrd=dum(1)
+                    ! compute space needed for stdlib_dormbr
+                    call stdlib_dormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_dormbr=dum(1)
+                    ! compute space needed for stdlib_dorgbr
+                    call stdlib_dorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_dorgbr=dum(1)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 3*n + lwork_dgebrd )
+                    maxwrk = max( maxwrk, 3*n + lwork_dormbr )
+                    maxwrk = max( maxwrk, 3*n + lwork_dorgbr )
+                    maxwrk = max( maxwrk, bdspac )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = max( 3*n + mm, 3*n + nrhs, bdspac )
+                    maxwrk = max( minwrk, maxwrk )
+                 end if
+                 if( n>m ) then
+                    ! compute workspace needed for stdlib_dbdsqr
+                    bdspac = max( 1, 5*m )
+                    minwrk = max( 3*m+nrhs, 3*m+n, bdspac )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_dgelqf
+                       call stdlib_dgelqf( m, n, a, lda, dum(1), dum(1),-1, info )
+                       lwork_dgelqf=dum(1)
+                       ! compute space needed for stdlib_dgebrd
+                       call stdlib_dgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_dgebrd=dum(1)
+                       ! compute space needed for stdlib_dormbr
+                       call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_dormbr=dum(1)
+                       ! compute space needed for stdlib_dorgbr
+                       call stdlib_dorgbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_dorgbr=dum(1)
+                       ! compute space needed for stdlib_dormlq
+                       call stdlib_dormlq( 'L', 'T', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_dormlq=dum(1)
+                       ! compute total workspace needed
+                       maxwrk = m + lwork_dgelqf
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_dgebrd )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_dormbr )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_dorgbr )
+                       maxwrk = max( maxwrk, m*m + m + bdspac )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_dormlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_dgebrd
+                       call stdlib_dgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_dgebrd=dum(1)
+                       ! compute space needed for stdlib_dormbr
+                       call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_dormbr=dum(1)
+                       ! compute space needed for stdlib_dorgbr
+                       call stdlib_dorgbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_dorgbr=dum(1)
+                       maxwrk = 3*m + lwork_dgebrd
+                       maxwrk = max( maxwrk, 3*m + lwork_dormbr )
+                       maxwrk = max( maxwrk, 3*m + lwork_dorgbr )
+                       maxwrk = max( maxwrk, bdspac )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_dlamch( 'P' )
+           sfmin = stdlib_dlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_dlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_dlaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_dlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_dormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_dlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_dgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+              call stdlib_dorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (workspace: need bdspac)
+              call stdlib_dbdsqr( 'U', n, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_drscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_dlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_dlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_dgemm( 'T', 'N', n, bl, n, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_dlacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_dgemv( 'T', n, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_dcopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_dgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_dlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_dlaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_dgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (workspace: need m*m+5*m-1, prefer m*m+4*m+(m-1)*nb)
+              call stdlib_dorgbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of l in work(il) and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need m*m+m+bdspac)
+              call stdlib_dbdsqr( 'U', m, m, 0, nrhs, s, work( ie ), work( il ),ldwork, a, lda, b,&
+                         ldb, work( iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_drscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_dlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = ie
+              ! multiply b by right singular vectors of l in work(il)
+              ! (workspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_dgemm( 'T', 'N', m, nrhs, m, one, work( il ), ldwork,b, ldb, zero, &
+                           work( iwork ), ldb )
+                 call stdlib_dlacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_dgemm( 'T', 'N', m, bl, m, one, work( il ), ldwork,b( 1, i ), ldb,&
+                               zero, work( iwork ), m )
+                    call stdlib_dlacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_dgemv( 'T', m, m, one, work( il ), ldwork, b( 1, 1 ),1, zero, work( &
+                           iwork ), 1 )
+                 call stdlib_dcopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_dlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_dormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_dormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (workspace: need 4*m, prefer 3*m+m*nb)
+              call stdlib_dorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need bdspac)
+              call stdlib_dbdsqr( 'L', m, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_drscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_dlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_dgemm( 'T', 'N', n, nrhs, m, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_dlacpy( 'F', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_dgemm( 'T', 'N', n, bl, m, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_dlacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_dgemv( 'T', m, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_dcopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgelss
+
+     !> DGESDD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and right singular
+     !> vectors.  If singular vectors are desired, it uses a
+     !> divide-and-conquer algorithm.
+     !> The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**T, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_dgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
+                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
+           integer(ilp) :: lwork_dgebrd_mn, lwork_dgebrd_mm, lwork_dgebrd_nn, lwork_dgelqf_mn, &
+           lwork_dgeqrf_mn, lwork_dorgbr_p_mm, lwork_dorgbr_q_nn, lwork_dorglq_mn, &
+           lwork_dorglq_nn, lwork_dorgqr_mm, lwork_dorgqr_mn, lwork_dormbr_prt_mm, &
+           lwork_dormbr_qln_mm, lwork_dormbr_prt_mn, lwork_dormbr_qln_mn, lwork_dormbr_prt_nn, &
+                     lwork_dormbr_qln_nn
+           real(dp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              bdspac = 0
+              mnthr  = int( minmn*11.0_dp / 6.0_dp,KIND=ilp)
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_dbdsdc
+                 if( wntqn ) then
+                    ! stdlib_dbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_dp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*n
+                 else
+                    bdspac = 3*n*n + 4*n
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_dgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_dgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dgebrd( n, n, dum(1), n, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_dgebrd_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dgeqrf( m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_dgeqrf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dorgbr( 'Q', n, n, n, dum(1), n, dum(1), dum(1), -1,ierr )
+                 lwork_dorgbr_q_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dorgqr( m, m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_dorgqr_mm = int( dum(1),KIND=ilp)
+                 call stdlib_dorgqr( m, n, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_dorgqr_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'P', 'R', 'T', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'Q', 'L', 'N', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_qln_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'Q', 'L', 'N', m, n, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_qln_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'Q', 'L', 'N', m, m, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       wrkbl = n + lwork_dgeqrf_mn
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd_nn )
+                       maxwrk = max( wrkbl, bdspac + n )
+                       minwrk = bdspac + n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_dgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + 2*n*n
+                       minwrk = bdspac + 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_dgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = bdspac + n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_dgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = n*n + max( 3*n + bdspac, n + m )
+                    end if
+                 else
+                    ! path 5 (m >= n, but not much larger)
+                    wrkbl = 3*n + lwork_dgebrd_mn
+                    if( wntqn ) then
+                       ! path 5n (m >= n, jobz='n')
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqo ) then
+                       ! path 5o (m >= n, jobz='o')
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*n + max( m, n*n + bdspac )
+                    else if( wntqs ) then
+                       ! path 5s (m >= n, jobz='s')
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqa ) then
+                       ! path 5a (m >= n, jobz='a')
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_dormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_dbdsdc
+                 if( wntqn ) then
+                    ! stdlib_dbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_dp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*m
+                 else
+                    bdspac = 3*m*m + 4*m
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_dgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_dgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dgebrd( m, m, a, m, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_dgebrd_mm = int( dum(1),KIND=ilp)
+                 call stdlib_dgelqf( m, n, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_dgelqf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_dorglq_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dorglq( m, n, m, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_dorglq_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dorgbr( 'P', m, m, m, a, n, dum(1), dum(1), -1, ierr )
+                 lwork_dorgbr_p_mm = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'P', 'R', 'T', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_prt_mm = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'P', 'R', 'T', m, n, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_prt_mn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'P', 'R', 'T', n, n, m, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_dormbr( 'Q', 'L', 'N', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_dormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       wrkbl = m + lwork_dgelqf_mn
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd_mm )
+                       maxwrk = max( wrkbl, bdspac + m )
+                       minwrk = bdspac + m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_dgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_dorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + 2*m*m
+                       minwrk = bdspac + 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_dgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_dorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = bdspac + m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_dgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_dorglq_nn )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = m*m + max( 3*m + bdspac, m + n )
+                    end if
+                 else
+                    ! path 5t (n > m, but not much larger)
+                    wrkbl = 3*m + lwork_dgebrd_mn
+                    if( wntqn ) then
+                       ! path 5tn (n > m, jobz='n')
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqo ) then
+                       ! path 5to (n > m, jobz='o')
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_mn )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*m + max( n, m*m + bdspac )
+                    else if( wntqs ) then
+                       ! path 5ts (n > m, jobz='s')
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_mn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqa ) then
+                       ! path 5ta (n > m, jobz='a')
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_dormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = stdlib_droundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = sqrt( stdlib_dlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, dum )
+           if( stdlib_disnan( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n [tau] + n    [work]
+                    ! workspace: prefer n [tau] + n*nb [work]
+                    call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out below r
+                    call stdlib_dlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_dgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + n
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   n [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz = 'o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is ldwrkr by n
+                    if( lwork >= lda*n + n*n + 3*n + bdspac ) then
+                       ldwrkr = lda
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n - bdspac ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_dlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_dlaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_dgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! work(iu) is n by n
+                    iu = nwork
+                    nwork = iu + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! and vt by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), n, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    ! workspace: prefer m*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    do i = 1, m, ldwrkr
+                       chunk = min( m - i + 1, ldwrkr )
+                       call stdlib_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu ), &
+                                 n, zero, work( ir ),ldwrkr )
+                       call stdlib_dlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_dlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_dlaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_dgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagoal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! workspace: need   n*n [r]
+                    call stdlib_dlacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_dgemm( 'N', 'N', m, n, n, one, a, lda, work( ir ),ldwrkr, zero, u,&
+                               ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! workspace: need   n*n [u] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! workspace: need   n*n [u] + n [tau] + m    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce r in a, zeroing out other entries
+                    call stdlib_dlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_dgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! workspace: need   n*n [u]
+                    call stdlib_dgemm( 'N', 'N', m, n, n, one, u, ldu, work( iu ),ldwrku, zero, a,&
+                               lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_dlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 5 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! workspace: need   3*n [e, tauq, taup] + m        [work]
+                 ! workspace: prefer 3*n [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >= n, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5o (m >= n, jobz='o')
+                    iu = nwork
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                       nwork = iu + ldwrku*n
+                       call stdlib_dlaset( 'F', m, n, zero, zero, work( iu ),ldwrku )
+                       ! ir is unused; silence compile warnings
+                       ir = -1
+                    else
+                       ! work( iu ) is n by n
+                       ldwrku = n
+                       nwork = iu + ldwrku*n
+                       ! work(ir) is ldwrkr by n
+                       ir = nwork
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ),ldwrku, vt, ldvt, &
+                              dum, idum, work( nwork ),iwork, info )
+                    ! overwrite vt by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! path 5o-fast
+                       ! overwrite work(iu) by left singular vectors of a
+                       ! workspace: need   3*n [e, tauq, taup] + m*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + m*n [u] + n*nb [work]
+                       call stdlib_dormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy left singular vectors of a from work(iu) to a
+                       call stdlib_dlacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 5o-slow
+                       ! generate q in a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                       call stdlib_dorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by left singular vectors of
+                       ! bidiagonal matrix in work(iu), storing result in
+                       ! work(ir) and copying to a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + nb*n [r]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + m*n  [r]
+                       do i = 1, m, ldwrkr
+                          chunk = min( m - i + 1, ldwrkr )
+                          call stdlib_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu )&
+                                    , ldwrku, zero,work( ir ), ldwrkr )
+                          call stdlib_dlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_dlaset( 'F', m, n, zero, zero, u, ldu )
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_dlaset( 'F', m, m, zero, zero, u, ldu )
+                    call stdlib_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of u to identity matrix
+                    if( m>n ) then
+                       call stdlib_dlaset( 'F', m - n, m - n, zero, one, u(n+1,n+1),ldu )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m [tau] + m [work]
+                    ! workspace: prefer m [tau] + m*nb [work]
+                    call stdlib_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out above l
+                    call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_dgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + m
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   m [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ! work(il)  is m by m; it is later resized to m by chunk for gemm
+                    il = ivt + m*m
+                    if( lwork >= m*n + m*m + 3*m + bdspac ) then
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ldwrkl = m
+                       chunk = ( lwork - m*m ) / m
+                    end if
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_dlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_dlaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_dgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u, and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), m, dum, &
+                              idum, work( nwork ),iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), m,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! workspace: need   m*m [vt] + m*m [l]
+                    ! workspace: prefer m*m [vt] + m*n [l]
+                    ! at this point, l is resized as m by chunk.
+                    do i = 1, n, chunk
+                       blk = min( n - i + 1, chunk )
+                       call stdlib_dgemm( 'N', 'N', m, blk, m, one, work( ivt ), m,a( 1, i ), lda,&
+                                  zero, work( il ), ldwrkl )
+                       call stdlib_dlacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_dlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_dlaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(iu).
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_dgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of l and vt
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(il) by
+                    ! q in a, storing result in vt
+                    ! workspace: need   m*m [l]
+                    call stdlib_dlacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_dgemm( 'N', 'N', m, n, m, one, work( il ), ldwrkl,a, lda, zero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! workspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    call stdlib_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! workspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce l in a, zeroing out other entries
+                    call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_dgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup]+ m    [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup]+ m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! workspace: need   m*m [vt]
+                    call stdlib_dgemm( 'N', 'N', m, n, m, one, work( ivt ), ldwkvt,vt, ldvt, zero,&
+                               a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_dlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 5t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! workspace: need   3*m [e, tauq, taup] + n        [work]
+                 ! workspace: prefer 3*m [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n > m, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_dbdsdc( 'L', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_dlaset( 'F', m, n, zero, zero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                       ! il is unused; silence compile warnings
+                       il = -1
+                    else
+                       ! work( ivt ) is m by m
+                       nwork = ivt + ldwkvt*m
+                       il = nwork
+                       ! work(il) is m by chunk
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + bdspac
+                    call stdlib_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! path 5to-fast
+                       ! overwrite work(ivt) by left singular vectors of a
+                       ! workspace: need   3*m [e, tauq, taup] + m*n [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*n [vt] + m*nb [work]
+                       call stdlib_dormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy right singular vectors of a from work(ivt) to a
+                       call stdlib_dlacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 5to-slow
+                       ! generate p**t in a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                       call stdlib_dorgbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by right singular vectors of
+                       ! bidiagonal matrix in work(ivt), storing result in
+                       ! work(il) and copying to a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m*nb [l]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*n  [l]
+                       do i = 1, n, chunk
+                          blk = min( n - i + 1, chunk )
+                          call stdlib_dgemm( 'N', 'N', m, blk, m, one, work( ivt ),ldwkvt, a( 1, &
+                                    i ), lda, zero,work( il ), m )
+                          call stdlib_dlacpy( 'F', m, blk, work( il ), m, a( 1, i ),lda )
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_dlaset( 'F', m, n, zero, zero, vt, ldvt )
+                    call stdlib_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_dlaset( 'F', n, n, zero, zero, vt, ldvt )
+                    call stdlib_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of vt to identity matrix
+                    if( n>m ) then
+                       call stdlib_dlaset( 'F', n-m, n-m, zero, one, vt(m+1,m+1),ldvt )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + n*nb [work]
+                    call stdlib_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_dormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_droundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_dgesdd
+
+     !> DGESVD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**T, not V.
+
+     subroutine stdlib_dgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, ir, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_dgeqrf, lwork_dorgqr_n, lwork_dorgqr_m, lwork_dgebrd, &
+                     lwork_dorgbr_p, lwork_dorgbr_q, lwork_dgelqf, lwork_dorglq_n, lwork_dorglq_m
+           real(dp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_dbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'DGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*n
+                 ! compute space needed for stdlib_dgeqrf
+                 call stdlib_dgeqrf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_dgeqrf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dorgqr
+                 call stdlib_dorgqr( m, n, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_dorgqr_n = int( dum(1),KIND=ilp)
+                 call stdlib_dorgqr( m, m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_dorgqr_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dgebrd
+                 call stdlib_dgebrd( n, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_dgebrd = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dorgbr p
+                 call stdlib_dorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_dorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dorgbr q
+                 call stdlib_dorgbr( 'Q', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_dorgbr_q = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_dgeqrf
+                       maxwrk = max( maxwrk, 3*n + lwork_dgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 3*n + lwork_dorgbr_p )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*n, bdspac )
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n + wrkbl, n*n + m*n + n )
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n + wrkbl, n*n + m*n + n )
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_dgeqrf
+                       wrkbl = max( wrkbl, n + lwork_dorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_dgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_dgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*n + lwork_dgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_dorgbr( 'Q', m, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_dorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n + lwork_dorgbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_dorgbr( 'Q', m, m, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_dorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n + lwork_dorgbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                      maxwrk = max( maxwrk, 3*n + lwork_dorgbr_p )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*n + m, bdspac )
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_dbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'DGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*m
+                 ! compute space needed for stdlib_dgelqf
+                 call stdlib_dgelqf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_dgelqf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dorglq
+                 call stdlib_dorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_dorglq_n = int( dum(1),KIND=ilp)
+                 call stdlib_dorglq( m, n, m, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_dorglq_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dgebrd
+                 call stdlib_dgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_dgebrd = int( dum(1),KIND=ilp)
+                  ! compute space needed for stdlib_dorgbr p
+                 call stdlib_dorgbr( 'P', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_dorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_dorgbr q
+                 call stdlib_dorgbr( 'Q', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_dorgbr_q = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_dgelqf
+                       maxwrk = max( maxwrk, 3*m + lwork_dgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 3*m + lwork_dorgbr_q )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*m, bdspac )
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m + wrkbl, m*m + m*n + m )
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m + wrkbl, m*m + m*n + m )
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_dgelqf
+                       wrkbl = max( wrkbl, m + lwork_dorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_dgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_dorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_dgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_dgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*m + lwork_dgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_dorgbr p
+                      call stdlib_dorgbr( 'P', m, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_dorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m + lwork_dorgbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_dorgbr( 'P', n, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_dorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m + lwork_dorgbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 3*m + lwork_dorgbr_q )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*m + n, bdspac )
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = sqrt( stdlib_dlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_dlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_dlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (workspace: need 2*n, prefer n + n*nb)
+                    call stdlib_dgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_dlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                    call stdlib_dgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                       call stdlib_dorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    iwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'U', n, ncvt, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                              work( iwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_dlacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + n ) + lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + n ) + n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_dlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_dlaset( 'L', n-1, n-1, zero, zero, work( ir+1 ),ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                       call stdlib_dgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                       call stdlib_dorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (workspace: need n*n + bdspac)
+                       call stdlib_dbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum, 1,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n + 2*n, prefer n*n + m*n + n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_dlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (workspace: need 3*n + m, prefer 3*n + (m + n)*nb)
+                       call stdlib_dgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (workspace: need 4*n, prefer 3*n + n*nb)
+                       call stdlib_dorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_dbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum, 1,a, lda, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + n ) + lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + n ) + n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_dlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_dlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                       call stdlib_dgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_dlacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                       call stdlib_dorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need n*n + 4*n-1, prefer n*n + 3*n + (n-1)*nb)
+                       call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (workspace: need n*n + bdspac)
+                       call stdlib_dbdsqr( 'U', n, n, n, 0, s, work( ie ), vt, ldvt,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n + 2*n, prefer n*n + m*n + n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_dlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need 2*n, prefer n + n*nb)
+                       call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_dlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_dlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need 2*n, prefer n + n*nb)
+                       call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                       call stdlib_dgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                       call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                       call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (workspace: need bdspac)
+                       call stdlib_dbdsqr( 'U', n, n, m, 0, s, work( ie ), vt, ldvt,a, lda, dum, &
+                                 1, work( iwork ), info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, a, lda,work( ir ), ldwrkr, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_dlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n + 4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_dgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n + 4*n, prefer 2*n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n + 4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (workspace: need n*n)
+                          call stdlib_dlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_dlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n + 4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_dlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (workspace: need n*n + n + m, prefer n*n + n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, u, ldu,work( ir ), ldwrkr, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_dlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_dlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n*n + n + m, prefer 2*n*n + n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n + 4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_dgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n + 4*n, prefer 2*n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n + 4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_dlacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_dlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_dlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n*n + n + m, prefer n*n + n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_dorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n + 4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_dgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_dlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_dgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_dorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_dlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_dlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_dgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_dormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (workspace: need 3*n + m, prefer 3*n + (m + n)*nb)
+                 call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 3*n + ncu, prefer 3*n + ncu*nb)
+                    call stdlib_dlacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_dorgbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                    call stdlib_dlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_dorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n, prefer 3*n + n*nb)
+                    call stdlib_dorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                    call stdlib_dorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (workspace: need 2*m, prefer m + m*nb)
+                    call stdlib_dgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                    call stdlib_dgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_dorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    iwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'U', m, 0, nru, 0, s, work( ie ), dum, 1, a,lda, dum, 1, &
+                              work( iwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_dlacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + m ) + lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + m ) + m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_dlacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                       call stdlib_dgebrd( m, m, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (workspace: need m*m + 4*m-1, prefer m*m + 3*m + (m-1)*nb)
+                       call stdlib_dorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m + bdspac)
+                       call stdlib_dbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, dum,&
+                                  1, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m*n + m)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_dgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_dlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (workspace: need 3*m + n, prefer 3*m + (m + n)*nb)
+                       call stdlib_dgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_dorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_dbdsqr( 'L', m, n, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + m ) + lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + m ) + m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_dlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_dlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                       call stdlib_dgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_dlacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (workspace: need m*m + 4*m-1, prefer m*m + 3*m + (m-1)*nb)
+                       call stdlib_dorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                       call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m + bdspac)
+                       call stdlib_dbdsqr( 'U', m, m, m, 0, s, work( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m*n + m))
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_dgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_dlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need 2*m, prefer m + m*nb)
+                       call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_dlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_dlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need 2*m, prefer m + m*nb)
+                       call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                       call stdlib_dgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                       call stdlib_dormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_dbdsqr( 'U', m, n, m, 0, s, work( ie ), a, lda,u, ldu, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + (m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m + 4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_dgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m + 4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m + 4*m, prefer 2*m*m + 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (workspace: need m*m)
+                          call stdlib_dlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, compute left
+                          ! singular vectors of a in a and compute right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m + 4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in vt
+                          ! (workspace: need m*m + m + n, prefer m*m + m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need m*m + 4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_dlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m*m + m + n, prefer 2*m*m + m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m + 4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_dgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m + 4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m + 4*m, prefer 2*m*m + 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_dlacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_dlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m*m + m + n, prefer m*m + m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_dlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + (m-1)*nb)
+                          call stdlib_dorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_dbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_dgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_dlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_dgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_dorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_dlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_dlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_dgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_dormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_dorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_dbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (workspace: need 3*m + n, prefer 3*m + (m + n)*nb)
+                 call stdlib_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 4*m-1, prefer 3*m + (m-1)*nb)
+                    call stdlib_dlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_dorgbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 3*m + nrvt, prefer 3*m + nrvt*nb)
+                    call stdlib_dlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_dorgbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m-1, prefer 3*m + (m-1)*nb)
+                    call stdlib_dorgbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m, prefer 3*m + m*nb)
+                    call stdlib_dorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_dbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           end if
+           ! if stdlib_dbdsqr failed to converge, copy unconverged superdiagonals
+           ! to work( 2:minmn )
+           if( info/=0 ) then
+              if( ie>2 ) then
+                 do i = 1, minmn - 1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+              if( ie<2 ) then
+                 do i = minmn - 1, 1, -1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+              if( anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_dgesvd
+
+     !> DGESVDQ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_dgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,work, lwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lwork
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: u(ldu,*), v(ldv,*), work(*)
+           real(dp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, iwoff, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_dgelqf, lwrk_dgesvd, lwrk_dgesvd2, lwrk_dgeqp3, &
+           lwrk_dgeqrf, lwrk_dormlq, lwrk_dormqr, lwrk_dormqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lworq, lworq2, lworlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(dp) :: big, epsln, rtmp, sconda, sfmin
+           ! Local Arrays
+           real(dp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              if ( conda ) then
+                 iminwrk = max( 1, n + m - 1 + n )
+              else
+                 iminwrk = max( 1, n + m - 1 )
+              end if
+              rminwrk = max( 2, m )
+           else
+              if ( conda ) then
+                 iminwrk = max( 1, n + n )
+              else
+                 iminwrk = max( 1, n )
+              end if
+              rminwrk = 2
+           end if
+           lquery = (liwork == -1 .or. lwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Dgeqp3 Of An M X N Matrix
+              lwqp3 = 3 * n + 1
+              ! Minimal Workspace Length For Stdlib_Dormqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lworq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lworq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Dpocon Of An N X N Matrix
+              lwcon = 3 * n
+              ! Stdlib_Dgesvd Of An N X N Matrix
+              lwsvd = max( 5 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_dgeqp3( m, n, a, lda, iwork, rdummy, rdummy, -1,ierr )
+                  lwrk_dgeqp3 = int( rdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_dormqr( 'L', 'N', m, n, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_dormqr = int( rdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_dormqr( 'L', 'N', m, m, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_dormqr = int( rdummy(1),KIND=ilp)
+                  else
+                      lwrk_dormqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec )) then
+                  ! Minimal And Optimal Sizes Of The Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_dgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                ierr )
+                      lwrk_dgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_dgeqp3, n+lwcon, lwrk_dgesvd )
+                      else
+                         optwrk = max( n+lwrk_dgeqp3, lwrk_dgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lworq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lworq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_dgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     else
+                        call stdlib_dgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     end if
+                     lwrk_dgesvd = int( rdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_dgeqp3, lwcon, lwrk_dgesvd,lwrk_dormqr )
+                     else
+                         optwrk = n + max( lwrk_dgeqp3, lwrk_dgesvd,lwrk_dormqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_dgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      else
+                          call stdlib_dgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      end if
+                      lwrk_dgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_dgeqp3, lwcon, lwrk_dgesvd )
+                      else
+                          optwrk = n + max( lwrk_dgeqp3, lwrk_dgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_dgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_dgesvd
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lworq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lworq2, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_dgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lworlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lworlq, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_dgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_dgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_dgeqp3,lwrk_dgesvd,lwrk_dormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_dgeqrf(n,n/2,u,ldu,rdummy,rdummy,-1,ierr)
+                            lwrk_dgeqrf = int( rdummy(1),KIND=ilp)
+                            call stdlib_dgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, rdummy,&
+                                       -1, ierr )
+                            lwrk_dgesvd2 = int( rdummy(1),KIND=ilp)
+                            call stdlib_dormqr( 'R', 'C', n, n, n/2, u, ldu, rdummy,v, ldv, &
+                                      rdummy, -1, ierr )
+                            lwrk_dormqr2 = int( rdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_dgeqp3, n/2+lwrk_dgeqrf,n/2+lwrk_dgesvd2, n/2+&
+                                      lwrk_dormqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_dgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_dgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_dgeqp3,lwrk_dgesvd,lwrk_dormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_dgelqf(n/2,n,u,ldu,rdummy,rdummy,-1,ierr)
+                           lwrk_dgelqf = int( rdummy(1),KIND=ilp)
+                           call stdlib_dgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, rdummy,&
+                                      -1, ierr )
+                           lwrk_dgesvd2 = int( rdummy(1),KIND=ilp)
+                           call stdlib_dormlq( 'R', 'N', n, n, n/2, u, ldu, rdummy,v, ldv, rdummy,&
+                                     -1,ierr )
+                           lwrk_dormlq = int( rdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_dgeqp3, n/2+lwrk_dgelqf,n/2+lwrk_dgesvd2, n/2+&
+                                     lwrk_dormlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               work(1) = optwrk
+               work(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_dlamch('O')
+           ascaled = .false.
+           iwoff = 1
+           if ( rowprm ) then
+                 iwoff = m
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_icamax(n,a(p,1),lda)) )
+                     ! [[stdlib_dlange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_dlange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = -8
+                         call stdlib_xerbla( 'DGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_idamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_dlaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_dlaset('G', m, n, zero, one, u, ldu)
+                    if ( wntua ) call stdlib_dlaset('G', m, m, zero, one, u, ldu)
+                    if ( wntva ) call stdlib_dlaset('G', n, n, zero, one, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_dlaset( 'G', n, 1, zero, zero, work, n )
+                        call stdlib_dlaset( 'G', m, n, zero,  one, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=dp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_dlascl('G',0,0,sqrt(real(m,KIND=dp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_dlaswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_dlange( 'M', m, n, a, lda, rdummy )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = -8
+                    call stdlib_xerbla( 'DGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=dp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_dlascl('G',0,0, sqrt(real(m,KIND=dp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_dgeqp3( m, n, a, lda, iwork, work, work(n+1), lwork-n,ierr )
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_dlamch('E')
+           sfmin = stdlib_dlamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=dp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_dlamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_dlacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_dnrm2( p, v(1,p), 1 )
+                       call stdlib_dscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_dpocon( 'U', nr, v, ldv, one, rtmp,work, iwork(n+iwoff), ierr &
+                                  )
+                    else
+                        call stdlib_dpocon( 'U', nr, v, ldv, one, rtmp,work(n+1), iwork(n+iwoff), &
+                                  ierr )
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**t = [a](1:nr,1:n)**t
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**t and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    do q = p + 1, n
+                       a(q,p) = a(p,q)
+                       if ( q <= nr ) a(p,q) = zero
+                    end do
+                 end do
+                 call stdlib_dgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_dlaset( 'L', nr-1,nr-1, zero,zero, a(2,1), lda )
+                 call stdlib_dgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_dgesvd to r**t
+                  ! .. copy r**t into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_dlaset( 'U', nr-1,nr-1, zero,zero, u(1,2), ldu )
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_dgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Dgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_dlacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_dlaset( 'L', nr-1, nr-1, zero, zero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_dgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_dlaset('A', m-nr, nr, zero, zero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_dlaset( 'A',nr,n1-nr,zero,zero,u(1,nr+1), ldu )
+                     call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_dormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_dgesvd to r**t
+                  ! .. copy r**t into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = (a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_dlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_dgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_dlapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_dlaset('G', n, n-nr, zero, zero, v(1,nr+1), ldv)
+                     call stdlib_dgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                               lwork-n, info )
+                     do p = 1, n
+                        do q = p + 1, n
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                     end do
+                     call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Dgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_dlacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_dlaset( 'L', nr-1, nr-1, zero, zero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_dgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_dlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_dlaset('G', n-nr, n, zero,zero, v(nr+1,1), ldv)
+                      call stdlib_dgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                lwork-n, info )
+                      call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the transposed matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_dgesvd to r**t [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_dlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite [v], the nr right
+                 ! singular vectors of r**t stored in [u](1:nr,1:nr) as transposed
+                    call stdlib_dgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       do q = p + 1, nr
+                          rtmp   = v(q,p)
+                          v(q,p) = v(p,q)
+                          v(p,q) = rtmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_dlapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_dlaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_dlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'dgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_dlaset('U',nr-1,nr-1, zero,zero, v(1,2),ldv)
+                                  
+                        call stdlib_dlaset('A',n,n-nr,zero,zero,v(1,nr+1),ldv)
+                        call stdlib_dgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                                  lwork-n, info )
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = v(q,p)
+                              v(q,p) = v(p,q)
+                              v(p,q) = rtmp
+                           end do
+                        end do
+                        call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = u(q,p)
+                              u(q,p) = u(p,q)
+                              u(p,q) = rtmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_dlaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_dlaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_dlaset('A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**t into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_dlaset('U',nr-1,nr-1,zero,zero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_dgeqrf( n, nr, u(1,nr+1), ldu, work(n+1),work(n+nr+1), lwork-&
+                                  n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = u(p,nr+q)
+                            end do
+                        end do
+                       call stdlib_dlaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_dgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, work(n+nr+1)&
+                                 ,lwork-n-nr, info )
+                       call stdlib_dlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_dlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_dlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_dormqr('R','C', n, n, nr, u(1,nr+1), ldu,work(n+1),v,ldv,work(&
+                                 n+nr+1),lwork-n-nr,ierr)
+                       call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_dlaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_dlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_dgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_dlacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_dlaset( 'L', nr-1,nr-1, zero,zero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_dgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_dlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_dlaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_dlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'dgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_dlacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_dlaset('L', nr-1,nr-1, zero,zero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_dlaset('A', n-nr,n, zero,zero, v(nr+1,1),ldv)
+                       call stdlib_dgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                 lwork-n, info )
+                       call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the transposed matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_dlaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_dlaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_dlaset( 'A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_dlacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_dlaset('L',nr-1,nr-1,zero,zero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_dgelqf( nr, n, u(nr+1,1), ldu, work(n+1),work(n+nr+1), lwork-n-&
+                                 nr, ierr )
+                       call stdlib_dlacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_dlaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_dgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, work(n+nr+&
+                                 1), lwork-n-nr, info )
+                       call stdlib_dlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_dlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_dlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_dormlq('R','N',n,n,nr,u(nr+1,1),ldu,work(n+1),v, ldv, work(n+&
+                                 nr+1),lwork-n-nr,ierr)
+                       call stdlib_dlapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_dlaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_dlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_dlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**t or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_dormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_dlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_dlaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_dlascl( 'G',0,0, one,sqrt(real(m,KIND=dp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_dgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_dgesvdq
+
+     !> DGGES3: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> DGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_dgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alphar, &
+               alphai, beta, vsl, ldvsl,vsr, ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_d) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           else if( lwork<6*n+16 .and. .not.lquery ) then
+              info = -19
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_dgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 6*n+16, 3*n+int( work ( 1 ),KIND=ilp) )
+              call stdlib_dormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_dorgqr( n, n, n, vsl, ldvsl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              call stdlib_dgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              call stdlib_dlaqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                        beta, vsl, ldvsl, vsr, ldvsr,work, -1, 0, ierr )
+              lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_dtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+                 beta, vsl, ldvsl, vsr, ldvsr,sdim, pvsl, pvsr, dif, work, -1, idum, 1,ierr )
+                           
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_dggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_dlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_dgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk,ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_dlaqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 50
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_dtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_dggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_dggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_dlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           50 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dgges3
+
+     !> DGGEV3: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_dggev3( jobvl, jobvr, n, a, lda, b, ldb, alphar,alphai, beta, vl, ldvl, vr,&
+                ldvr, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           else if( lwork<max( 1, 8*n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_dgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max(1, 8*n, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_dormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work, -1,ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_dorgqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              if( ilv ) then
+                 call stdlib_dgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+                 call stdlib_dlaqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              else
+                 call stdlib_dgghd3( 'N', 'N', n, 1, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, work, -&
+                           1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+                 call stdlib_dlaqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_dlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_dlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_dlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_dlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_dggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_dgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_dormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_dlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_dlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_dorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_dlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_dgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk, ierr )
+           else
+              call stdlib_dgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_dlaqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_dtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_dggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_dggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dggev3
+
+     !> DHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_dhseqr( job, compz, n, ilo, ihi, h, ldh, wr, wi, z,ldz, work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_dlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_dlahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           ! Local Arrays 
+           real(dp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: i, kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = real( max( 1, n ),KIND=dp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'DHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_dlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                        work, lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=dp), work( 1 ) )
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_dgebal ====
+              do i = 1, ilo - 1
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              do i = ihi + 1, n
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_dlaset( 'A', n, n, zero, one, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 wr( ilo ) = h( ilo, ilo )
+                 wi( ilo ) = zero
+                 return
+              end if
+              ! ==== stdlib_dlahqr/stdlib_dlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_dlaqr0 for big matrices; stdlib_dlahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_dlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           work, lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_dlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           info )
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_dlahqr failure!  stdlib_dlaqr0 sometimes succeeds
+                    ! .    when stdlib_dlahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_dlaqr0 directly. ====
+                       call stdlib_dlaqr0( wantt, wantz, n, ilo, kbot, h, ldh, wr,wi, ilo, ihi, z,&
+                                  ldz, work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_dlaqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_dlaqr0. ====
+                       call stdlib_dlacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = zero
+                       call stdlib_dlaset( 'A', nl, nl-n, zero, zero, hl( 1, n+1 ),nl )
+                       call stdlib_dlaqr0( wantt, wantz, nl, ilo, kbot, hl, nl, wr,wi, ilo, ihi, &
+                                 z, ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_dlacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_dlaset( 'L', n-2, n-2, zero, zero,&
+                         h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=dp), work( 1 ) )
+           end if
+     end subroutine stdlib_dhseqr
+
+     !> DLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, DLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by DLALSA.
+
+     pure subroutine stdlib_dlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: b(ldb,*)
+           real(dp), intent(out) :: bx(ldbx,*), work(*)
+           real(dp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, lf, ll, lvl, lvl2, nd, ndb1, ndiml, ndimr, &
+                     nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_dlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 50.
+           if( icompq==1 ) then
+              go to 50
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_dlasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_dgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,b( nlf, 1 ), ldb, &
+                        zero, bx( nlf, 1 ), ldbx )
+              call stdlib_dgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,b( nrf, 1 ), ldb, &
+                        zero, bx( nrf, 1 ), ldbx )
+           end do
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_dcopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_dlals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), work,info )
+              end do
+           end do
+           go to 90
+           ! icompq = 1: applying back the right singular vector factors.
+           50 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_70: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_dlals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), work,info )
+              end do
+           end do loop_70
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_dlasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_dgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,b( nlf, 1 ), &
+                        ldb, zero, bx( nlf, 1 ), ldbx )
+              call stdlib_dgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,b( nrf, 1 ), &
+                        ldb, zero, bx( nrf, 1 ), ldbx )
+           end do
+           90 continue
+           return
+     end subroutine stdlib_dlalsa
+
+     !> DLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_dlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: b(ldb,*), d(*), e(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           iwk, j, k, nlvl, nm1, nsize, nsub, nwork, perm, poles, s, sizei, smlszp, sqre, st, st1,&
+                      u, vt, z
+           real(dp) :: cs, eps, orgnrm, r, rcnd, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALSD', -info )
+              return
+           end if
+           eps = stdlib_dlamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_dlaset( 'A', 1, nrhs, zero, zero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_dlascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_drot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    work( i*2-1 ) = cs
+                    work( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = work( j*2-1 )
+                       sn = work( j*2 )
+                       call stdlib_drot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_dlanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_dlaset( 'A', n, nrhs, zero, zero, b, ldb )
+              return
+           end if
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              nwork = 1 + n*n
+              call stdlib_dlaset( 'A', n, n, zero, one, work, n )
+              call stdlib_dlasdq( 'U', 0, n, n, 0, nrhs, d, e, work, n, work, n, b,ldb, work( &
+                        nwork ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              tol = rcnd*abs( d( stdlib_idamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_dlaset( 'A', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_dlascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, work, n, b, ldb, zero,work( nwork ), &
+                        n )
+              call stdlib_dlacpy( 'A', n, nrhs, work( nwork ), n, b, ldb )
+              ! unscale.
+              call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_dlasrt( 'D', n, d, info )
+              call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=dp) / real( smlsiz+1,KIND=dp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           bx = givnum + 2*nlvl*n
+           nwork = bx + n*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_60: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_dcopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_dcopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_dlasdq.
+                    call stdlib_dlaset( 'A', nsize, nsize, zero, one,work( vt+st1 ), n )
+                    call stdlib_dlasdq( 'U', 0, nsize, nsize, 0, nrhs, d( st ),e( st ), work( vt+&
+                              st1 ), n, work( nwork ),n, b( st, 1 ), ldb, work( nwork ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    call stdlib_dlacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_dlasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), work( u+st1 &
+                    ), n, work( vt+st1 ),iwork( k+st1 ), work( difl+st1 ),work( difr+st1 ), work( &
+                    z+st1 ),work( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), n, iwork( &
+                    perm+st1 ),work( givnum+st1 ), work( c+st1 ),work( s+st1 ), work( nwork ), &
+                              iwork( iwk ),info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_dlalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( &
+                     difr+st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                     givcol+st1 ), n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+&
+                               st1 ), work( nwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_60
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_idamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_dlaset( 'A', 1, nrhs, zero, zero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_dlascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_dcopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 call stdlib_dgemm( 'T', 'N', nsize, nrhs, nsize, one,work( vt+st1 ), n, work( &
+                           bxst ), n, zero,b( st, 1 ), ldb )
+              else
+                 call stdlib_dlalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( difr+&
+                  st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( givcol+st1 ),&
+                   n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+st1 ), work( &
+                             nwork ),iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do
+           ! unscale and sort the singular values.
+           call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_dlasrt( 'D', n, d, info )
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_dlalsd
+
+     !> DLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_dlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(dp), parameter :: wilk1 = 0.75_dp
+           real(dp), parameter :: wilk2 = -0.4375_dp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_dlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(dp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,mod
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_dlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_dlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_dlaqr3 ====
+              call stdlib_dlaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_dlaqr5, stdlib_dlaqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=dp)
+                 return
+              end if
+              ! ==== stdlib_dlahqr/stdlib_dlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_dlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_dlaqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_dlaqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_dlanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_dlaqr4 or
+                       ! .    stdlib_dlahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_dlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_dlaqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, work,lwork, inf )
+                          else
+                             call stdlib_dlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_dlanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_dlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=dp)
+     end subroutine stdlib_dlaqr0
+
+     !> Aggressive early deflation:
+     !> DLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_dlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwk3, lwkopt, nmin
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_dgehrd ====
+              call stdlib_dgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_dormhr ====
+              call stdlib_dormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_dlaqr4 ====
+              call stdlib_dlaqr4( .true., .true., jw, 1, jw, t, ldt, sr, si, 1, jw,v, ldv, work, -&
+                        1, infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=dp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_dlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_dcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_dlaset( 'A', jw, jw, zero, one, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'DLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_dlaqr4( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, work, lwork, infqr )
+           else
+              call stdlib_dlahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, infqr )
+           end if
+           ! ==== stdlib_dtrexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not. bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_dtrexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_dtrexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_dtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_dlanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_dcopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_dlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_dlaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_dlarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_dlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_dlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_dgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_dlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_dcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_dormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_dgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_dlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_dgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_dlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_dgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_dlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=dp)
+     end subroutine stdlib_dlaqr3
+
+     !> DLAQR4: implements one level of recursion for DLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by DLAQR0 and, for large enough
+     !> deflation window size, it may be called by DLAQR3.  This
+     !> subroutine is identical to DLAQR0 except that it calls DLAQR2
+     !> instead of DLAQR3.
+     !> DLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_dlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(dp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(dp), parameter :: wilk1 = 0.75_dp
+           real(dp), parameter :: wilk2 = -0.4375_dp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_dlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(dp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(dp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,mod
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_dlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_dlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_dlaqr2 ====
+              call stdlib_dlaqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_dlaqr5, stdlib_dlaqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=dp)
+                 return
+              end if
+              ! ==== stdlib_dlahqr/stdlib_dlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_dlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_dlaqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_dlaqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_dlanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_dlahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_dlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_dlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( ks &
+                                    ), wi( ks ),1, 1, zdum, 1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_dlanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_dlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=dp)
+     end subroutine stdlib_dlaqr4
+
+     !> DLAQZ0: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by DGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from DGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_dlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alphar, &
+               alphai, beta,q, ldq, z, ldz, work, lwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           real(dp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), alphar(&
+                      * ),alphai( * ), beta( * ), work( * )
+           
+           ! local scalars
+           real(dp) :: smlnum, ulp, eshift, safmin, safmax, c1, s1, temp, swap
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_deflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost, i
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=dp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=dp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_dhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai,&
+                         beta, q, ldq, z, ldz, work,lwork, info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_dlaqz3
+           nw = max( nwr, nmin )
+           call stdlib_dlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_deflated, alphar,alphai, beta, work, nw, work, nw, work, -1, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_dlaqz4
+           call stdlib_dlaqz4( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alphar,alphai, beta, a, &
+                     lda, b, ldb, q, ldq, z, ldz, work,nbr, work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=dp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_dlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( iwantz==3 ) call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! get machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 3*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop-1, istop-2 ) ) <= max( smlnum,ulp*( abs( a( istop-1, istop-1 ) )+&
+                        abs( a( istop-2,istop-2 ) ) ) ) ) then
+                 a( istop-1, istop-2 ) = zero
+                 istop = istop-2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+&
+                        abs( a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = zero
+                 istop = istop-1
+                 ld = 0
+                 eshift = zero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+2, istart+1 ) ) <= max( smlnum,ulp*( abs( a( istart+1, istart+1 &
+                        ) )+abs( a( istart+2,istart+2 ) ) ) ) ) then
+                 a( istart+2, istart+1 ) = zero
+                 istart = istart+2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart )&
+                         )+abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = zero
+                 istart = istart+1
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = zero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 temp = zero
+                 if( k < istop ) then
+                    temp = temp+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    temp = temp+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*temp ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_dlartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = zero
+                       call stdlib_drot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_drot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_drot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_dlartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = zero
+                          call stdlib_drot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_drot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_drot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, s1 )
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_dlartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = zero
+                       call stdlib_drot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_drot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_drot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, s1 )
+                                    
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = zero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using stdlib_dhgeqz because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_dlaqz3( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_deflated,alphar, alphai, beta, work, nw, work( nw**2+1 ),&
+                         nw, work( 2*nw**2+1 ), lwork-2*nw**2, rec,aed_info )
+              if ( n_deflated > 0 ) then
+                 istop = istop-n_deflated
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( 100*n_deflated > nibble*( n_deflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_deflated-n_undeflated+1
+              ! shuffle shifts to put double shifts in front
+              ! this ensures that we don't split up a double shift
+              do i = shiftpos, shiftpos+n_undeflated-1, 2
+                 if( alphai( i )/=-alphai( i+1 ) ) then
+                    swap = alphar( i )
+                    alphar( i ) = alphar( i+1 )
+                    alphar( i+1 ) = alphar( i+2 )
+                    alphar( i+2 ) = swap
+                    swap = alphai( i )
+                    alphai( i ) = alphai( i+1 )
+                    alphai( i+1 ) = alphai( i+2 )
+                    alphai( i+2 ) = swap
+                    swap = beta( i )
+                    beta( i ) = beta( i+1 )
+                    beta( i+1 ) = beta( i+2 )
+                    beta( i+2 ) = swap
+                 end if
+              end do
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=dp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+one/( safmin*real( maxit,KIND=dp) )
+                 end if
+                 alphar( shiftpos ) = one
+                 alphar( shiftpos+1 ) = zero
+                 alphai( shiftpos ) = zero
+                 alphai( shiftpos+1 ) = zero
+                 beta( shiftpos ) = eshift
+                 beta( shiftpos+1 ) = eshift
+                 ns = 2
+              end if
+              ! time for a qz sweep
+              call stdlib_dlaqz4( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alphar( &
+              shiftpos ), alphai( shiftpos ),beta( shiftpos ), a, lda, b, ldb, q, ldq, z, ldz,&
+              work, nblock, work( nblock**2+1 ), nblock,work( 2*nblock**2+1 ), lwork-2*nblock**2,&
+                        sweep_info )
+           end do
+           ! call stdlib_dhgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_dhgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_dhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                  beta, q, ldq, z, ldz, work, lwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_dlaqz0
+
+     !> DLAQZ3: performs AED
+
+     recursive subroutine stdlib_dlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alphar, alphai, beta, qc, ldqc,zc, ldzc, work, lwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           real(dp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), alphar(&
+                      * ),alphai( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           real(dp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           real(dp), intent(out) :: work(*)
+           
+           ! local scalars
+           logical(lk) :: bulge
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, dtgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(dp) :: s, smlnum, ulp, safmin, safmax, c1, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = zero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_dtgexc( .true., .true., jw, a, lda, b, ldb, qc, ldqc, zc,ldzc, ifst, ilst, &
+                     work, -1, dtgexc_info )
+           lworkreq = int( work( 1 ),KIND=ilp)
+           call stdlib_dlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work, -1, rec+1, qz_small_info )
+           lworkreq = max( lworkreq, int( work( 1 ),KIND=ilp)+2*jw**2 )
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ3', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alphar( kwtop ) = a( kwtop, kwtop )
+              alphai( kwtop ) = zero
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = zero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_dlacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_dlacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_dlaset( 'FULL', jw, jw, zero, one, qc, ldqc )
+           call stdlib_dlaset( 'FULL', jw, jw, zero, one, zc, ldzc )
+           call stdlib_dlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work( 2*jw**2+1 ), lwork-2*jw**2,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_dlacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_dlacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == zero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                 bulge = .false.
+                 if ( kwbot-kwtop+1 >= 2 ) then
+                    bulge = a( kwbot, kwbot-1 ) /= zero
+                 end if
+                 if ( bulge ) then
+                    ! try to deflate complex conjugate eigenvalue pair
+                    temp = abs( a( kwbot, kwbot ) )+sqrt( abs( a( kwbot,kwbot-1 ) ) )*sqrt( abs( &
+                              a( kwbot-1, kwbot ) ) )
+                    if( temp == zero )then
+                       temp = abs( s )
+                    end if
+                    if ( max( abs( s*qc( 1, kwbot-kwtop ) ), abs( s*qc( 1,kwbot-kwtop+1 ) ) ) <= &
+                              max( smlnum,ulp*temp ) ) then
+                       ! deflatable
+                       kwbot = kwbot-2
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_dtgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,dtgexc_info )
+                                 
+                       k2 = k2+2
+                    end if
+                    k = k+2
+                 else
+                    ! try to deflate real eigenvalue
+                    temp = abs( a( kwbot, kwbot ) )
+                    if( temp == zero ) then
+                       temp = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*temp, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_dtgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,dtgexc_info )
+                                 
+                       k2 = k2+1
+                    end if
+                    k = k+1
+                 end if
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              bulge = .false.
+              if ( k < ihi ) then
+                 if ( a( k+1, k ) /= zero ) then
+                    bulge = .true.
+                 end if
+              end if
+              if ( bulge ) then
+                 ! 2x2 eigenvalue block
+                 call stdlib_dlag2( a( k, k ), lda, b( k, k ), ldb, safmin,beta( k ), beta( k+1 ),&
+                            alphar( k ),alphar( k+1 ), alphai( k ) )
+                 alphai( k+1 ) = -alphai( k )
+                 k = k+2
+              else
+                 ! 1x1 eigenvalue block
+                 alphar( k ) = a( k, k )
+                 alphai( k ) = zero
+                 beta( k ) = b( k, k )
+                 k = k+1
+              end if
+           end do
+           if ( kwtop /= ilo .and. s /= zero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 )*qc( 1,1:jw-nd )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_dlartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = zero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_drot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_drot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_drot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, s1 )
+                           
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 if ( ( k >= kwtop+1 ) .and. a( k+1, k-1 ) /= zero ) then
+                    ! move double pole block down and remove it
+                    do k2 = k-1, kwbot-2
+                       call stdlib_dlaqz2( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b,&
+                                  ldb, jw, kwtop, qc,ldqc, jw, kwtop, zc, ldzc )
+                    end do
+                    k = k-2
+                 else
+                    ! k points to single shift
+                    do k2 = k, kwbot-2
+                       ! move shift down
+                       call stdlib_dlartg( b( k2+1, k2+1 ), b( k2+1, k2 ), c1, s1,temp )
+                       b( k2+1, k2+1 ) = temp
+                       b( k2+1, k2 ) = zero
+                       call stdlib_drot( k2+2-istartm+1, a( istartm, k2+1 ), 1,a( istartm, k2 ), &
+                                 1, c1, s1 )
+                       call stdlib_drot( k2-istartm+1, b( istartm, k2+1 ), 1,b( istartm, k2 ), 1, &
+                                 c1, s1 )
+                       call stdlib_drot( jw, zc( 1, k2+1-kwtop+1 ), 1, zc( 1,k2-kwtop+1 ), 1, c1, &
+                                 s1 )
+                       call stdlib_dlartg( a( k2+1, k2 ), a( k2+2, k2 ), c1, s1,temp )
+                       a( k2+1, k2 ) = temp
+                       a( k2+2, k2 ) = zero
+                       call stdlib_drot( istopm-k2, a( k2+1, k2+1 ), lda, a( k2+2,k2+1 ), lda, c1,&
+                                  s1 )
+                       call stdlib_drot( istopm-k2, b( k2+1, k2+1 ), ldb, b( k2+2,k2+1 ), ldb, c1,&
+                                  s1 )
+                       call stdlib_drot( jw, qc( 1, k2+1-kwtop+1 ), 1, qc( 1,k2+2-kwtop+1 ), 1, &
+                                 c1, s1 )
+                    end do
+                    ! remove the shift
+                    call stdlib_dlartg( b( kwbot, kwbot ), b( kwbot, kwbot-1 ), c1,s1, temp )
+                              
+                    b( kwbot, kwbot ) = temp
+                    b( kwbot, kwbot-1 ) = zero
+                    call stdlib_drot( kwbot-istartm, b( istartm, kwbot ), 1,b( istartm, kwbot-1 ),&
+                               1, c1, s1 )
+                    call stdlib_drot( kwbot-istartm+1, a( istartm, kwbot ), 1,a( istartm, kwbot-1 &
+                              ), 1, c1, s1 )
+                    call stdlib_drot( jw, zc( 1, kwbot-kwtop+1 ), 1, zc( 1,kwbot-1-kwtop+1 ), 1, &
+                              c1, s1 )
+                    k = k-1
+                 end if
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_dgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, zero, work, jw )
+              call stdlib_dlacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_dgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, zero, work, jw )
+              call stdlib_dlacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_dgemm( 'N', 'N', n, jw, jw, one, q( 1, kwtop ), ldq, qc,ldqc, zero, &
+                        work, n )
+              call stdlib_dlacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_dgemm( 'N', 'N', kwtop-istartm, jw, jw, one, a( istartm,kwtop ), lda, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_dlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop &
+                        ), lda )
+              call stdlib_dgemm( 'N', 'N', kwtop-istartm, jw, jw, one, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_dlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop &
+                        ), ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_dgemm( 'N', 'N', n, jw, jw, one, z( 1, kwtop ), ldz, zc,ldzc, zero, &
+                        work, n )
+              call stdlib_dlacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_dlaqz3
+
+     !> To find the desired eigenvalues of a given real symmetric
+     !> tridiagonal matrix T, DLARRE: sets any "small" off-diagonal
+     !> elements to zero, and for each unreduced block T_i, it finds
+     !> (a) a suitable shift at one end of the block's spectrum,
+     !> (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and
+     !> (c) eigenvalues of each L_i D_i L_i^T.
+     !> The representations and eigenvalues found are then used by
+     !> DSTEMR to compute the eigenvectors of T.
+     !> The accuracy varies depending on whether bisection is used to
+     !> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to
+     !> conpute all and then discard any unwanted one.
+     !> As an added benefit, DLARRE also outputs the n
+     !> Gerschgorin intervals for the matrices L_i D_i L_i^T.
+
+     pure subroutine stdlib_dlarre( range, n, vl, vu, il, iu, d, e, e2,rtol1, rtol2, spltol, &
+               nsplit, isplit, m,w, werr, wgap, iblock, indexw, gers, pivmin,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(dp), intent(out) :: pivmin
+           real(dp), intent(in) :: rtol1, rtol2, spltol
+           real(dp), intent(inout) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*), indexw(*)
+           real(dp), intent(inout) :: d(*), e(*), e2(*)
+           real(dp), intent(out) :: gers(*), w(*), werr(*), wgap(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: hndrd = 100.0_dp
+           real(dp), parameter :: pert = 8.0_dp
+           real(dp), parameter :: fourth = one/four
+           real(dp), parameter :: fac = half
+           real(dp), parameter :: maxgrowth = 64.0_dp
+           real(dp), parameter :: fudge = 2.0_dp
+           integer(ilp), parameter :: maxtry = 6
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: indrng = 2
+           integer(ilp), parameter :: valrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: forceb, norep, usedqd
+           integer(ilp) :: cnt, cnt1, cnt2, i, ibegin, idum, iend, iinfo, in, indl, indu, irange, &
+                     j, jblk, mb, mm, wbegin, wend
+           real(dp) :: avgap, bsrtol, clwdth, dmax, dpivot, eabs, emax, eold, eps, gl, gu, isleft,&
+                      isrght, rtl, rtol, s1, s2, safmin, sgndef, sigma, spdiam, tau, tmp, tmp1
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           end if
+           m = 0
+           ! get machine constants
+           safmin = stdlib_dlamch( 'S' )
+           eps = stdlib_dlamch( 'P' )
+           ! set parameters
+           rtl = sqrt(eps)
+           bsrtol = sqrt(eps)
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 wgap(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+                 gers(1) = d( 1 )
+                 gers(2) = d( 1 )
+              endif
+              ! store the shift for the initial rrr, which is zero in this case
+              e(1) = zero
+              return
+           end if
+           ! general case: tridiagonal matrix of order > 1
+           ! init werr, wgap. compute gerschgorin intervals and spectral diameter.
+           ! compute maximum off-diagonal entry and pivmin.
+           gl = d(1)
+           gu = d(1)
+           eold = zero
+           emax = zero
+           e(n) = zero
+           do i = 1,n
+              werr(i) = zero
+              wgap(i) = zero
+              eabs = abs( e(i) )
+              if( eabs >= emax ) then
+                 emax = eabs
+              end if
+              tmp1 = eabs + eold
+              gers( 2*i-1) = d(i) - tmp1
+              gl =  min( gl, gers( 2*i - 1))
+              gers( 2*i ) = d(i) + tmp1
+              gu = max( gu, gers(2*i) )
+              eold  = eabs
+           end do
+           ! the minimum pivot allowed in the sturm sequence for t
+           pivmin = safmin * max( one, emax**2 )
+           ! compute spectral diameter. the gerschgorin bounds give an
+           ! estimate that is wrong by at most a factor of sqrt(2)
+           spdiam = gu - gl
+           ! compute splitting points
+           call stdlib_dlarra( n, d, e, e2, spltol, spdiam,nsplit, isplit, iinfo )
+           ! can force use of bisection instead of faster dqds.
+           ! option left in the code for future multisection work.
+           forceb = .false.
+           ! initialize usedqd, dqds should be used for allrng unless someone
+           ! explicitly wants bisection.
+           usedqd = (( irange==allrng ) .and. (.not.forceb))
+           if( (irange==allrng) .and. (.not. forceb) ) then
+              ! set interval [vl,vu] that contains all eigenvalues
+              vl = gl
+              vu = gu
+           else
+              ! we call stdlib_dlarrd to find crude approximations to the eigenvalues
+              ! in the desired range. in case irange = indrng, we also obtain the
+              ! interval (vl,vu] that contains all the wanted eigenvalues.
+              ! an interval [left,right] has converged if
+              ! right-left<rtol*max(abs(left),abs(right))
+              ! stdlib_dlarrd needs a work of size 4*n, iwork of size 3*n
+              call stdlib_dlarrd( range, 'B', n, vl, vu, il, iu, gers,bsrtol, d, e, e2, pivmin, &
+                        nsplit, isplit,mm, w, werr, vl, vu, iblock, indexw,work, iwork, iinfo )
+              if( iinfo/=0 ) then
+                 info = -1
+                 return
+              endif
+              ! make sure that the entries m+1 to n in w, werr, iblock, indexw are 0
+              do i = mm+1,n
+                 w( i ) = zero
+                 werr( i ) = zero
+                 iblock( i ) = 0
+                 indexw( i ) = 0
+              end do
+           end if
+      ! **
+           ! loop over unreduced blocks
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, nsplit
+              iend = isplit( jblk )
+              in = iend - ibegin + 1
+              ! 1 x 1 block
+              if( in==1 ) then
+                 if( (irange==allrng).or.( (irange==valrng).and.( d( ibegin )>vl ).and.( d( &
+                           ibegin )<=vu ) ).or. ( (irange==indrng).and.(iblock(wbegin)==jblk))) then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    wgap(m) = zero
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                    wbegin = wbegin + 1
+                 endif
+                 ! e( iend ) holds the shift for the initial rrr
+                 e( iend ) = zero
+                 ibegin = iend + 1
+                 cycle loop_170
+              end if
+              ! blocks of size larger than 1x1
+              ! e( iend ) will hold the shift for the initial rrr, for now set it =0
+              e( iend ) = zero
+              ! find local outer bounds gl,gu for the block
+              gl = d(ibegin)
+              gu = d(ibegin)
+              do i = ibegin , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              if(.not. ((irange==allrng).and.(.not.forceb)) ) then
+                 ! count the number of eigenvalues in the current block.
+                 mb = 0
+                 do i = wbegin,mm
+                    if( iblock(i)==jblk ) then
+                       mb = mb+1
+                    else
+                       goto 21
+                    endif
+                 end do
+                 21 continue
+                 if( mb==0) then
+                    ! no eigenvalue in the current block lies in the desired range
+                    ! e( iend ) holds the shift for the initial rrr
+                    e( iend ) = zero
+                    ibegin = iend + 1
+                    cycle loop_170
+                 else
+                    ! decide whether dqds or bisection is more efficient
+                    usedqd = ( (mb > fac*in) .and. (.not.forceb) )
+                    wend = wbegin + mb - 1
+                    ! calculate gaps for the current block
+                    ! in later stages, when representations for individual
+                    ! eigenvalues are different, we use sigma = e( iend ).
+                    sigma = zero
+                    do i = wbegin, wend - 1
+                       wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                    end do
+                    wgap( wend ) = max( zero,vu - sigma - (w( wend )+werr( wend )))
+                    ! find local index of the first and last desired evalue.
+                    indl = indexw(wbegin)
+                    indu = indexw( wend )
+                 endif
+              endif
+              if(( (irange==allrng) .and. (.not. forceb) ).or.usedqd) then
+                 ! case of dqds
+                 ! find approximations to the extremal eigenvalues of the block
+                 call stdlib_dlarrk( in, 1, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1, &
+                           iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isleft = max(gl, tmp - tmp1- hndrd * eps* abs(tmp - tmp1))
+                 call stdlib_dlarrk( in, in, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1,&
+                            iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isrght = min(gu, tmp + tmp1+ hndrd * eps * abs(tmp + tmp1))
+                 ! improve the estimate of the spectral diameter
+                 spdiam = isrght - isleft
+              else
+                 ! case of bisection
+                 ! find approximations to the wanted extremal eigenvalues
+                 isleft = max(gl, w(wbegin) - werr(wbegin)- hndrd * eps*abs(w(wbegin)- werr(&
+                           wbegin) ))
+                 isrght = min(gu,w(wend) + werr(wend)+ hndrd * eps * abs(w(wend)+ werr(wend)))
+                           
+              endif
+              ! decide whether the base representation for the current block
+              ! l_jblk d_jblk l_jblk^t = t_jblk - sigma_jblk i
+              ! should be on the left or the right end of the current block.
+              ! the strategy is to shift to the end which is "more populated"
+              ! furthermore, decide whether to use dqds for the computation of
+              ! dqds is chosen if all eigenvalues are desired or the number of
+              ! eigenvalues to be computed is large compared to the blocksize.
+              if( ( irange==allrng ) .and. (.not.forceb) ) then
+                 ! if all the eigenvalues have to be computed, we use dqd
+                 usedqd = .true.
+                 ! indl is the local index of the first eigenvalue to compute
+                 indl = 1
+                 indu = in
+                 ! mb =  number of eigenvalues to compute
+                 mb = in
+                 wend = wbegin + mb - 1
+                 ! define 1/4 and 3/4 points of the spectrum
+                 s1 = isleft + fourth * spdiam
+                 s2 = isrght - fourth * spdiam
+              else
+                 ! stdlib_dlarrd has computed iblock and indexw for each eigenvalue
+                 ! approximation.
+                 ! choose sigma
+                 if( usedqd ) then
+                    s1 = isleft + fourth * spdiam
+                    s2 = isrght - fourth * spdiam
+                 else
+                    tmp = min(isrght,vu) -  max(isleft,vl)
+                    s1 =  max(isleft,vl) + fourth * tmp
+                    s2 =  min(isrght,vu) - fourth * tmp
+                 endif
+              endif
+              ! compute the negcount at the 1/4 and 3/4 points
+              if(mb>1) then
+                 call stdlib_dlarrc( 'T', in, s1, s2, d(ibegin),e(ibegin), pivmin, cnt, cnt1, &
+                           cnt2, iinfo)
+              endif
+              if(mb==1) then
+                 sigma = gl
+                 sgndef = one
+              elseif( cnt1 - indl >= indu - cnt2 ) then
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = max(isleft,gl)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get pos def matrix
+                    ! for dqds
+                    sigma = isleft
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = max(isleft,vl)
+                 endif
+                 sgndef = one
+              else
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = min(isrght,gu)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get neg def matrix
+                    ! for dqds
+                    sigma = isrght
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = min(isrght,vu)
+                 endif
+                 sgndef = -one
+              endif
+              ! an initial sigma has been chosen that will be used for computing
+              ! t - sigma i = l d l^t
+              ! define the increment tau of the shift in case the initial shift
+              ! needs to be refined to obtain a factorization with not too much
+              ! element growth.
+              if( usedqd ) then
+                 ! the initial sigma was to the outer end of the spectrum
+                 ! the matrix is definite and we need not retreat.
+                 tau = spdiam*eps*n + two*pivmin
+                 tau = max( tau,two*eps*abs(sigma) )
+              else
+                 if(mb>1) then
+                    clwdth = w(wend) + werr(wend) - w(wbegin) - werr(wbegin)
+                    avgap = abs(clwdth / real(wend-wbegin,KIND=dp))
+                    if( sgndef==one ) then
+                       tau = half*max(wgap(wbegin),avgap)
+                       tau = max(tau,werr(wbegin))
+                    else
+                       tau = half*max(wgap(wend-1),avgap)
+                       tau = max(tau,werr(wend))
+                    endif
+                 else
+                    tau = werr(wbegin)
+                 endif
+              endif
+              loop_80: do idum = 1, maxtry
+                 ! compute l d l^t factorization of tridiagonal matrix t - sigma i.
+                 ! store d in work(1:in), l in work(in+1:2*in), and reciprocals of
+                 ! pivots in work(2*in+1:3*in)
+                 dpivot = d( ibegin ) - sigma
+                 work( 1 ) = dpivot
+                 dmax = abs( work(1) )
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*in+i ) = one / work( i )
+                    tmp = e( j )*work( 2*in+i )
+                    work( in+i ) = tmp
+                    dpivot = ( d( j+1 )-sigma ) - tmp*e( j )
+                    work( i+1 ) = dpivot
+                    dmax = max( dmax, abs(dpivot) )
+                    j = j + 1
+                 end do
+                 ! check for element growth
+                 if( dmax > maxgrowth*spdiam ) then
+                    norep = .true.
+                 else
+                    norep = .false.
+                 endif
+                 if( usedqd .and. .not.norep ) then
+                    ! ensure the definiteness of the representation
+                    ! all entries of d (of l d l^t) must have the same sign
+                    do i = 1, in
+                       tmp = sgndef*work( i )
+                       if( tmp<zero ) norep = .true.
+                    end do
+                 endif
+                 if(norep) then
+                    ! note that in the case of irange=allrng, we use the gerschgorin
+                    ! shift which makes the matrix definite. so we should end up
+                    ! here really only in the case of irange = valrng or indrng.
+                    if( idum==maxtry-1 ) then
+                       if( sgndef==one ) then
+                          ! the fudged gerschgorin shift should succeed
+                          sigma =gl - fudge*spdiam*eps*n - fudge*two*pivmin
+                       else
+                          sigma =gu + fudge*spdiam*eps*n + fudge*two*pivmin
+                       end if
+                    else
+                       sigma = sigma - sgndef * tau
+                       tau = two * tau
+                    end if
+                 else
+                    ! an initial rrr is found
+                    go to 83
+                 end if
+              end do loop_80
+              ! if the program reaches this point, no base representation could be
+              ! found in maxtry iterations.
+              info = 2
+              return
+              83 continue
+              ! at this point, we have found an initial base representation
+              ! t - sigma i = l d l^t with not too much element growth.
+              ! store the shift.
+              e( iend ) = sigma
+              ! store d and l.
+              call stdlib_dcopy( in, work, 1, d( ibegin ), 1 )
+              call stdlib_dcopy( in-1, work( in+1 ), 1, e( ibegin ), 1 )
+              if(mb>1 ) then
+                 ! perturb each entry of the base representation by a small
+                 ! (but random) relative amount to overcome difficulties with
+                 ! glued matrices.
+                 do i = 1, 4
+                    iseed( i ) = 1
+                 end do
+                 call stdlib_dlarnv(2, iseed, 2*in-1, work(1))
+                 do i = 1,in-1
+                    d(ibegin+i-1) = d(ibegin+i-1)*(one+eps*pert*work(i))
+                    e(ibegin+i-1) = e(ibegin+i-1)*(one+eps*pert*work(in+i))
+                 end do
+                 d(iend) = d(iend)*(one+eps*four*work(in))
+              endif
+              ! don't update the gerschgorin intervals because keeping track
+              ! of the updates would be too much work in stdlib_dlarrv.
+              ! we update w instead and use it to locate the proper gerschgorin
+              ! intervals.
+              ! compute the required eigenvalues of l d l' by bisection or dqds
+              if ( .not.usedqd ) then
+                 ! if stdlib_dlarrd has been used, shift the eigenvalue approximations
+                 ! according to their representation. this is necessary for
+                 ! a uniform stdlib_dlarrv since dqds computes eigenvalues of the
+                 ! shifted representation. in stdlib_dlarrv, w will always hold the
+                 ! unshifted eigenvalue approximation.
+                 do j=wbegin,wend
+                    w(j) = w(j) - sigma
+                    werr(j) = werr(j) + abs(w(j)) * eps
+                 end do
+                 ! call stdlib_dlarrb to reduce eigenvalue error of the approximations
+                 ! from stdlib_dlarrd
+                 do i = ibegin, iend-1
+                    work( i ) = d( i ) * e( i )**2
+                 end do
+                 ! use bisection to find ev from indl to indu
+                 call stdlib_dlarrb(in, d(ibegin), work(ibegin),indl, indu, rtol1, rtol2, indl-1,&
+                 w(wbegin), wgap(wbegin), werr(wbegin),work( 2*n+1 ), iwork, pivmin, spdiam,in, &
+                           iinfo )
+                 if( iinfo /= 0 ) then
+                    info = -4
+                    return
+                 end if
+                 ! stdlib_dlarrb computes all gaps correctly except for the last one
+                 ! record distance to vu/gu
+                 wgap( wend ) = max( zero,( vu-sigma ) - ( w( wend ) + werr( wend ) ) )
+                 do i = indl, indu
+                    m = m + 1
+                    iblock(m) = jblk
+                    indexw(m) = i
+                 end do
+              else
+                 ! call dqds to get all eigs (and then possibly delete unwanted
+                 ! eigenvalues).
+                 ! note that dqds finds the eigenvalues of the l d l^t representation
+                 ! of t to high relative accuracy. high relative accuracy
+                 ! might be lost when the shift of the rrr is subtracted to obtain
+                 ! the eigenvalues of t. however, t is not guaranteed to define its
+                 ! eigenvalues to high relative accuracy anyway.
+                 ! set rtol to the order of the tolerance used in stdlib_dlasq2
+                 ! this is an estimated error, the worst case bound is 4*n*eps
+                 ! which is usually too large and requires unnecessary work to be
+                 ! done by bisection when computing the eigenvectors
+                 rtol = log(real(in,KIND=dp)) * four * eps
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*i-1 ) = abs( d( j ) )
+                    work( 2*i ) = e( j )*e( j )*work( 2*i-1 )
+                    j = j + 1
+                 end do
+                 work( 2*in-1 ) = abs( d( iend ) )
+                 work( 2*in ) = zero
+                 call stdlib_dlasq2( in, work, iinfo )
+                 if( iinfo /= 0 ) then
+                    ! if iinfo = -5 then an index is part of a tight cluster
+                    ! and should be changed. the index is in iwork(1) and the
+                    ! gap is in work(n+1)
+                    info = -5
+                    return
+                 else
+                    ! test that all eigenvalues are positive as expected
+                    do i = 1, in
+                       if( work( i )<zero ) then
+                          info = -6
+                          return
+                       endif
+                    end do
+                 end if
+                 if( sgndef>zero ) then
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = work( in-i+1 )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 else
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = -work( i )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 end if
+                 do i = m - mb + 1, m
+                    ! the value of rtol below should be the tolerance in stdlib_dlasq2
+                    werr( i ) = rtol * abs( w(i) )
+                 end do
+                 do i = m - mb + 1, m - 1
+                    ! compute the right gap between the intervals
+                    wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                 end do
+                 wgap( m ) = max( zero,( vu-sigma ) - ( w( m ) + werr( m ) ) )
+              end if
+              ! proceed with next block
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_dlarre
+
+     !> Using a divide and conquer approach, DLASD0: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M
+     !> matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+     !> The algorithm computes orthogonal matrices U and VT such that
+     !> B = U * S * VT. The singular values S are overwritten on D.
+     !> A related subroutine, DLASDA, computes only the singular values,
+     !> and optionally, the singular vectors in compact form.
+
+     pure subroutine stdlib_dlasd0( n, sqre, d, e, u, ldu, vt, ldvt, smlsiz, iwork,work, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n, smlsiz, sqre
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqc, im1, inode, itemp, iwk, j, lf, ll, lvl, m, ncc,&
+                      nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqrei
+           real(dp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           end if
+           m = n + sqre
+           if( ldu<n ) then
+              info = -6
+           else if( ldvt<m ) then
+              info = -8
+           else if( smlsiz<3 ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD0', -info )
+              return
+           end if
+           ! if the input matrix is too small, call stdlib_dlasdq to find the svd.
+           if( n<=smlsiz ) then
+              call stdlib_dlasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work, &
+                        info )
+              return
+           end if
+           ! set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           call stdlib_dlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_dlasdq.
+           ndb1 = ( nd+1 ) / 2
+           ncc = 0
+           loop_30: do i = ndb1, nd
+           ! ic : center row of each node
+           ! nl : number of rows of left  subproblem
+           ! nr : number of rows of right subproblem
+           ! nlf: starting row of the left   subproblem
+           ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nrp1 = nr + 1
+              nlf = ic - nl
+              nrf = ic + 1
+              sqrei = 1
+              call stdlib_dlasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ), e( nlf ),vt( nlf, nlf )&
+                        , ldvt, u( nlf, nlf ), ldu,u( nlf, nlf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + nlf - 2
+              do j = 1, nl
+                 iwork( itemp+j ) = j
+              end do
+              if( i==nd ) then
+                 sqrei = sqre
+              else
+                 sqrei = 1
+              end if
+              nrp1 = nr + sqrei
+              call stdlib_dlasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ), e( nrf ),vt( nrf, nrf )&
+                        , ldvt, u( nrf, nrf ), ldu,u( nrf, nrf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + ic
+              do j = 1, nr
+                 iwork( itemp+j-1 ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           loop_50: do lvl = nlvl, 1, -1
+              ! find the first node lf and last node ll on the
+              ! current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 if( ( sqre==0 ) .and. ( i==ll ) ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 idxqc = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 call stdlib_dlasd1( nl, nr, sqrei, d( nlf ), alpha, beta,u( nlf, nlf ), ldu, vt( &
+                           nlf, nlf ), ldvt,iwork( idxqc ), iwork( iwk ), work, info )
+              ! report the possible convergence failure.
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do
+           end do loop_50
+           return
+     end subroutine stdlib_dlasd0
+
+     !> Using a divide and conquer approach, DLASDA: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+     !> B with diagonal D and offdiagonal E, where M = N + SQRE. The
+     !> algorithm computes the singular values in the SVD B = U * S * VT.
+     !> The orthogonal matrices U and VT are optionally computed in
+     !> compact form.
+     !> A related subroutine, DLASD0, computes the singular values and
+     !> the singular vectors in explicit form.
+
+     pure subroutine stdlib_dlasda( icompq, smlsiz, n, sqre, d, e, u, ldu, vt, k,difl, difr, z, &
+               poles, givptr, givcol, ldgcol,perm, givnum, c, s, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldgcol, ldu, n, smlsiz, sqre
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), givptr(*), iwork(*), k(*), perm(ldgcol,&
+                     *)
+           real(dp), intent(out) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), &
+                     s(*), u(ldu,*), vt(ldu,*), work(*), z(ldu,*)
+           real(dp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqi, im1, inode, itemp, iwk, j, lf, ll, lvl, lvl2, &
+           m, ncc, nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, nru, nwork1, &
+                     nwork2, smlszp, sqrei, vf, vfi, vl, vli
+           real(dp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldu<( n+sqre ) ) then
+              info = -8
+           else if( ldgcol<n ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASDA', -info )
+              return
+           end if
+           m = n + sqre
+           ! if the input matrix is too small, call stdlib_dlasdq to find the svd.
+           if( n<=smlsiz ) then
+              if( icompq==0 ) then
+                 call stdlib_dlasdq( 'U', sqre, n, 0, 0, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              else
+                 call stdlib_dlasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              end if
+              return
+           end if
+           ! book-keeping and  set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           ncc = 0
+           nru = 0
+           smlszp = smlsiz + 1
+           vf = 1
+           vl = vf + m
+           nwork1 = vl + m
+           nwork2 = nwork1 + smlszp*smlszp
+           call stdlib_dlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_dlasdq.
+           ndb1 = ( nd+1 ) / 2
+           loop_30: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              idxqi = idxq + nlf - 2
+              vfi = vf + nlf - 1
+              vli = vl + nlf - 1
+              sqrei = 1
+              if( icompq==0 ) then
+                 call stdlib_dlaset( 'A', nlp1, nlp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_dlasdq( 'U', sqrei, nl, nlp1, nru, ncc, d( nlf ),e( nlf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nl, work( nwork2 ), nl,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + nl*smlszp
+                 call stdlib_dcopy( nlp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_dcopy( nlp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_dlaset( 'A', nl, nl, zero, one, u( nlf, 1 ), ldu )
+                 call stdlib_dlaset( 'A', nlp1, nlp1, zero, one, vt( nlf, 1 ), ldu )
+                 call stdlib_dlasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ),e( nlf ), vt( nlf, 1 &
+                           ), ldu, u( nlf, 1 ), ldu,u( nlf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_dcopy( nlp1, vt( nlf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_dcopy( nlp1, vt( nlf, nlp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nl
+                 iwork( idxqi+j ) = j
+              end do
+              if( ( i==nd ) .and. ( sqre==0 ) ) then
+                 sqrei = 0
+              else
+                 sqrei = 1
+              end if
+              idxqi = idxqi + nlp1
+              vfi = vfi + nlp1
+              vli = vli + nlp1
+              nrp1 = nr + sqrei
+              if( icompq==0 ) then
+                 call stdlib_dlaset( 'A', nrp1, nrp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_dlasdq( 'U', sqrei, nr, nrp1, nru, ncc, d( nrf ),e( nrf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nr, work( nwork2 ), nr,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + ( nrp1-1 )*smlszp
+                 call stdlib_dcopy( nrp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_dcopy( nrp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_dlaset( 'A', nr, nr, zero, one, u( nrf, 1 ), ldu )
+                 call stdlib_dlaset( 'A', nrp1, nrp1, zero, one, vt( nrf, 1 ), ldu )
+                 call stdlib_dlasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ),e( nrf ), vt( nrf, 1 &
+                           ), ldu, u( nrf, 1 ), ldu,u( nrf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_dcopy( nrp1, vt( nrf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_dcopy( nrp1, vt( nrf, nrp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nr
+                 iwork( idxqi+j ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           j = 2**nlvl
+           loop_50: do lvl = nlvl, 1, -1
+              lvl2 = lvl*2 - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              loop_40: do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 vfi = vf + nlf - 1
+                 vli = vl + nlf - 1
+                 idxqi = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 if( icompq==0 ) then
+                    call stdlib_dlasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm, givptr( 1 ), givcol,ldgcol, givnum, ldu, &
+                    poles, difl, difr, z,k( 1 ), c( 1 ), s( 1 ), work( nwork1 ),iwork( iwk ), &
+                              info )
+                 else
+                    j = j - 1
+                    call stdlib_dlasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ),&
+                     ldgcol,givnum( nlf, lvl2 ), ldu,poles( nlf, lvl2 ), difl( nlf, lvl ),difr( &
+                     nlf, lvl2 ), z( nlf, lvl ), k( j ),c( j ), s( j ), work( nwork1 ),iwork( iwk &
+                               ), info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do loop_40
+           end do loop_50
+           return
+     end subroutine stdlib_dlasda
+
+     !> DLASDQ: computes the singular value decomposition (SVD) of a real
+     !> (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+     !> E, accumulating the transformations if desired. Letting B denote
+     !> the input bidiagonal matrix, the algorithm computes orthogonal
+     !> matrices Q and P such that B = Q * S * P**T (P**T denotes the transpose
+     !> of P). The singular values S are overwritten on D.
+     !> The input matrix U  is changed to U  * Q  if desired.
+     !> The input matrix VT is changed to P**T * VT if desired.
+     !> The input matrix C  is changed to Q**T * C  if desired.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3, for a detailed description of the algorithm.
+
+     pure subroutine stdlib_dlasdq( uplo, sqre, n, ncvt, nru, ncc, d, e, vt, ldvt,u, ldu, c, ldc, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru, sqre
+           ! Array Arguments 
+           real(dp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: rotate
+           integer(ilp) :: i, isub, iuplo, j, np1, sqre1
+           real(dp) :: cs, r, smin, sn
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( iuplo==0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncvt<0 ) then
+              info = -4
+           else if( nru<0 ) then
+              info = -5
+           else if( ncc<0 ) then
+              info = -6
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -10
+           else if( ldu<max( 1, nru ) ) then
+              info = -12
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASDQ', -info )
+              return
+           end if
+           if( n==0 )return
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           np1 = n + 1
+           sqre1 = sqre
+           ! if matrix non-square upper bidiagonal, rotate to be lower
+           ! bidiagonal.  the rotations are on the right.
+           if( ( iuplo==1 ) .and. ( sqre1==1 ) ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              call stdlib_dlartg( d( n ), e( n ), cs, sn, r )
+              d( n ) = r
+              e( n ) = zero
+              if( rotate ) then
+                 work( n ) = cs
+                 work( n+n ) = sn
+              end if
+              iuplo = 2
+              sqre1 = 0
+              ! update singular vectors if desired.
+              if( ncvt>0 )call stdlib_dlasr( 'L', 'V', 'F', np1, ncvt, work( 1 ),work( np1 ), vt, &
+                        ldvt )
+           end if
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left.
+           if( iuplo==2 ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              ! if matrix (n+1)-by-n lower bidiagonal, one additional
+              ! rotation is needed.
+              if( sqre1==1 ) then
+                 call stdlib_dlartg( d( n ), e( n ), cs, sn, r )
+                 d( n ) = r
+                 if( rotate ) then
+                    work( n ) = cs
+                    work( n+n ) = sn
+                 end if
+              end if
+              ! update singular vectors if desired.
+              if( nru>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_dlasr( 'R', 'V', 'F', nru, n, work( 1 ),work( np1 ), u, ldu )
+                              
+                 else
+                    call stdlib_dlasr( 'R', 'V', 'F', nru, np1, work( 1 ),work( np1 ), u, ldu )
+                              
+                 end if
+              end if
+              if( ncc>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_dlasr( 'L', 'V', 'F', n, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 else
+                    call stdlib_dlasr( 'L', 'V', 'F', np1, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 end if
+              end if
+           end if
+           ! call stdlib_dbdsqr to compute the svd of the reduced real
+           ! n-by-n upper bidiagonal matrix.
+           call stdlib_dbdsqr( 'U', n, ncvt, nru, ncc, d, e, vt, ldvt, u, ldu, c,ldc, work, info )
+                     
+           ! sort the singular values into ascending order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n
+              ! scan for smallest d(i).
+              isub = i
+              smin = d( i )
+              do j = i + 1, n
+                 if( d( j )<smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=i ) then
+                 ! swap singular values and vectors.
+                 d( isub ) = d( i )
+                 d( i ) = smin
+                 if( ncvt>0 )call stdlib_dswap( ncvt, vt( isub, 1 ), ldvt, vt( i, 1 ), ldvt )
+                           
+                 if( nru>0 )call stdlib_dswap( nru, u( 1, isub ), 1, u( 1, i ), 1 )
+                 if( ncc>0 )call stdlib_dswap( ncc, c( isub, 1 ), ldc, c( i, 1 ), ldc )
+              end if
+           end do
+           return
+     end subroutine stdlib_dlasdq
+
+     !> DLASQ1: computes the singular values of a real N-by-N bidiagonal
+     !> matrix with diagonal D and off-diagonal E. The singular values
+     !> are computed to high relative accuracy, in the absence of
+     !> denormalization, underflow and overflow. The algorithm was first
+     !> presented in
+     !> "Accurate singular values and differential qd algorithms" by K. V.
+     !> Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+     !> 1994,
+     !> and the present implementation is described in "An implementation of
+     !> the dqds Algorithm (Positive Case)", LAPACK Working Note.
+
+     pure subroutine stdlib_dlasq1( n, d, e, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo
+           real(dp) :: eps, scale, safmin, sigmn, sigmx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLASQ1', -info )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              d( 1 ) = abs( d( 1 ) )
+              return
+           else if( n==2 ) then
+              call stdlib_dlas2( d( 1 ), e( 1 ), d( 2 ), sigmn, sigmx )
+              d( 1 ) = sigmx
+              d( 2 ) = sigmn
+              return
+           end if
+           ! estimate the largest singular value.
+           sigmx = zero
+           do i = 1, n - 1
+              d( i ) = abs( d( i ) )
+              sigmx = max( sigmx, abs( e( i ) ) )
+           end do
+           d( n ) = abs( d( n ) )
+           ! early return if sigmx is zero (matrix is already diagonal).
+           if( sigmx==zero ) then
+              call stdlib_dlasrt( 'D', n, d, iinfo )
+              return
+           end if
+           do i = 1, n
+              sigmx = max( sigmx, d( i ) )
+           end do
+           ! copy d and e into work (in the z format) and scale (squaring the
+           ! input data makes scaling by a power of the radix pointless).
+           eps = stdlib_dlamch( 'PRECISION' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           scale = sqrt( eps / safmin )
+           call stdlib_dcopy( n, d, 1, work( 1 ), 2 )
+           call stdlib_dcopy( n-1, e, 1, work( 2 ), 2 )
+           call stdlib_dlascl( 'G', 0, 0, sigmx, scale, 2*n-1, 1, work, 2*n-1,iinfo )
+           ! compute the q's and e's.
+           do i = 1, 2*n - 1
+              work( i ) = work( i )**2
+           end do
+           work( 2*n ) = zero
+           call stdlib_dlasq2( n, work, info )
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = sqrt( work( i ) )
+              end do
+              call stdlib_dlascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+           else if( info==2 ) then
+           ! maximum number of iterations exceeded.  move data from work
+           ! into d and e so the calling subroutine can try to finish
+              do i = 1, n
+                 d( i ) = sqrt( work( 2*i-1 ) )
+                 e( i ) = sqrt( work( 2*i ) )
+              end do
+              call stdlib_dlascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+              call stdlib_dlascl( 'G', 0, 0, scale, sigmx, n, 1, e, n, iinfo )
+           end if
+           return
+     end subroutine stdlib_dlasq1
+
+     !> DLASQ2: computes all the eigenvalues of the symmetric positive
+     !> definite tridiagonal matrix associated with the qd array Z to high
+     !> relative accuracy are computed to high relative accuracy, in the
+     !> absence of denormalization, underflow and overflow.
+     !> To see the relation of Z to the tridiagonal matrix, let L be a
+     !> unit lower bidiagonal matrix with subdiagonals Z(2,4,6,,..) and
+     !> let U be an upper bidiagonal matrix with 1's above and diagonal
+     !> Z(1,3,5,,..). The tridiagonal is L*U or, if you prefer, the
+     !> symmetric tridiagonal to which it is similar.
+     !> Note : DLASQ2 defines a logical variable, IEEE, which is true
+     !> on machines which follow ieee-754 floating-point standard in their
+     !> handling of infinities and NaNs, and false otherwise. This variable
+     !> is passed to DLASQ3.
+
+     pure subroutine stdlib_dlasq2( n, z, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: cbias = 1.50_dp
+           real(dp), parameter :: hundrd = 100.0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ieee
+           integer(ilp) :: i0, i1, i4, iinfo, ipn4, iter, iwhila, iwhilb, k, kmin, n0, n1, nbig, &
+                     ndiv, nfail, pp, splt, ttype
+           real(dp) :: d, dee, deemin, desig, dmin, dmin1, dmin2, dn, dn1, dn2, e, emax, emin, &
+           eps, g, oldemn, qmax, qmin, s, safmin, sigma, t, tau, temp, tol, tol2, trace, zmax, &
+                     tempe, tempq
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments.
+           ! (in case stdlib_dlasq2 is not called by stdlib_dlasq1)
+           info = 0
+           eps = stdlib_dlamch( 'PRECISION' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLASQ2', 1 )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              ! 1-by-1 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+              end if
+              return
+           else if( n==2 ) then
+              ! 2-by-2 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( 2 )<zero ) then
+                 info = -202
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( 3 )<zero ) then
+                info = -203
+                call stdlib_xerbla( 'DLASQ2', 2 )
+                return
+              else if( z( 3 )>z( 1 ) ) then
+                 d = z( 3 )
+                 z( 3 ) = z( 1 )
+                 z( 1 ) = d
+              end if
+              z( 5 ) = z( 1 ) + z( 2 ) + z( 3 )
+              if( z( 2 )>z( 3 )*tol2 ) then
+                 t = half*( ( z( 1 )-z( 3 ) )+z( 2 ) )
+                 s = z( 3 )*( z( 2 ) / t )
+                 if( s<=t ) then
+                    s = z( 3 )*( z( 2 ) / ( t*( one+sqrt( one+s / t ) ) ) )
+                 else
+                    s = z( 3 )*( z( 2 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+                 end if
+                 t = z( 1 ) + ( s+z( 2 ) )
+                 z( 3 ) = z( 3 )*( z( 1 ) / t )
+                 z( 1 ) = t
+              end if
+              z( 2 ) = z( 3 )
+              z( 6 ) = z( 2 ) + z( 1 )
+              return
+           end if
+           ! check for negative data and compute sums of q's and e's.
+           z( 2*n ) = zero
+           emin = z( 2 )
+           qmax = zero
+           zmax = zero
+           d = zero
+           e = zero
+           do k = 1, 2*( n-1 ), 2
+              if( z( k )<zero ) then
+                 info = -( 200+k )
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( k+1 )<zero ) then
+                 info = -( 200+k+1 )
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              end if
+              d = d + z( k )
+              e = e + z( k+1 )
+              qmax = max( qmax, z( k ) )
+              emin = min( emin, z( k+1 ) )
+              zmax = max( qmax, zmax, z( k+1 ) )
+           end do
+           if( z( 2*n-1 )<zero ) then
+              info = -( 200+2*n-1 )
+              call stdlib_xerbla( 'DLASQ2', 2 )
+              return
+           end if
+           d = d + z( 2*n-1 )
+           qmax = max( qmax, z( 2*n-1 ) )
+           zmax = max( qmax, zmax )
+           ! check for diagonality.
+           if( e==zero ) then
+              do k = 2, n
+                 z( k ) = z( 2*k-1 )
+              end do
+              call stdlib_dlasrt( 'D', n, z, iinfo )
+              z( 2*n-1 ) = d
+              return
+           end if
+           trace = d + e
+           ! check for zero data.
+           if( trace==zero ) then
+              z( 2*n-1 ) = zero
+              return
+           end if
+           ! check whether the machine is ieee conformable.
+           ieee = ( stdlib_ilaenv( 10, 'DLASQ2', 'N', 1, 2, 3, 4 )==1 )
+           ! rearrange data for locality: z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...).
+           do k = 2*n, 2, -2
+              z( 2*k ) = zero
+              z( 2*k-1 ) = z( k )
+              z( 2*k-2 ) = zero
+              z( 2*k-3 ) = z( k-1 )
+           end do
+           i0 = 1
+           n0 = n
+           ! reverse the qd-array, if warranted.
+           if( cbias*z( 4*i0-3 )<z( 4*n0-3 ) ) then
+              ipn4 = 4*( i0+n0 )
+              do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                 temp = z( i4-3 )
+                 z( i4-3 ) = z( ipn4-i4-3 )
+                 z( ipn4-i4-3 ) = temp
+                 temp = z( i4-1 )
+                 z( i4-1 ) = z( ipn4-i4-5 )
+                 z( ipn4-i4-5 ) = temp
+              end do
+           end if
+           ! initial split checking via dqd and li's test.
+           pp = 0
+           loop_80: do k = 1, 2
+              d = z( 4*n0+pp-3 )
+              do i4 = 4*( n0-1 ) + pp, 4*i0 + pp, -4
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    d = z( i4-3 )
+                 else
+                    d = z( i4-3 )*( d / ( d+z( i4-1 ) ) )
+                 end if
+              end do
+              ! dqd maps z to zz plus li's test.
+              emin = z( 4*i0+pp+1 )
+              d = z( 4*i0+pp-3 )
+              do i4 = 4*i0 + pp, 4*( n0-1 ) + pp, 4
+                 z( i4-2*pp-2 ) = d + z( i4-1 )
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    z( i4-2*pp-2 ) = d
+                    z( i4-2*pp ) = zero
+                    d = z( i4+1 )
+                 else if( safmin*z( i4+1 )<z( i4-2*pp-2 ) .and.safmin*z( i4-2*pp-2 )<z( i4+1 ) ) &
+                           then
+                    temp = z( i4+1 ) / z( i4-2*pp-2 )
+                    z( i4-2*pp ) = z( i4-1 )*temp
+                    d = d*temp
+                 else
+                    z( i4-2*pp ) = z( i4+1 )*( z( i4-1 ) / z( i4-2*pp-2 ) )
+                    d = z( i4+1 )*( d / z( i4-2*pp-2 ) )
+                 end if
+                 emin = min( emin, z( i4-2*pp ) )
+              end do
+              z( 4*n0-pp-2 ) = d
+              ! now find qmax.
+              qmax = z( 4*i0-pp-2 )
+              do i4 = 4*i0 - pp + 2, 4*n0 - pp - 2, 4
+                 qmax = max( qmax, z( i4 ) )
+              end do
+              ! prepare for the next iteration on k.
+              pp = 1 - pp
+           end do loop_80
+           ! initialise variables to pass to stdlib_dlasq3.
+           ttype = 0
+           dmin1 = zero
+           dmin2 = zero
+           dn    = zero
+           dn1   = zero
+           dn2   = zero
+           g     = zero
+           tau   = zero
+           iter = 2
+           nfail = 0
+           ndiv = 2*( n0-i0 )
+           loop_160: do iwhila = 1, n + 1
+              if( n0<1 )go to 170
+              ! while array unfinished do
+              ! e(n0) holds the value of sigma when submatrix in i0:n0
+              ! splits from the rest of the array, but is negated.
+              desig = zero
+              if( n0==n ) then
+                 sigma = zero
+              else
+                 sigma = -z( 4*n0-1 )
+              end if
+              if( sigma<zero ) then
+                 info = 1
+                 return
+              end if
+              ! find last unreduced submatrix's top index i0, find qmax and
+              ! emin. find gershgorin-type bound if q's much greater than e's.
+              emax = zero
+              if( n0>i0 ) then
+                 emin = abs( z( 4*n0-5 ) )
+              else
+                 emin = zero
+              end if
+              qmin = z( 4*n0-3 )
+              qmax = qmin
+              do i4 = 4*n0, 8, -4
+                 if( z( i4-5 )<=zero )go to 100
+                 if( qmin>=four*emax ) then
+                    qmin = min( qmin, z( i4-3 ) )
+                    emax = max( emax, z( i4-5 ) )
+                 end if
+                 qmax = max( qmax, z( i4-7 )+z( i4-5 ) )
+                 emin = min( emin, z( i4-5 ) )
+              end do
+              i4 = 4
+              100 continue
+              i0 = i4 / 4
+              pp = 0
+              if( n0-i0>1 ) then
+                 dee = z( 4*i0-3 )
+                 deemin = dee
+                 kmin = i0
+                 do i4 = 4*i0+1, 4*n0-3, 4
+                    dee = z( i4 )*( dee /( dee+z( i4-2 ) ) )
+                    if( dee<=deemin ) then
+                       deemin = dee
+                       kmin = ( i4+3 )/4
+                    end if
+                 end do
+                 if( (kmin-i0)*2<n0-kmin .and.deemin<=half*z(4*n0-3) ) then
+                    ipn4 = 4*( i0+n0 )
+                    pp = 2
+                    do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                       temp = z( i4-3 )
+                       z( i4-3 ) = z( ipn4-i4-3 )
+                       z( ipn4-i4-3 ) = temp
+                       temp = z( i4-2 )
+                       z( i4-2 ) = z( ipn4-i4-2 )
+                       z( ipn4-i4-2 ) = temp
+                       temp = z( i4-1 )
+                       z( i4-1 ) = z( ipn4-i4-5 )
+                       z( ipn4-i4-5 ) = temp
+                       temp = z( i4 )
+                       z( i4 ) = z( ipn4-i4-4 )
+                       z( ipn4-i4-4 ) = temp
+                    end do
+                 end if
+              end if
+              ! put -(initial shift) into dmin.
+              dmin = -max( zero, qmin-two*sqrt( qmin )*sqrt( emax ) )
+              ! now i0:n0 is unreduced.
+              ! pp = 0 for ping, pp = 1 for pong.
+              ! pp = 2 indicates that flipping was applied to the z array and
+                     ! and that the tests for deflation upon entry in stdlib_dlasq3
+                     ! should not be performed.
+              nbig = 100*( n0-i0+1 )
+              loop_140: do iwhilb = 1, nbig
+                 if( i0>n0 )go to 150
+                 ! while submatrix unfinished take a good dqds step.
+                 call stdlib_dlasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+                           ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+                 pp = 1 - pp
+                 ! when emin is very small check for splits.
+                 if( pp==0 .and. n0-i0>=3 ) then
+                    if( z( 4*n0 )<=tol2*qmax .or.z( 4*n0-1 )<=tol2*sigma ) then
+                       splt = i0 - 1
+                       qmax = z( 4*i0-3 )
+                       emin = z( 4*i0-1 )
+                       oldemn = z( 4*i0 )
+                       do i4 = 4*i0, 4*( n0-3 ), 4
+                          if( z( i4 )<=tol2*z( i4-3 ) .or.z( i4-1 )<=tol2*sigma ) then
+                             z( i4-1 ) = -sigma
+                             splt = i4 / 4
+                             qmax = zero
+                             emin = z( i4+3 )
+                             oldemn = z( i4+4 )
+                          else
+                             qmax = max( qmax, z( i4+1 ) )
+                             emin = min( emin, z( i4-1 ) )
+                             oldemn = min( oldemn, z( i4 ) )
+                          end if
+                       end do
+                       z( 4*n0-1 ) = emin
+                       z( 4*n0 ) = oldemn
+                       i0 = splt + 1
+                    end if
+                 end if
+              end do loop_140
+              info = 2
+              ! maximum number of iterations exceeded, restore the shift
+              ! sigma and place the new d's and e's in a qd array.
+              ! this might need to be done for several blocks
+              i1 = i0
+              n1 = n0
+              145 continue
+              tempq = z( 4*i0-3 )
+              z( 4*i0-3 ) = z( 4*i0-3 ) + sigma
+              do k = i0+1, n0
+                 tempe = z( 4*k-5 )
+                 z( 4*k-5 ) = z( 4*k-5 ) * (tempq / z( 4*k-7 ))
+                 tempq = z( 4*k-3 )
+                 z( 4*k-3 ) = z( 4*k-3 ) + sigma + tempe - z( 4*k-5 )
+              end do
+              ! prepare to do this on the previous block if there is one
+              if( i1>1 ) then
+                 n1 = i1-1
+                 do while( ( i1>=2 ) .and. ( z(4*i1-5)>=zero ) )
+                    i1 = i1 - 1
+                 end do
+                 sigma = -z(4*n1-1)
+                 go to 145
+              end if
+              do k = 1, n
+                 z( 2*k-1 ) = z( 4*k-3 )
+              ! only the block 1..n0 is unfinished.  the rest of the e's
+              ! must be essentially zero, although sometimes other data
+              ! has been stored in them.
+                 if( k<n0 ) then
+                    z( 2*k ) = z( 4*k-1 )
+                 else
+                    z( 2*k ) = 0
+                 end if
+              end do
+              return
+              ! end iwhilb
+              150 continue
+           end do loop_160
+           info = 3
+           return
+           ! end iwhila
+           170 continue
+           ! move q's to the front.
+           do k = 2, n
+              z( k ) = z( 4*k-3 )
+           end do
+           ! sort and compute sum of eigenvalues.
+           call stdlib_dlasrt( 'D', n, z, iinfo )
+           e = zero
+           do k = n, 1, -1
+              e = e + z( k )
+           end do
+           ! store trace, sum(eigenvalues) and information on performance.
+           z( 2*n+1 ) = trace
+           z( 2*n+2 ) = e
+           z( 2*n+3 ) = real( iter,KIND=dp)
+           z( 2*n+4 ) = real( ndiv,KIND=dp) / real( n**2,KIND=dp)
+           z( 2*n+5 ) = hundrd*nfail / real( iter,KIND=dp)
+           return
+     end subroutine stdlib_dlasq2
+
+     !> DLATRF_AA factorizes a panel of a real symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_dlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), h(ldh,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           real(dp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_dsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_dgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_dcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_daxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_daxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_idamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_dswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_dswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_dswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_dswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_dcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=zero ) then
+                       alpha = one / a( k, j+1 )
+                       call stdlib_dcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_dscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_dlaset( 'FULL', 1, m-j-1, zero, zero,a( k, j+2 ), lda)
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_dsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_dgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( j, 1 ), lda,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_dcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_daxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_daxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_idamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_dswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_dswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_dswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_dswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_dcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=zero ) then
+                       alpha = one / a( j+1, k )
+                       call stdlib_dcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_dscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_dlaset( 'FULL', m-j-1, 1, zero, zero,a( j+2, k ), lda )
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_dlasyf_aa
+
+     !> DPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using DPTTRF, and then calling DBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band symmetric positive definite matrix
+     !> can also be found if DSYTRD, DSPTRD, or DSBTRD has been used to
+     !> reduce this matrix to tridiagonal form. (The reduction to tridiagonal
+     !> form, however, may preclude the possibility of obtaining high
+     !> relative accuracy in the small eigenvalues of the original matrix, if
+     !> these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_dpteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           real(dp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = one
+              return
+           end if
+           if( icompz==2 )call stdlib_dlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! call stdlib_dpttrf to factor the matrix.
+           call stdlib_dpttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_dbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_dbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_dpteqr
+
+     !> DSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> DSTEGR is a compatibility wrapper around the improved DSTEMR routine.
+     !> See DSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : DSTEGR and DSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_dstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*)
+           real(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_dstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_dstegr
+
+     !> DSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.DSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+
+     pure subroutine stdlib_dstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*)
+           real(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: minrgp = 1.0e-3_dp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(dp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_dstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_dlarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_dlarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_dlarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_dlarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_dlae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_dlaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+           ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_dlarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_dlanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_dscal( n, scale, d, 1 )
+                 call stdlib_dscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_dlarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_dlarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_dcopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_dlarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_dlarre computes the eigenvalues to less than full precision.
+                 ! stdlib_dlarrv will refine the eigenvalue approximations, and we can
+                 ! need less accurate initial bisection in stdlib_dlarre.
+                 ! note: these settings do only affect the subset case and stdlib_dlarre
+                 rtol1 = sqrt(eps)
+                 rtol2 = max( sqrt(eps)*5.0e-3_dp, four * eps )
+              endif
+              call stdlib_dlarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_dlarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_dlarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_dlarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_dlarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_dlarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_dlarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_dscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_dlasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dstemr
+
+     !> DSTEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+     !> Whenever possible, DSTEVR calls DSTEMR to compute the
+     !> eigenspectrum using Relatively Robust Representations.  DSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows. For the i-th
+     !> unreduced block of T,
+     !> (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T
+     !> is a relatively robust representation,
+     !> (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high
+     !> relative accuracy by the dqds algorithm,
+     !> (c) If there is a cluster of close eigenvalues, "choose" sigma_i
+     !> close to the cluster, and go to step (a),
+     !> (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T,
+     !> compute the corresponding eigenvector by forming a
+     !> rank-revealing twisted factorization.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon,
+     !> Computer Science Division Technical Report No. UCB//CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : DSTEVR calls DSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> DSTEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of DSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     pure subroutine stdlib_dstevr( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, lquery, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, imax, indibl, indifl, indisp, indiwo, iscale, itmp1, j, jj, &
+                     liwmin, lwmin, nsplit
+           real(dp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'DSTEVR', 'N', 1, 2, 3, 4 )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 20*n )
+           liwmin = max( 1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -14
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -17
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           tnrm = stdlib_dlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_dscal( n, sigma, d, 1 )
+              call stdlib_dscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: these indices are used only
+           ! if stdlib_dsterf or stdlib_dstemr fail.
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_dstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_dstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_dstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indisp + n
+           ! if all eigenvalues are desired, then
+           ! call stdlib_dsterf or stdlib_dstemr.  if this fails for some eigenvalue, then
+           ! try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ieeeok==1 ) then
+              call stdlib_dcopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n, d, 1, w, 1 )
+                 call stdlib_dsterf( n, w, work, info )
+              else
+                 call stdlib_dcopy( n, d, 1, work( n+1 ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_dstemr( jobz, 'A', n, work( n+1 ), work, vl, vu, il,iu, m, w, z, ldz,&
+                            n, isuppz, tryrac,work( 2*n+1 ), lwork-2*n, iwork, liwork, info )
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 10
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ), work,iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_dstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work, &
+                        iwork( indiwo ), iwork( indifl ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           10 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( i )
+                    w( i ) = w( j )
+                    iwork( i ) = iwork( j )
+                    w( j ) = tmp1
+                    iwork( j ) = itmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+            ! causes problems with tests 19
+            ! if (wantz .and. indeig ) z( 1,1) = z(1,1) / 1.002_dp + .002
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dstevr
+
+     !> DSYEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> DSYEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to DSYTRD.  Then, whenever possible, DSYEVR calls DSTEMR to compute
+     !> the eigenspectrum using Relatively Robust Representations.  DSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see DSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : DSYEVR calls DSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> DSYEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of DSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_dsyevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indd, inddd, inde, indee, indibl, indifl, &
+           indisp, indiwo, indtau, indwk, indwkn, iscale, j, jj, liwmin, llwork, llwrkn, lwkopt, &
+                     lwmin, nb, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'DSYEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 26*n )
+           liwmin = max( 1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'DORMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              iwork( 1 ) = liwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 7
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_dlansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_dscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_dscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_dsterf or stdlib_dstemr fail.
+           ! work(indtau:indtau+n-1) stores the scalar factors of the
+           ! elementary reflectors used in stdlib_dsytrd.
+           indtau = 1
+           ! work(indd:indd+n-1) stores the tridiagonal's diagonal entries.
+           indd = indtau + n
+           ! work(inde:inde+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_dsytrd.
+           inde = indd + n
+           ! work(inddd:inddd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_dstemr (the stdlib_dsterf path copies the diagonal to w).
+           inddd = inde + n
+           ! work(indee:indee+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_dsterf and stdlib_dstemr.
+           indee = inddd + n
+           ! indwk is the starting offset of the left-over workspace, and
+           ! llwork is the remaining workspace size.
+           indwk = indee + n
+           llwork = lwork - indwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_dstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_dstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_dstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_dsytrd to reduce symmetric matrix to tridiagonal form.
+           call stdlib_dsytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_dsterf or stdlib_dstemr and stdlib_dormtr.
+           if( ( alleig .or. ( indeig .and. il==1 .and. iu==n ) ) .and.ieeeok==1 ) then
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n, work( indd ), 1, w, 1 )
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_dcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_dcopy( n, work( indd ), 1, work( inddd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_dstemr( jobz, 'A', n, work( inddd ), work( indee ),vl, vu, il, iu, m,&
+                            w, z, ldz, n, isuppz,tryrac, work( indwk ), lwork, iwork, liwork,info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = inde
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_dormtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 ! everything worked.  skip stdlib_dstebz/stdlib_dstein.  iwork(:) are
+                 ! undefined.
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_dstein.
+           ! also call stdlib_dstebz and stdlib_dstein if stdlib_dstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwk ),iwork( indiwo ), info )
+                     
+           if( wantz ) then
+              call stdlib_dstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_dstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_dormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+        ! jump here if stdlib_dstemr/stdlib_dstein succeeded.
+        30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.  note: we do not sort the ifail portion of iwork.
+           ! it may not be initialized (if stdlib_dstemr/stdlib_dstein succeeded), and we do
+           ! not return this detailed information to the user.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    w( i ) = w( j )
+                    w( j ) = tmp1
+                    call stdlib_dswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_dsyevr
+
+     !> DSYSV computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_dsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_dsytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_dsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_dsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_dsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsysv_aa
+
+     !> DSYTRF_AA: computes the factorization of a real symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_dsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           real(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'DSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_dcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_dlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_dlasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_dswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = one
+                    call stdlib_dcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_dscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_dgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_dgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j1-k2, j3 ), 1,one, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_dgemm
+                       call stdlib_dgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-one, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,one, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_dcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_dcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_dlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_dlasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_dswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = one
+                    call stdlib_dcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_dscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_dgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_dgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j3, j1-k2 ), lda,one, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_dgemm
+                       call stdlib_dgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-one, work(&
+                                  j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,one, a( j3, j2 ), lda )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_dcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_dsytrf_aa
+
+
+
+end module stdlib_linalg_lapack_d
diff --git a/src/stdlib_linalg_lapack_q.fypp b/src/stdlib_linalg_lapack_q.fypp
new file mode 100644
index 000000000..66a9ecd67
--- /dev/null
+++ b/src/stdlib_linalg_lapack_q.fypp
@@ -0,0 +1,85277 @@
+#:include "common.fypp" 
+#:if WITH_QP 
+module stdlib_linalg_lapack_q
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     use stdlib_linalg_lapack_c
+     use stdlib_linalg_lapack_d
+     use stdlib_linalg_lapack_z
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_qbbcsd
+     public :: stdlib_qbdsdc
+     public :: stdlib_qbdsqr
+     public :: stdlib_qdisna
+     public :: stdlib_qgbbrd
+     public :: stdlib_qgbcon
+     public :: stdlib_qgbequ
+     public :: stdlib_qgbequb
+     public :: stdlib_qgbrfs
+     public :: stdlib_qgbsv
+     public :: stdlib_qgbsvx
+     public :: stdlib_qgbtf2
+     public :: stdlib_qgbtrf
+     public :: stdlib_qgbtrs
+     public :: stdlib_qgebak
+     public :: stdlib_qgebal
+     public :: stdlib_qgebd2
+     public :: stdlib_qgebrd
+     public :: stdlib_qgecon
+     public :: stdlib_qgeequ
+     public :: stdlib_qgeequb
+     public :: stdlib_qgees
+     public :: stdlib_qgeesx
+     public :: stdlib_qgeev
+     public :: stdlib_qgeevx
+     public :: stdlib_qgehd2
+     public :: stdlib_qgehrd
+     public :: stdlib_qgejsv
+     public :: stdlib_qgelq
+     public :: stdlib_qgelq2
+     public :: stdlib_qgelqf
+     public :: stdlib_qgelqt
+     public :: stdlib_qgelqt3
+     public :: stdlib_qgels
+     public :: stdlib_qgelsd
+     public :: stdlib_qgelss
+     public :: stdlib_qgelsy
+     public :: stdlib_qgemlq
+     public :: stdlib_qgemlqt
+     public :: stdlib_qgemqr
+     public :: stdlib_qgemqrt
+     public :: stdlib_qgeql2
+     public :: stdlib_qgeqlf
+     public :: stdlib_qgeqp3
+     public :: stdlib_qgeqr
+     public :: stdlib_qgeqr2
+     public :: stdlib_qgeqr2p
+     public :: stdlib_qgeqrf
+     public :: stdlib_qgeqrfp
+     public :: stdlib_qgeqrt
+     public :: stdlib_qgeqrt2
+     public :: stdlib_qgeqrt3
+     public :: stdlib_qgerfs
+     public :: stdlib_qgerq2
+     public :: stdlib_qgerqf
+     public :: stdlib_qgesc2
+     public :: stdlib_qgesdd
+     public :: stdlib_qgesv
+     public :: stdlib_qgesvd
+     public :: stdlib_qgesvdq
+     public :: stdlib_qgesvj
+     public :: stdlib_qgesvx
+     public :: stdlib_qgetc2
+     public :: stdlib_qgetf2
+     public :: stdlib_qgetrf
+     public :: stdlib_qgetrf2
+     public :: stdlib_qgetri
+     public :: stdlib_qgetrs
+     public :: stdlib_qgetsls
+     public :: stdlib_qgetsqrhrt
+     public :: stdlib_qggbak
+     public :: stdlib_qggbal
+     public :: stdlib_qgges
+     public :: stdlib_qgges3
+     public :: stdlib_qggesx
+     public :: stdlib_qggev
+     public :: stdlib_qggev3
+     public :: stdlib_qggevx
+     public :: stdlib_qggglm
+     public :: stdlib_qgghd3
+     public :: stdlib_qgghrd
+     public :: stdlib_qgglse
+     public :: stdlib_qggqrf
+     public :: stdlib_qggrqf
+     public :: stdlib_qgsvj0
+     public :: stdlib_qgsvj1
+     public :: stdlib_qgtcon
+     public :: stdlib_qgtrfs
+     public :: stdlib_qgtsv
+     public :: stdlib_qgtsvx
+     public :: stdlib_qgttrf
+     public :: stdlib_qgttrs
+     public :: stdlib_qgtts2
+     public :: stdlib_qhgeqz
+     public :: stdlib_qhsein
+     public :: stdlib_qhseqr
+     public :: stdlib_qisnan
+     public :: stdlib_qla_gbamv
+     public :: stdlib_qla_gbrcond
+     public :: stdlib_qla_gbrpvgrw
+     public :: stdlib_qla_geamv
+     public :: stdlib_qla_gercond
+     public :: stdlib_qla_gerpvgrw
+     public :: stdlib_qla_lin_berr
+     public :: stdlib_qla_porcond
+     public :: stdlib_qla_porpvgrw
+     public :: stdlib_qla_syamv
+     public :: stdlib_qla_syrcond
+     public :: stdlib_qla_syrpvgrw
+     public :: stdlib_qla_wwaddw
+     public :: stdlib_qlabad
+     public :: stdlib_qlabrd
+     public :: stdlib_qlacn2
+     public :: stdlib_qlacon
+     public :: stdlib_qlacpy
+     public :: stdlib_qladiv
+     public :: stdlib_qladiv1
+     public :: stdlib_qladiv2
+     public :: stdlib_qlae2
+     public :: stdlib_qlaebz
+     public :: stdlib_qlaed0
+     public :: stdlib_qlaed1
+     public :: stdlib_qlaed2
+     public :: stdlib_qlaed3
+     public :: stdlib_qlaed4
+     public :: stdlib_qlaed5
+     public :: stdlib_qlaed6
+     public :: stdlib_qlaed7
+     public :: stdlib_qlaed8
+     public :: stdlib_qlaed9
+     public :: stdlib_qlaeda
+     public :: stdlib_qlaein
+     public :: stdlib_qlaev2
+     public :: stdlib_qlaexc
+     public :: stdlib_qlag2
+     public :: stdlib_qlag2s
+     public :: stdlib_qlags2
+     public :: stdlib_qlagtf
+     public :: stdlib_qlagtm
+     public :: stdlib_qlagts
+     public :: stdlib_qlagv2
+     public :: stdlib_qlahqr
+     public :: stdlib_qlahr2
+     public :: stdlib_qlaic1
+     public :: stdlib_qlaisnan
+     public :: stdlib_qlaln2
+     public :: stdlib_qlals0
+     public :: stdlib_qlalsa
+     public :: stdlib_qlalsd
+     public :: stdlib_qlamch
+     public :: stdlib_qlamc3
+     public :: stdlib_qlamrg
+     public :: stdlib_qlamswlq
+     public :: stdlib_qlamtsqr
+     public :: stdlib_qlaneg
+     public :: stdlib_qlangb
+     public :: stdlib_qlange
+     public :: stdlib_qlangt
+     public :: stdlib_qlanhs
+     public :: stdlib_qlansb
+     public :: stdlib_qlansf
+     public :: stdlib_qlansp
+     public :: stdlib_qlanst
+     public :: stdlib_qlansy
+     public :: stdlib_qlantb
+     public :: stdlib_qlantp
+     public :: stdlib_qlantr
+     public :: stdlib_qlanv2
+     public :: stdlib_qlaorhr_col_getrfnp
+     public :: stdlib_qlaorhr_col_getrfnp2
+     public :: stdlib_qlapll
+     public :: stdlib_qlapmr
+     public :: stdlib_qlapmt
+     public :: stdlib_qlapy2
+     public :: stdlib_qlapy3
+     public :: stdlib_qlaqgb
+     public :: stdlib_qlaqge
+     public :: stdlib_qlaqp2
+     public :: stdlib_qlaqps
+     public :: stdlib_qlaqr0
+     public :: stdlib_qlaqr1
+     public :: stdlib_qlaqr2
+     public :: stdlib_qlaqr3
+     public :: stdlib_qlaqr4
+     public :: stdlib_qlaqr5
+     public :: stdlib_qlaqsb
+     public :: stdlib_qlaqsp
+     public :: stdlib_qlaqsy
+     public :: stdlib_qlaqtr
+     public :: stdlib_qlaqz0
+     public :: stdlib_qlaqz1
+     public :: stdlib_qlaqz2
+     public :: stdlib_qlaqz3
+     public :: stdlib_qlaqz4
+     public :: stdlib_qlar1v
+     public :: stdlib_qlar2v
+     public :: stdlib_qlarf
+     public :: stdlib_qlarfb
+     public :: stdlib_qlarfb_gett
+     public :: stdlib_qlarfg
+     public :: stdlib_qlarfgp
+     public :: stdlib_qlarft
+     public :: stdlib_qlarfx
+     public :: stdlib_qlarfy
+     public :: stdlib_qlargv
+     public :: stdlib_qlarnv
+     public :: stdlib_qlarra
+     public :: stdlib_qlarrb
+     public :: stdlib_qlarrc
+     public :: stdlib_qlarrd
+     public :: stdlib_qlarre
+     public :: stdlib_qlarrf
+     public :: stdlib_qlarrj
+     public :: stdlib_qlarrk
+     public :: stdlib_qlarrr
+     public :: stdlib_qlarrv
+     public :: stdlib_qlartg
+     public :: stdlib_qlartgp
+     public :: stdlib_qlartgs
+     public :: stdlib_qlartv
+     public :: stdlib_qlaruv
+     public :: stdlib_qlarz
+     public :: stdlib_qlarzb
+     public :: stdlib_qlarzt
+     public :: stdlib_qlas2
+     public :: stdlib_qlascl
+     public :: stdlib_qlasd0
+     public :: stdlib_qlasd1
+     public :: stdlib_qlasd2
+     public :: stdlib_qlasd3
+     public :: stdlib_qlasd4
+     public :: stdlib_qlasd5
+     public :: stdlib_qlasd6
+     public :: stdlib_qlasd7
+     public :: stdlib_qlasd8
+     public :: stdlib_qlasda
+     public :: stdlib_qlasdq
+     public :: stdlib_qlasdt
+     public :: stdlib_qlaset
+     public :: stdlib_qlasq1
+     public :: stdlib_qlasq2
+     public :: stdlib_qlasq3
+     public :: stdlib_qlasq4
+     public :: stdlib_qlasq5
+     public :: stdlib_qlasq6
+     public :: stdlib_qlasr
+     public :: stdlib_qlasrt
+     public :: stdlib_qlassq
+     public :: stdlib_qlasv2
+     public :: stdlib_qlaswlq
+     public :: stdlib_qlaswp
+     public :: stdlib_qlasy2
+     public :: stdlib_qlasyf
+     public :: stdlib_qlasyf_aa
+     public :: stdlib_qlasyf_rk
+     public :: stdlib_qlasyf_rook
+     public :: stdlib_qlat2s
+     public :: stdlib_qlatbs
+     public :: stdlib_qlatdf
+     public :: stdlib_qlatps
+     public :: stdlib_qlatrd
+     public :: stdlib_qlatrs
+     public :: stdlib_qlatrz
+     public :: stdlib_qlatsqr
+     public :: stdlib_qlauu2
+     public :: stdlib_qlauum
+     public :: stdlib_qopgtr
+     public :: stdlib_qopmtr
+     public :: stdlib_qorbdb
+     public :: stdlib_qorbdb1
+     public :: stdlib_qorbdb2
+     public :: stdlib_qorbdb3
+     public :: stdlib_qorbdb4
+     public :: stdlib_qorbdb5
+     public :: stdlib_qorbdb6
+     public :: stdlib_qorcsd
+     public :: stdlib_qorcsd2by1
+     public :: stdlib_qorg2l
+     public :: stdlib_qorg2r
+     public :: stdlib_qorgbr
+     public :: stdlib_qorghr
+     public :: stdlib_qorgl2
+     public :: stdlib_qorglq
+     public :: stdlib_qorgql
+     public :: stdlib_qorgqr
+     public :: stdlib_qorgr2
+     public :: stdlib_qorgrq
+     public :: stdlib_qorgtr
+     public :: stdlib_qorgtsqr
+     public :: stdlib_qorgtsqr_row
+     public :: stdlib_qorhr_col
+     public :: stdlib_qorm22
+     public :: stdlib_qorm2l
+     public :: stdlib_qorm2r
+     public :: stdlib_qormbr
+     public :: stdlib_qormhr
+     public :: stdlib_qorml2
+     public :: stdlib_qormlq
+     public :: stdlib_qormql
+     public :: stdlib_qormqr
+     public :: stdlib_qormr2
+     public :: stdlib_qormr3
+     public :: stdlib_qormrq
+     public :: stdlib_qormrz
+     public :: stdlib_qormtr
+     public :: stdlib_qpbcon
+     public :: stdlib_qpbequ
+     public :: stdlib_qpbrfs
+     public :: stdlib_qpbstf
+     public :: stdlib_qpbsv
+     public :: stdlib_qpbsvx
+     public :: stdlib_qpbtf2
+     public :: stdlib_qpbtrf
+     public :: stdlib_qpbtrs
+     public :: stdlib_qpftrf
+     public :: stdlib_qpftri
+     public :: stdlib_qpftrs
+     public :: stdlib_qpocon
+     public :: stdlib_qpoequ
+     public :: stdlib_qpoequb
+     public :: stdlib_qporfs
+     public :: stdlib_qposv
+     public :: stdlib_qposvx
+     public :: stdlib_qpotf2
+     public :: stdlib_qpotrf
+     public :: stdlib_qpotrf2
+     public :: stdlib_qpotri
+     public :: stdlib_qpotrs
+     public :: stdlib_qppcon
+     public :: stdlib_qppequ
+     public :: stdlib_qpprfs
+     public :: stdlib_qppsv
+     public :: stdlib_qppsvx
+     public :: stdlib_qpptrf
+     public :: stdlib_qpptri
+     public :: stdlib_qpptrs
+     public :: stdlib_qpstf2
+     public :: stdlib_qpstrf
+     public :: stdlib_qptcon
+     public :: stdlib_qpteqr
+     public :: stdlib_qptrfs
+     public :: stdlib_qptsv
+     public :: stdlib_qptsvx
+     public :: stdlib_qpttrf
+     public :: stdlib_qpttrs
+     public :: stdlib_qptts2
+     public :: stdlib_qrscl
+     public :: stdlib_qsb2st_kernels
+     public :: stdlib_qsbev
+     public :: stdlib_qsbevd
+     public :: stdlib_qsbevx
+     public :: stdlib_qsbgst
+     public :: stdlib_qsbgv
+     public :: stdlib_qsbgvd
+     public :: stdlib_qsbgvx
+     public :: stdlib_qsbtrd
+     public :: stdlib_qsfrk
+     public :: stdlib_qsgesv
+     public :: stdlib_qspcon
+     public :: stdlib_qspev
+     public :: stdlib_qspevd
+     public :: stdlib_qspevx
+     public :: stdlib_qspgst
+     public :: stdlib_qspgv
+     public :: stdlib_qspgvd
+     public :: stdlib_qspgvx
+     public :: stdlib_qsposv
+     public :: stdlib_qsprfs
+     public :: stdlib_qspsv
+     public :: stdlib_qspsvx
+     public :: stdlib_qsptrd
+     public :: stdlib_qsptrf
+     public :: stdlib_qsptri
+     public :: stdlib_qsptrs
+     public :: stdlib_qstebz
+     public :: stdlib_qstedc
+     public :: stdlib_qstegr
+     public :: stdlib_qstein
+     public :: stdlib_qstemr
+     public :: stdlib_qsteqr
+     public :: stdlib_qsterf
+     public :: stdlib_qstev
+     public :: stdlib_qstevd
+     public :: stdlib_qstevr
+     public :: stdlib_qstevx
+     public :: stdlib_qsycon
+     public :: stdlib_qsycon_rook
+     public :: stdlib_qsyconv
+     public :: stdlib_qsyconvf
+     public :: stdlib_qsyconvf_rook
+     public :: stdlib_qsyequb
+     public :: stdlib_qsyev
+     public :: stdlib_qsyevd
+     public :: stdlib_qsyevr
+     public :: stdlib_qsyevx
+     public :: stdlib_qsygs2
+     public :: stdlib_qsygst
+     public :: stdlib_qsygv
+     public :: stdlib_qsygvd
+     public :: stdlib_qsygvx
+     public :: stdlib_qsyrfs
+     public :: stdlib_qsysv
+     public :: stdlib_qsysv_aa
+     public :: stdlib_qsysv_rk
+     public :: stdlib_qsysv_rook
+     public :: stdlib_qsysvx
+     public :: stdlib_qsyswapr
+     public :: stdlib_qsytd2
+     public :: stdlib_qsytf2
+     public :: stdlib_qsytf2_rk
+     public :: stdlib_qsytf2_rook
+     public :: stdlib_qsytrd
+     public :: stdlib_qsytrd_sb2st
+     public :: stdlib_qsytrd_sy2sb
+     public :: stdlib_qsytrf
+     public :: stdlib_qsytrf_aa
+     public :: stdlib_qsytrf_rk
+     public :: stdlib_qsytrf_rook
+     public :: stdlib_qsytri
+     public :: stdlib_qsytri_rook
+     public :: stdlib_qsytrs
+     public :: stdlib_qsytrs2
+     public :: stdlib_qsytrs_3
+     public :: stdlib_qsytrs_aa
+     public :: stdlib_qsytrs_rook
+     public :: stdlib_qtbcon
+     public :: stdlib_qtbrfs
+     public :: stdlib_qtbtrs
+     public :: stdlib_qtfsm
+     public :: stdlib_qtftri
+     public :: stdlib_qtfttp
+     public :: stdlib_qtfttr
+     public :: stdlib_qtgevc
+     public :: stdlib_qtgex2
+     public :: stdlib_qtgexc
+     public :: stdlib_qtgsen
+     public :: stdlib_qtgsja
+     public :: stdlib_qtgsna
+     public :: stdlib_qtgsy2
+     public :: stdlib_qtgsyl
+     public :: stdlib_qtpcon
+     public :: stdlib_qtplqt
+     public :: stdlib_qtplqt2
+     public :: stdlib_qtpmlqt
+     public :: stdlib_qtpmqrt
+     public :: stdlib_qtpqrt
+     public :: stdlib_qtpqrt2
+     public :: stdlib_qtprfb
+     public :: stdlib_qtprfs
+     public :: stdlib_qtptri
+     public :: stdlib_qtptrs
+     public :: stdlib_qtpttf
+     public :: stdlib_qtpttr
+     public :: stdlib_qtrcon
+     public :: stdlib_qtrevc
+     public :: stdlib_qtrevc3
+     public :: stdlib_qtrexc
+     public :: stdlib_qtrrfs
+     public :: stdlib_qtrsen
+     public :: stdlib_qtrsna
+     public :: stdlib_qtrsyl
+     public :: stdlib_qtrti2
+     public :: stdlib_qtrtri
+     public :: stdlib_qtrtrs
+     public :: stdlib_qtrttf
+     public :: stdlib_qtrttp
+     public :: stdlib_qtzrzf
+     public :: stdlib_qzsum1
+     public :: stdlib_qlag2q
+
+     ! 128-bit real constants 
+     real(qp),    parameter, private ::     negone = -1.00_qp
+     real(qp),    parameter, private ::       zero = 0.00_qp
+     real(qp),    parameter, private ::       half = 0.50_qp
+     real(qp),    parameter, private ::        one = 1.00_qp
+     real(qp),    parameter, private ::        two = 2.00_qp
+     real(qp),    parameter, private ::      three = 3.00_qp
+     real(qp),    parameter, private ::       four = 4.00_qp
+     real(qp),    parameter, private ::      eight = 8.00_qp
+     real(qp),    parameter, private ::        ten = 10.00_qp
+
+     ! 128-bit complex constants 
+     complex(qp), parameter, private :: czero   = ( 0.0_qp,0.0_qp)
+     complex(qp), parameter, private :: chalf   = ( 0.5_qp,0.0_qp)
+     complex(qp), parameter, private :: cone    = ( 1.0_qp,0.0_qp)
+     complex(qp), parameter, private :: cnegone = (-1.0_qp,0.0_qp)
+
+     ! 128-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(qp),    parameter, private :: rradix = real(radix(zero),qp) 
+     real(qp),    parameter, private :: ulp    = epsilon(zero) 
+     real(qp),    parameter, private :: eps    = ulp*half 
+     real(qp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(qp),    parameter, private :: safmax = one/safmin 
+     real(qp),    parameter, private :: smlnum = safmin/ulp 
+     real(qp),    parameter, private :: bignum = safmax*ulp 
+     real(qp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(qp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 128-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(qp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(qp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(qp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(qp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> DBBCSD: computes the CS decomposition of an orthogonal matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See DORCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The orthogonal matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_qbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
+           ! Array Arguments 
+           real(qp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), work(*)
+           real(qp), intent(inout) :: phi(*), theta(*)
+           real(qp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(qp), parameter :: hundred = 100.0_qp
+           real(qp), parameter :: meighth = -0.125_qp
+           real(qp), parameter :: piover2 = 1.57079632679489661923132169163975144210_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
+           real(qp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lworkmin = 1
+              work(1) = lworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lworkopt = iv2tsn + q - 1
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'EPSILON' )
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_qlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_qlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_qp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_qlartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              else
+                 call stdlib_qlartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              end if
+              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
+              b11d(imin) = temp
+              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
+              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
+              b21d(imin) = temp
+              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_qlartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
+                            r )
+              else if( mu <= nu ) then
+                 call stdlib_qlartgs( b11e( imin ), b11d( imin + 1 ), mu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              else
+                 call stdlib_qlartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_qlartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
+                            r )
+              else if( nu < mu ) then
+                 call stdlib_qlartgs( b21e( imin ), b21d( imin + 1 ), nu,work(iu2cs+imin-1), work(&
+                           iu2sn+imin-1) )
+              else
+                 call stdlib_qlartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
+                           imin-1) )
+              end if
+              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
+              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
+              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
+              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_qlartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_qlartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_qlartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_qlartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 else
+                    call stdlib_qlartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 end if
+                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
+                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_qlartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( nu < mu ) then
+                    call stdlib_qlartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 else
+                    call stdlib_qlartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 end if
+                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
+                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
+                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
+                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_qlartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_qlartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_qlartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( mu <= nu ) then
+                    call stdlib_qlartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 else
+                    call stdlib_qlartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_qlartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( nu < mu ) then
+                    call stdlib_qlartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 else
+                    call stdlib_qlartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 end if
+                 work(iu2cs+i-1) = -work(iu2cs+i-1)
+                 work(iu2sn+i-1) = -work(iu2sn+i-1)
+                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
+                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
+                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
+                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_qlartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_qlartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_qlartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_qlartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_qlartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
+              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
+              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_qlasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_qlasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_qlasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_qlasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_qlasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_qlasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_qlasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_qlasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_qscal( q, negone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_qscal( q, negone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_qscal( p, negone, u1(1,imax), 1 )
+                    else
+                       call stdlib_qscal( p, negone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_qscal( m-p, negone, u2(1,imax), 1 )
+                    else
+                       call stdlib_qscal( m-p, negone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_qscal( m-q, negone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_qscal( m-q, negone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_qswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_qswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_qswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_qswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_qswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_qswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_qswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_qswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_qbbcsd
+
+     !> DBDSDC: computes the singular value decomposition (SVD) of a real
+     !> N-by-N (upper or lower) bidiagonal matrix B:  B = U * S * VT,
+     !> using a divide and conquer method, where S is a diagonal matrix
+     !> with non-negative diagonal elements (the singular values of B), and
+     !> U and VT are orthogonal matrices of left and right singular vectors,
+     !> respectively. DBDSDC can be used to compute all singular values,
+     !> and optionally, singular vectors or singular vectors in compact form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLASD3 for details.
+     !> The code currently calls DLASDQ if singular values only are desired.
+     !> However, it can be slightly modified to compute singular values
+     !> using the divide and conquer method.
+
+     pure subroutine stdlib_qbdsdc( uplo, compq, n, d, e, u, ldu, vt, ldvt, q, iq,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iq(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: q(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+        ! changed dimension statement in comment describing e from (n) to
+        ! (n-1).  sven, 17 feb 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: difl, difr, givcol, givnum, givptr, i, ic, icompq, ierr, ii, is, iu, &
+           iuplo, ivt, j, k, kk, mlvl, nm1, nsize, perm, poles, qstart, smlsiz, smlszp, sqre, &
+                     start, wstart, z
+           real(qp) :: cs, eps, orgnrm, p, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( stdlib_lsame( compq, 'N' ) ) then
+              icompq = 0
+           else if( stdlib_lsame( compq, 'P' ) ) then
+              icompq = 1
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              icompq = 2
+           else
+              icompq = -1
+           end if
+           if( iuplo==0 ) then
+              info = -1
+           else if( icompq<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( ldu<1 ) .or. ( ( icompq==2 ) .and. ( ldu<n ) ) ) then
+              info = -7
+           else if( ( ldvt<1 ) .or. ( ( icompq==2 ) .and. ( ldvt<n ) ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DBDSDC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'DBDSDC', ' ', 0, 0, 0, 0 )
+           if( n==1 ) then
+              if( icompq==1 ) then
+                 q( 1 ) = sign( one, d( 1 ) )
+                 q( 1+smlsiz*n ) = one
+              else if( icompq==2 ) then
+                 u( 1, 1 ) = sign( one, d( 1 ) )
+                 vt( 1, 1 ) = one
+              end if
+              d( 1 ) = abs( d( 1 ) )
+              return
+           end if
+           nm1 = n - 1
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           wstart = 1
+           qstart = 3
+           if( icompq==1 ) then
+              call stdlib_qcopy( n, d, 1, q( 1 ), 1 )
+              call stdlib_qcopy( n-1, e, 1, q( n+1 ), 1 )
+           end if
+           if( iuplo==2 ) then
+              qstart = 5
+              if( icompq == 2 ) wstart = 2*n - 1
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( icompq==1 ) then
+                    q( i+2*n ) = cs
+                    q( i+3*n ) = sn
+                 else if( icompq==2 ) then
+                    work( i ) = cs
+                    work( nm1+i ) = -sn
+                 end if
+              end do
+           end if
+           ! if icompq = 0, use stdlib_qlasdq to compute the singular values.
+           if( icompq==0 ) then
+              ! ignore wstart, instead using work( 1 ), since the two vectors
+              ! for cs and -sn above are added only if icompq == 2,
+              ! and adding them exceeds documented work size of 4*n.
+              call stdlib_qlasdq( 'U', 0, n, 0, 0, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( 1 ), &
+                        info )
+              go to 40
+           end if
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              if( icompq==2 ) then
+                 call stdlib_qlaset( 'A', n, n, zero, one, u, ldu )
+                 call stdlib_qlaset( 'A', n, n, zero, one, vt, ldvt )
+                 call stdlib_qlasdq( 'U', 0, n, n, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( &
+                           wstart ), info )
+              else if( icompq==1 ) then
+                 iu = 1
+                 ivt = iu + n
+                 call stdlib_qlaset( 'A', n, n, zero, one, q( iu+( qstart-1 )*n ),n )
+                 call stdlib_qlaset( 'A', n, n, zero, one, q( ivt+( qstart-1 )*n ),n )
+                 call stdlib_qlasdq( 'U', 0, n, n, n, 0, d, e,q( ivt+( qstart-1 )*n ), n,q( iu+( &
+                           qstart-1 )*n ), n,q( iu+( qstart-1 )*n ), n, work( wstart ),info )
+              end if
+              go to 40
+           end if
+           if( icompq==2 ) then
+              call stdlib_qlaset( 'A', n, n, zero, one, u, ldu )
+              call stdlib_qlaset( 'A', n, n, zero, one, vt, ldvt )
+           end if
+           ! scale.
+           orgnrm = stdlib_qlanst( 'M', n, d, e )
+           if( orgnrm==zero )return
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, ierr )
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, ierr )
+           eps = (0.9e+0_qp)*stdlib_qlamch( 'EPSILON' )
+           mlvl = int( log( real( n,KIND=qp) / real( smlsiz+1,KIND=qp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           if( icompq==1 ) then
+              iu = 1
+              ivt = 1 + smlsiz
+              difl = ivt + smlszp
+              difr = difl + mlvl
+              z = difr + mlvl*2
+              ic = z + mlvl
+              is = ic + 1
+              poles = is + 1
+              givnum = poles + 2*mlvl
+              k = 1
+              givptr = 2
+              perm = 3
+              givcol = perm + mlvl
+           end if
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           start = 1
+           sqre = 0
+           loop_30: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - start + 1
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - start + 1
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n). solve this 1-by-1 problem
+                    ! first.
+                    nsize = i - start + 1
+                    if( icompq==2 ) then
+                       u( n, n ) = sign( one, d( n ) )
+                       vt( n, n ) = one
+                    else if( icompq==1 ) then
+                       q( n+( qstart-1 )*n ) = sign( one, d( n ) )
+                       q( n+( smlsiz+qstart-1 )*n ) = one
+                    end if
+                    d( n ) = abs( d( n ) )
+                 end if
+                 if( icompq==2 ) then
+                    call stdlib_qlasd0( nsize, sqre, d( start ), e( start ),u( start, start ), &
+                              ldu, vt( start, start ),ldvt, smlsiz, iwork, work( wstart ), info )
+                 else
+                    call stdlib_qlasda( icompq, smlsiz, nsize, sqre, d( start ),e( start ), q( &
+                    start+( iu+qstart-2 )*n ), n,q( start+( ivt+qstart-2 )*n ),iq( start+k*n ), q(&
+                     start+( difl+qstart-2 )*n ), q( start+( difr+qstart-2 )*n ),q( start+( z+&
+                     qstart-2 )*n ),q( start+( poles+qstart-2 )*n ),iq( start+givptr*n ), iq( &
+                     start+givcol*n ),n, iq( start+perm*n ),q( start+( givnum+qstart-2 )*n ),q( &
+                     start+( ic+qstart-2 )*n ),q( start+( is+qstart-2 )*n ),work( wstart ), iwork,&
+                                info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+                 start = i + 1
+              end if
+           end do loop_30
+           ! unscale
+           call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, ierr )
+           40 continue
+           ! use selection sort to minimize swaps of singular vectors
+           do ii = 2, n
+              i = ii - 1
+              kk = i
+              p = d( i )
+              do j = ii, n
+                 if( d( j )>p ) then
+                    kk = j
+                    p = d( j )
+                 end if
+              end do
+              if( kk/=i ) then
+                 d( kk ) = d( i )
+                 d( i ) = p
+                 if( icompq==1 ) then
+                    iq( i ) = kk
+                 else if( icompq==2 ) then
+                    call stdlib_qswap( n, u( 1, i ), 1, u( 1, kk ), 1 )
+                    call stdlib_qswap( n, vt( i, 1 ), ldvt, vt( kk, 1 ), ldvt )
+                 end if
+              else if( icompq==1 ) then
+                 iq( i ) = i
+              end if
+           end do
+           ! if icompq = 1, use iq(n,1) as the indicator for uplo
+           if( icompq==1 ) then
+              if( iuplo==1 ) then
+                 iq( n ) = 1
+              else
+                 iq( n ) = 0
+              end if
+           end if
+           ! if b is lower bidiagonal, update u by those givens rotations
+           ! which rotated b to be upper bidiagonal
+           if( ( iuplo==2 ) .and. ( icompq==2 ) )call stdlib_qlasr( 'L', 'V', 'B', n, n, work( 1 )&
+                     , work( n ), u, ldu )
+           return
+     end subroutine stdlib_qbdsdc
+
+     !> DBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**T
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**T*VT instead of
+     !> P**T, for given real input matrices U and VT.  When U and VT are the
+     !> orthogonal matrices that reduce a general matrix A to bidiagonal
+     !> form:  A = U*B*VT, as computed by DGEBRD, then
+     !> A = (U*Q) * S * (P**T*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**T*C
+     !> for a given real input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_qbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, work, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: hndrth = 0.01_qp
+           real(qp), parameter :: hndrd = 100.0_qp
+           real(qp), parameter :: meigth = -0.125_qp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, iterdivn, j, ll, lll, m, maxitdivn, nm1, nm12, &
+                     nm13, oldll, oldm
+           real(qp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_qlasq1( n, d, e, work, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_qlamch( 'EPSILON' )
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 work( i ) = cs
+                 work( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_qlasr( 'R', 'V', 'F', nru, n, work( 1 ), work( n ), u,ldu )
+                        
+              if( ncc>0 )call stdlib_qlasr( 'L', 'V', 'F', n, ncc, work( 1 ), work( n ), c,ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=qp) )
+              thresh = max( tol*sminoa, maxitr*(n*(n*unfl)) )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*(n*(n*unfl)) )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxitdivn = maxitr*n
+           iterdivn = 0
+           iter = -1
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>=n ) then
+              iter = iter - n
+              iterdivn = iterdivn + 1
+              if( iterdivn>=maxitdivn )go to 200
+           end if
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_qlasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_qrot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt, cosr,sinr &
+                        )
+              if( nru>0 )call stdlib_qrot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+              if( ncc>0 )call stdlib_qrot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_qlas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_qlas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_qlartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_qlartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll+1 ) = cs
+                    work( i-ll+1+nm1 ) = sn
+                    work( i-ll+1+nm12 ) = oldcs
+                    work( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_qlasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_qlasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_qlasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_qlartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_qlartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll ) = cs
+                    work( i-ll+nm1 ) = -sn
+                    work( i-ll+nm12 ) = oldcs
+                    work( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_qlasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_qlasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_qlasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_qlartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_qlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    work( i-ll+1 ) = cosr
+                    work( i-ll+1+nm1 ) = sinr
+                    work( i-ll+1+nm12 ) = cosl
+                    work( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_qlasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_qlasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_qlasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_qlartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_qlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    work( i-ll ) = cosr
+                    work( i-ll+nm1 ) = -sinr
+                    work( i-ll+nm12 ) = cosl
+                    work( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_qlasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_qlasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_qlasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_qscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_qswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_qswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_qswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_qbdsqr
+
+     !> DDISNA: computes the reciprocal condition numbers for the eigenvectors
+     !> of a real symmetric or complex Hermitian matrix or for the left or
+     !> right singular vectors of a general m-by-n matrix. The reciprocal
+     !> condition number is the 'gap' between the corresponding eigenvalue or
+     !> singular value and the nearest other one.
+     !> The bound on the error, measured by angle in radians, in the I-th
+     !> computed vector is given by
+     !> DLAMCH( 'E' ) * ( ANORM / SEP( I ) )
+     !> where ANORM = 2-norm(A) = max( abs( D(j) ) ).  SEP(I) is not allowed
+     !> to be smaller than DLAMCH( 'E' )*ANORM in order to limit the size of
+     !> the error bound.
+     !> DDISNA may also be used to compute error bounds for eigenvectors of
+     !> the generalized symmetric definite eigenproblem.
+
+     pure subroutine stdlib_qdisna( job, m, n, d, sep, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           real(qp), intent(out) :: sep(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: decr, eigen, incr, left, right, sing
+           integer(ilp) :: i, k
+           real(qp) :: anorm, eps, newgap, oldgap, safmin, thresh
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           eigen = stdlib_lsame( job, 'E' )
+           left = stdlib_lsame( job, 'L' )
+           right = stdlib_lsame( job, 'R' )
+           sing = left .or. right
+           if( eigen ) then
+              k = m
+           else if( sing ) then
+              k = min( m, n )
+           end if
+           if( .not.eigen .and. .not.sing ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( k<0 ) then
+              info = -3
+           else
+              incr = .true.
+              decr = .true.
+              do i = 1, k - 1
+                 if( incr )incr = incr .and. d( i )<=d( i+1 )
+                 if( decr )decr = decr .and. d( i )>=d( i+1 )
+              end do
+              if( sing .and. k>0 ) then
+                 if( incr )incr = incr .and. zero<=d( 1 )
+                 if( decr )decr = decr .and. d( k )>=zero
+              end if
+              if( .not.( incr .or. decr ) )info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DDISNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! compute reciprocal condition numbers
+           if( k==1 ) then
+              sep( 1 ) = stdlib_qlamch( 'O' )
+           else
+              oldgap = abs( d( 2 )-d( 1 ) )
+              sep( 1 ) = oldgap
+              do i = 2, k - 1
+                 newgap = abs( d( i+1 )-d( i ) )
+                 sep( i ) = min( oldgap, newgap )
+                 oldgap = newgap
+              end do
+              sep( k ) = oldgap
+           end if
+           if( sing ) then
+              if( ( left .and. m>n ) .or. ( right .and. m<n ) ) then
+                 if( incr )sep( 1 ) = min( sep( 1 ), d( 1 ) )
+                 if( decr )sep( k ) = min( sep( k ), d( k ) )
+              end if
+           end if
+           ! ensure that reciprocal condition numbers are not less than
+           ! threshold, in order to limit the size of the error bound
+           eps = stdlib_qlamch( 'E' )
+           safmin = stdlib_qlamch( 'S' )
+           anorm = max( abs( d( 1 ) ), abs( d( k ) ) )
+           if( anorm==zero ) then
+              thresh = eps
+           else
+              thresh = max( eps*anorm, safmin )
+           end if
+           do i = 1, k
+              sep( i ) = max( sep( i ), thresh )
+           end do
+           return
+     end subroutine stdlib_qdisna
+
+     !> DGBBRD: reduces a real general m-by-n band matrix A to upper
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**T, or computes
+     !> Q**T*C for a given matrix C.
+
+     pure subroutine stdlib_qgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           real(qp), intent(out) :: d(*), e(*), pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mn,&
+                      mu, mu0, nr, nrt
+           real(qp) :: ra, rb, rc, rs
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**t to the unit matrix, if needed
+           if( wantq )call stdlib_qlaset( 'FULL', m, m, zero, one, q, ldq )
+           if( wantpt )call stdlib_qlaset( 'FULL', n, n, zero, one, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the sines of the plane rotations are stored in work(1:max(m,n))
+              ! and the cosines in work(max(m,n)+1:2*max(m,n)).
+              mn = max( m, n )
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_qlargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              work( mn+j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_qlartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,work( mn+j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_qlartg( ab( ku+ml-1, i ), ab( ku+ml, i ),work( mn+i+ml-1 ), &
+                                    work( i+ml-1 ),ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_qrot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,work( mn+i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_qrot( m, q( 1, j-1 ), 1, q( 1, j ), 1,work( mn+j ), work( j &
+                                    ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_qrot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,work( mn+j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = work( mn+j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_qlargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               work( mn+j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_qlartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,work( mn+j1+kun ), work( j1+kun ),kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_qlartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),work( &
+                                    mn+i+mu-1 ), work( i+mu-1 ),ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_qrot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,work( mn+i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**t
+                       do j = j1, j2, kb1
+                          call stdlib_qrot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, work( &
+                                    mn+j+kun ),work( j+kun ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = work( mn+j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, storing diagonal elements in d
+              ! and off-diagonal elements in e
+              do i = 1, min( m-1, n )
+                 call stdlib_qlartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 d( i ) = ra
+                 if( i<n ) then
+                    e( i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_qrot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc, rs )
+                 if( wantc )call stdlib_qrot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+              if( m<=n )d( m ) = ab( 1, m )
+           else if( ku>0 ) then
+              ! a has been reduced to upper bidiagonal form
+              if( m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right, storing diagonal elements in d and off-diagonal
+                 ! elements in e
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_qlartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    d( i ) = ra
+                    if( i>1 ) then
+                       rb = -rs*ab( ku, i )
+                       e( i-1 ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_qrot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, rs )
+                              
+                 end do
+              else
+                 ! copy off-diagonal elements to e and diagonal elements to d
+                 do i = 1, minmn - 1
+                    e( i ) = ab( ku, i+1 )
+                 end do
+                 do i = 1, minmn
+                    d( i ) = ab( ku+1, i )
+                 end do
+              end if
+           else
+              ! a is diagonal. set elements of e to zero and copy diagonal
+              ! elements to d.
+              do i = 1, minmn - 1
+                 e( i ) = zero
+              end do
+              do i = 1, minmn
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_qgbbrd
+
+     !> DGBCON: estimates the reciprocal of the condition number of a real
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by DGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_qgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(qp) :: ainvnm, scale, smlnum, t
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_qaxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_qlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_qlatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, ldab, &
+                           work, scale, work( 2*n+1 ),info )
+                 ! multiply by inv(l**t).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_qdot( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_iqamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_qrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_qgbcon
+
+     !> DGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_qgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_qgbequ
+
+     !> DGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from DGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_qgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_qlamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_qgbequb
+
+     !> DGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_qgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qgbmv( trans, n, n, kl, ku, -one, ab, ldab, x( 1, j ), 1,one, work( n+1 &
+                        ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = abs( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       work( i ) = work( i ) + abs( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + abs( ab( kk+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_qgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qgbrfs
+
+     !> DGBSV: computes the solution to a real system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_qgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_qgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_qgbsv
+
+     !> DGBSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), c(*), r(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(qp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_qgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_qlaqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_qcopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_qgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_qlantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_qlangb( norm, n, kl, ku, ab, ldab, work )
+           rpvgrw = stdlib_qlantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_qlangb( 'M', n, kl, ku, ab, ldab, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, iwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_qgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_qgbsvx
+
+     !> DGBTF2: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to zero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = zero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to zero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = zero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_iqamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=zero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_qswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_qscal( km, one / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_qger( km, ju-j, -one, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is zero, set info to the index of the pivot
+                 ! unless a zero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_qgbtf2
+
+     !> DGBTRF: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           real(qp) :: temp
+           ! Local Arrays 
+           real(qp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'DGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_qgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! zero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = zero
+                 end do
+              end do
+              ! zero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = zero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to zero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = zero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to zero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = zero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_iqamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=zero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_qswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_qswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_qswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_qscal( km, one / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_qger( km, jm-jj, -one, ab( kv+2, jj ), 1,ab( kv, jj+&
+                                 1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is zero, set info to the index of the pivot
+                       ! unless a zero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_qcopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_qlaswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_qlaswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, one, ab(&
+                                  kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+1, j+jb ), &
+                                    ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -one, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+kl+1-jb, j+jb ), &
+                                    ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, one, ab(&
+                                  kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,work13, ldwork, one, ab( 1+jb, j+kv ),ldab-1 )
+                                    
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -one, &
+                                    work31, ldwork, work13,ldwork, one, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_qswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_qswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_qcopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_qgbtrf
+
+     !> DGBTRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by DGBTRF.
+
+     pure subroutine stdlib_qgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-one modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_qswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_qger( lm, nrhs, -one, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, 1 )&
+                              , ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else
+              ! solve a**t*x = b.
+              do i = 1, nrhs
+                 ! solve u**t*x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t*x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_qgemv( 'TRANSPOSE', lm, nrhs, -one, b( j+1, 1 ),ldb, ab( kd+1, j )&
+                              , 1, one, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_qswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qgbtrs
+
+     !> DGEBAK: forms the right or left eigenvectors of a real general matrix
+     !> by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by DGEBAL.
+
+     pure subroutine stdlib_qgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: scale(*)
+           real(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(qp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_qscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_qscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_qgebak
+
+     !> DGEBAL: balances a general real matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_qgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: scale(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sclfac = 2.0e+0_qp
+           real(qp), parameter :: factor = 0.95e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(qp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_qswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_qswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( a( j, i )/=zero )cycle loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( a( i, j )/=zero )cycle loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_qnrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_qnrm2( l-k+1, a( i, k ), lda )
+              ica = stdlib_iqamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_iqamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+                 if( stdlib_qisnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'DGEBAL', -info )
+                 return
+              end if
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_qscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_qscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_qgebal
+
+     !> DGEBD2: reduces a real general m by n matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_qgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'DGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 call stdlib_qlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_qlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tauq( i ),a( i, i+&
+                           1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_qlarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_qlarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = zero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_qlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_qlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    call stdlib_qlarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! apply h(i) to a(i+1:m,i+1:n) from the left
+                    call stdlib_qlarf( 'LEFT', m-i, n-i, a( i+1, i ), 1, tauq( i ),a( i+1, i+1 ), &
+                              lda, work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = zero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qgebd2
+
+     !> DGEBRD: reduces a general real M-by-N matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_qgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'DGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=qp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'DGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'DGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'DGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+nb-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_qlabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+nb:m,i+nb:n), using an update
+              ! of the form  a := a - v*y**t - x*u**t
+              call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, a( i+&
+                        nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, one,a( i+nb, i+nb ), lda )
+              call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,one, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_qgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_qgebrd
+
+     !> DGECON: estimates the reciprocal of the condition number of a general
+     !> real matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by DGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_qgecon( norm, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, scale, sl, smlnum, su
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_qlatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+                 ! multiply by inv(u).
+                 call stdlib_qlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, work( 3*n+1 ), info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_qlatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, su,&
+                            work( 3*n+1 ), info )
+                 ! multiply by inv(l**t).
+                 call stdlib_qlatrs( 'LOWER', 'TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_iqamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_qrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_qgecon
+
+     !> DGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_qgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_qgeequ
+
+     !> DGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from DGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_qgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_qlamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_qgeequb
+
+     !> DGEES: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A matrix is in real Schur form if it is upper quasi-triangular with
+     !> 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in the
+     !> form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_qgees( jobvs, sort, select, n, a, lda, sdim, wr, wi,vs, ldvs, work, lwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_q) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, maxwrk, minwrk
+           real(qp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_qhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_qhseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_qlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_qgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_qgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_qlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (workspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_qhseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (workspace: none needed)
+              call stdlib_qtrsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, s, sep, &
+                        work( iwrk ), lwork-iwrk+1, idum, 1,icond )
+              if( icond>0 )info = n + icond
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (workspace: need n)
+              call stdlib_qgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_qlascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_qcopy( n, a, lda+1, wr, 1 )
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi,max( ilo-1, 1 ), &
+                              ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_qswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_qswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                             call stdlib_qswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              ! undo scaling for the imaginary part of the eigenvalues
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgees
+
+     !> DGEESX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_qp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A real matrix is in real Schur form if it is upper quasi-triangular
+     !> with 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in
+     !> the form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_qgeesx( jobvs, sort, select, sense, n, a, lda, sdim,wr, wi, vs, ldvs, &
+               rconde, rcondv, work, lwork,iwork, liwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, liwork, lwork, n
+           real(qp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_q) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantsb, wantse, wantsn, wantst, &
+                     wantsv, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, liwrk, lwrk, maxwrk, minwrk
+           real(qp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "rworkspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! iworkspace refers to integer workspace.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_qhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_qtrsen later
+             ! in the code.)
+           if( info==0 ) then
+              liwrk = 1
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_qhseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, n + ( n*n )/2 )
+                 if( wantsv .or. wantsb )liwrk = ( n*n )/4
+              end if
+              iwork( 1 ) = liwrk
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<1 .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_qlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_qgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (rworkspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_qgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_qlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (rworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (rworkspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_qhseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (rworkspace: if sense is not 'n', need n+2*sdim*(n-sdim)
+                           ! otherwise, need n )
+              ! (iworkspace: if sense is 'v' or 'b', need sdim*(n-sdim)
+                           ! otherwise, need 0 )
+              call stdlib_qtrsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,iwork, liwork, icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, n+2*sdim*( n-sdim ) )
+              if( icond==-15 ) then
+                 ! not enough real workspace
+                 info = -16
+              else if( icond==-17 ) then
+                 ! not enough integer workspace
+                 info = -18
+              else if( icond>0 ) then
+                 ! stdlib_qtrsen failed to reorder or to restore standard schur form
+                 info = icond + n
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (rworkspace: need n)
+              call stdlib_qgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_qlascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_qcopy( n, a, lda+1, wr, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_qswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_qswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                            call stdlib_qswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           if( wantsv .or. wantsb ) then
+              iwork( 1 ) = max( 1, sdim*( n-sdim ) )
+           else
+              iwork( 1 ) = 1
+           end if
+           return
+     end subroutine stdlib_qgeesx
+
+     !> DGEEV: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_qgeev( jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr,ldvr, work, lwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: vl(ldvl,*), vr(ldvr,*), wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, itau, iwrk, k, lwork_trevc, maxwrk, &
+                     minwrk, nout
+           real(qp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -9
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_qhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_qhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_qtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else if( wantvr ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'DORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_qhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_qtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else
+                    minwrk = 3*n
+                    call stdlib_qhseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_qlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_qgebal( 'B', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = ibal + n
+           iwrk = itau + n
+           call stdlib_qgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_qlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_qlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_qlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( 'E', 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_qhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 4*n, prefer n + n + 2*n*nb)
+              call stdlib_qtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (workspace: need n)
+              call stdlib_qgebak( 'B', 'L', n, ilo, ihi, work( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_qnrm2( n, vl( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_qlapy2( stdlib_qnrm2( n, vl( 1, i ), 1 ),stdlib_qnrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_qscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_iqamax( n, work( iwrk ), 1 )
+                    call stdlib_qlartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_qrot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (workspace: need n)
+              call stdlib_qgebak( 'B', 'R', n, ilo, ihi, work( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_qnrm2( n, vr( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_qlapy2( stdlib_qnrm2( n, vr( 1, i ), 1 ),stdlib_qnrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_qscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_iqamax( n, work( iwrk ), 1 )
+                    call stdlib_qlartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_qrot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info>0 ) then
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgeev
+
+     !> DGEEVX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_qp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_qgeevx( balanc, jobvl, jobvr, sense, n, a, lda, wr, wi,vl, ldvl, vr, ldvr, &
+               ilo, ihi, scale, abnrm,rconde, rcondv, work, lwork, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(qp), intent(out) :: abnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: rconde(*), rcondv(*), scale(*), vl(ldvl,*), vr(ldvr,*), wi(*),&
+                      work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(qp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ).or. &
+                     stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) )then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_qhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'DGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_qtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_qhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                 else if( wantvr ) then
+                    call stdlib_qtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_qhseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                 else
+                    if( wntsnn ) then
+                       call stdlib_qhseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    else
+                       call stdlib_qhseqr( 'S', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.wntsnn )minwrk = max( minwrk, n*n+6*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.wntsnn )maxwrk = max( maxwrk, n*n + 6*n )
+                 else
+                    minwrk = 3*n
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )minwrk = max( minwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'DORGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )maxwrk = max( maxwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, 3*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_qlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_qlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_qgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_qlange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (workspace: need 2*n, prefer n+n*nb)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_qgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_qlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_qlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_qlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_qorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_qhseqr( job, 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_qhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 3*n, prefer n + 2*n*nb)
+              call stdlib_qtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (workspace: need n*n+6*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_qtrsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, iwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_qgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_qnrm2( n, vl( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_qlapy2( stdlib_qnrm2( n, vl( 1, i ), 1 ),stdlib_qnrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_qscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_iqamax( n, work, 1 )
+                    call stdlib_qlartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_qrot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_qgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_qnrm2( n, vr( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_qlapy2( stdlib_qnrm2( n, vr( 1, i ), 1 ),stdlib_qnrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_qscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_qscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_iqamax( n, work, 1 )
+                    call stdlib_qlartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_qrot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_qlascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgeevx
+
+     !> DGEHD2: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_qgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              call stdlib_qlarfg( ihi-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+              aii = a( i+1, i )
+              a( i+1, i ) = one
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_qlarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i) to a(i+1:ihi,i+1:n) from the left
+              call stdlib_qlarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1, tau( i ),a( i+1, i+1 ), lda, &
+                        work )
+              a( i+1, i ) = aii
+           end do
+           return
+     end subroutine stdlib_qgehd2
+
+     !> DGEHRD: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_qgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           real(qp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to zero
+           do i = 1, ilo - 1
+              tau( i ) = zero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = zero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'DGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin + tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**t
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_qlahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**t. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = one
+                 call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE',ihi, ihi-i-ib+1,ib, -one, work, &
+                           ldwork, a( i+ib, i ), lda, one,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE','UNIT', i, ib-1,one, a( i+1, i )&
+                           , lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_qaxpy( i, -one, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_qlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, n-i-ib+1, &
+                           ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_qgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qgehrd
+
+     !> DGEJSV: computes the singular value decomposition (SVD) of a real M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^t,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+     !> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+     !> DGEJSV can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_qgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: sva(n), u(ldu,*), v(ldv,*), work(lwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           ! Local Scalars 
+           real(qp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lsvec, l2aber, l2kill, &
+                     l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           ! Intrinsic Functions 
+           intrinsic :: abs,log,max,min,real,idnint,sign,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec  .or. stdlib_lsame( jobu, 'N' ) .or.stdlib_lsame( jobu, 'W' )) )&
+                      then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.stdlib_lsame( jobv, 'W' )) &
+                     .or. ( jracc .and. (.not.lsvec) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( l2tran .or. stdlib_lsame( jobt, 'N' ) ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else if ( (.not.(lsvec .or. rsvec .or. errest).and.(lwork < max(7,4*n+1,2*m+n))) .or.(&
+           .not.(lsvec .or. rsvec) .and. errest .and.(lwork < max(7,4*n+n*n,2*m+n))) .or.(lsvec &
+           .and. (.not.rsvec) .and. (lwork < max(7,2*m+n,4*n+1))).or.(rsvec .and. (.not.lsvec) &
+           .and. (lwork < max(7,2*m+n,4*n+1))).or.(lsvec .and. rsvec .and. (.not.jracc) .and.(&
+           lwork<max(2*m+n,6*n+2*n*n))).or. (lsvec .and. rsvec .and. jracc .and.lwork<max(2*m+n,&
+                     4*n+n*n,2*n+n*n+6)))then
+              info = - 17
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'DGEJSV', - info )
+              return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:3) = 0
+              work(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_qlamch() does not fail on the target architecture.
+           epsln = stdlib_qlamch('EPSILON')
+           sfmin = stdlib_qlamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_qlamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=qp)*real(n,KIND=qp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_qlassq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'DGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_qscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_qlaset( 'G', m, n1, zero, one, u, ldu )
+              if ( rsvec ) call stdlib_qlaset( 'G', n, n,  zero, one, v, ldv )
+              work(1) = one
+              work(2) = one
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_qlascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_qlacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                    call stdlib_qgeqrf( m, n, u,ldu, work, work(n+1),lwork-n,ierr )
+                    call stdlib_qorgqr( m,n1,1, u,ldu,work,work(n+1),lwork-n,ierr )
+                    call stdlib_qcopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = one
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              work(1) = one / scalem
+              work(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           l2tran = l2tran .and. ( m == n )
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^t and a^t * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_qlassq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_qlassq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    work(m+n+p)  = xsc * scalem
+                    work(n+p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, work(n+p) )
+                    if (work(n+p) /= zero) aatmin = min(aatmin,work(n+p))
+                 end do
+              else
+                 do p = 1, m
+                    work(m+n+p) = scalem*abs( a(p,stdlib_iqamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, work(m+n+p) )
+                    aatmin = min( aatmin, work(m+n+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^t  would be  better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_qlassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=qp))
+              ! now, sva().^2/trace(a^t * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^t * a.  do the same with the
+              ! diagonal of a * a^t, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^t and a^t * a have the
+              ! same trace.
+              entrat = zero
+              do p = n+1, n+m
+                 big1 = ( ( work(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=qp))
+              ! analyze the entropies and decide a or a^t. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^t is better than a, transpose a.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    do q = p + 1, n
+                        temp1 = a(q,p)
+                       a(q,p) = a(p,q)
+                       a(p,q) = temp1
+                    end do
+                 end do
+                 do p = 1, n
+                    work(m+n+p) = sva(p)
+                    sva(p)      = work(n+p)
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_qgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_qgesvj will compute them. so, in that case,
+           ! one should use stdlib_qgesvj instead of stdlib_qgejsv.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=qp) )
+           call stdlib_qlascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_qlascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_qgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_qlaset( 'A', m, 1, zero, zero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              do p = 1, m - 1
+                 q = stdlib_iqamax( m-p+1, work(m+n+p), 1 ) + p - 1
+                 iwork(2*n+p) = q
+                 if ( p /= q ) then
+                    temp1       = work(m+n+p)
+                    work(m+n+p) = work(m+n+q)
+                    work(m+n+q) = temp1
+                 end if
+              end do
+              call stdlib_qlaswp( n, a, lda, 1, m-1, iwork(2*n+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_qgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in sgeqpx from toms # 782). good results will be obtained using
+           ! sgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_qgeqp3 improves overall performance of stdlib_qgejsv.
+           ! a * p1 = q1 * [ r1^t 0]^t:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_qgeqp3( m,n,a,lda, iwork,work, work(n+1),lwork-n, ierr )
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_qgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=qp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=qp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_qlacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_qscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    call stdlib_qpocon( 'U', n, v, ldv, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_qlacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_qscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_qpocon( 'U', n, u, ldu, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else
+                    call stdlib_qlacpy( 'U', n, n, a, lda, work(n+1), n )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_qscal( p, one/temp1, work(n+(p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+                    call stdlib_qpocon( 'U', n, work(n+1), n, one, temp1,work(n+n*n+1), iwork(2*n+&
+                              m+1), ierr )
+                 end if
+                 sconda = one / sqrt(temp1)
+                 ! sconda is an estimate of sqrt(||(r^t * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_qcopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+              end do
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=qp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'U', nr-1,nr-1, zero,zero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_qgeqrf( n,nr, a,lda, work, work(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_qcopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 end do
+              end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=qp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'U', nr-1, nr-1, zero, zero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_qgesvj( 'L', 'NOU', 'NOV', nr, nr, a, lda, sva,n, v, ldv, work, &
+                           lwork, info )
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_qcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 call stdlib_qlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_qgesvj( 'L','U','N', n, nr, v,ldv, sva, nr, a,lda,work, lwork, info )
+                           
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_qlaset( 'LOWER', nr-1, nr-1, zero, zero, a(2,1), lda )
+                 call stdlib_qgelqf( nr, n, a, lda, work, work(n+1), lwork-n, ierr)
+                 call stdlib_qlacpy( 'LOWER', nr, nr, a, lda, v, ldv )
+                 call stdlib_qlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_qgeqrf( nr, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_qcopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                 end do
+                 call stdlib_qlaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_qgesvj( 'LOWER', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, work(n+1), &
+                           lwork, info )
+                 scalem  = work(n+1)
+                 numrank = nint(work(n+2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_qlaset( 'A',n-nr, nr, zero,zero, v(nr+1,1),   ldv )
+                    call stdlib_qlaset( 'A',nr, n-nr, zero,zero, v(1,nr+1),   ldv )
+                    call stdlib_qlaset( 'A',n-nr,n-nr,zero,one, v(nr+1,nr+1), ldv )
+                 end if
+              call stdlib_qormlq( 'LEFT', 'TRANSPOSE', n, n, nr, a, lda, work,v, ldv, work(n+1), &
+                        lwork-n, ierr )
+              end if
+              do p = 1, n
+                 call stdlib_qcopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+              end do
+              call stdlib_qlacpy( 'ALL', n, n, a, lda, v, ldv )
+              if ( transp ) then
+                 call stdlib_qlacpy( 'ALL', n, n, v, ldv, u, ldu )
+              end if
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_qcopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+              end do
+              call stdlib_qlaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_qgeqrf( n, nr, u, ldu, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              do p = 1, nr - 1
+                 call stdlib_qcopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+              end do
+              call stdlib_qlaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_qgesvj( 'LOWER', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, work(n+1), &
+                        lwork-n, info )
+              scalem  = work(n+1)
+              numrank = nint(work(n+2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_qlaset( 'A',  m-nr, nr,zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_qlaset( 'A',nr, n1-nr, zero, zero, u(1,nr+1), ldu )
+                    call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                 end if
+              end if
+              call stdlib_qormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+              if ( rowpiv )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_qnrm2( m, u(1,p), 1 )
+                 call stdlib_qscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_qlacpy( 'ALL', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_qgejsv.
+                 do p = 1, nr
+                    call stdlib_qcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       temp1 = xsc*abs( v(q,q) )
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    sign( temp1, v(p,q) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_qlacpy( 'L', nr, nr, v, ldv, work(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_qnrm2(nr-p+1,work(2*n+(p-1)*nr+p),1)
+                    call stdlib_qscal(nr-p+1,one/temp1,work(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_qpocon('LOWER',nr,work(2*n+1),nr,one,temp1,work(2*n+nr*nr+1),iwork(m+&
+                           2*n+1),ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=qp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=qp))
+                 cond_ok = sqrt(real(nr,KIND=qp))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^t = q2 * r2
+                    call stdlib_qgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_qlacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_qcopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                    end do
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_qgeqp3
+                    ! should be replaced with eg. call sgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^t * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_qgeqp3( n, nr, v, ldv, iwork(n+1), work(n+1),work(2*n+1), lwork-&
+                              2*n, ierr )
+      ! *               call stdlib_qgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_qlacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             v(p,q) = - sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    else
+                       call stdlib_qlaset( 'L',nr-1,nr-1,zero,zero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_qgelqf( nr, nr, v, ldv, work(2*n+n*nr+1),work(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_qlacpy( 'L',nr,nr,v,ldv,work(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_qnrm2( p, work(2*n+n*nr+nr+p), nr )
+                       call stdlib_qscal( p, one/temp1, work(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_qpocon( 'L',nr,work(2*n+n*nr+nr+1),nr,one,temp1,work(2*n+n*nr+nr+&
+                              nr*nr+1),iwork(m+2*n+1),ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_qlacpy( 'U', nr, nr, v, ldv, work(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       temp1 = xsc * v(q,q)
+                       do p = 1, q - 1
+                          ! v(p,q) = - sign( temp1, v(q,p) )
+                          v(p,q) = - sign( temp1, v(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_qgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u,ldu,work(2*n+n*nr+nr+1),&
+                              lwork-2*n-n*nr-nr,info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_qcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_qscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_qgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_qtrsm( 'L','U','N','N', nr,nr,one, a,lda, v,ldv )
+                    else
+                       ! .. r1 is well conditioned, but non-square. transpose(r2)
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_qgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_qtrsm('L','U','T','N',nr,nr,one,work(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                         call stdlib_qlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                         call stdlib_qlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                         call stdlib_qlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_qormqr('L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                                 n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+       ! :)           .. the input matrix a is very likely a relative of
+                    ! the kahan matrix :)
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^t*v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_qgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_qcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_qscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_qtrsm('L','U','N','N',nr,nr,one,work(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_qlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_qlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_qlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_qormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_qgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_qgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_qgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_qlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_qlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_qlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_qormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_qormlq( 'L', 'T', nr, nr, nr, work(2*n+1), n,work(2*n+n*nr+1), u, &
+                              ldu, work(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=qp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_qnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_qscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_qlaset( 'A', m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                    if ( nr < n1 ) then
+                       call stdlib_qlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                       call stdlib_qlaset('A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_qormqr( 'LEFT', 'NO_TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=qp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_qnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_qscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_qlacpy( 'UPPER', n, n, a, lda, work(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       temp1 = xsc * work( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                          work(n+(q-1)*n+p)=-sign(temp1,work(n+(p-1)*n+q))
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'LOWER',n-1,n-1,zero,zero,work(n+2),n )
+                 end if
+                 call stdlib_qgesvj( 'UPPER', 'U', 'N', n, n, work(n+1), n, sva,n, u, ldu, work(n+&
+                           n*n+1), lwork-n-n*n, info )
+                 scalem  = work(n+n*n+1)
+                 numrank = nint(work(n+n*n+2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_qcopy( n, work(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_qscal( n, sva(p), work(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_qtrsm( 'LEFT', 'UPPER', 'NOTRANS', 'NO UD', n, n,one, a, lda, work(n+&
+                           1), n )
+                 do p = 1, n
+                    call stdlib_qcopy( n, work(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=qp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_qnrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_qscal( n, xsc, &
+                              v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_qlaset( 'A',  m-n, n, zero, zero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_qlaset( 'A',n,  n1-n, zero, zero,  u(1,n+1),ldu )
+                       call stdlib_qlaset( 'A',m-n,n1-n, zero, one,u(n+1,n+1),ldu )
+                    end if
+                 end if
+                 call stdlib_qormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 temp1 = sqrt(real(m,KIND=qp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_qnrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_qscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values. since that is not always the case, ...
+              do p = 1, nr
+                 call stdlib_qcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    temp1 = xsc*abs( v(q,q) )
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = sign(&
+                                  temp1, v(p,q) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_qlaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+              end if
+              call stdlib_qgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              call stdlib_qlacpy( 'L', n, nr, v, ldv, work(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_qcopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       temp1 = xsc * min(abs(u(p,p)),abs(u(q,q)))
+                       u(p,q) = - sign( temp1, u(q,p) )
+                    end do
+                 end do
+              else
+                 call stdlib_qlaset('U', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              end if
+              call stdlib_qgesvj( 'G', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, work(2*n+n*nr+1), &
+                        lwork-2*n-n*nr, info )
+              scalem  = work(2*n+n*nr+1)
+              numrank = nint(work(2*n+n*nr+2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_qlaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                 call stdlib_qlaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                 call stdlib_qlaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_qormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+n*nr+nr+1)&
+                        ,lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=qp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_qnrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_qscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_qlaset( 'A',  m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_qlaset( 'A',nr,  n1-nr, zero, zero,  u(1,nr+1),ldu )
+                    call stdlib_qlaset( 'A',m-nr,n1-nr, zero, one,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_qormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+                 if ( rowpiv )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^t
+                 do p = 1, n
+                    call stdlib_qswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_qlascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           work(1) = uscal2 * scalem
+           work(2) = uscal1
+           if ( errest ) work(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              work(4) = condr1
+              work(5) = condr2
+           end if
+           if ( l2tran ) then
+              work(6) = entra
+              work(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           return
+     end subroutine stdlib_qgejsv
+
+     !> DGELQ: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_qgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'DGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'DGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if ( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+                 lminws = .true.
+                 mb = 1
+                 nb = n
+             end if
+             if( lwork<lwopt ) then
+                 lminws = .true.
+                 mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_qgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_qlaswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_qgelq
+
+     !> DGELQ2: computes an LQ factorization of a real m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_qgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_qlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_qlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_qgelq2
+
+     !> DGELQF: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_qgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_qgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_qlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_qlarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_qgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgelqf
+
+     !> DGELQT: computes a blocked LQ factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_qgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. ( mb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_qgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_qlarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_qgelqt
+
+     !> DGELQT3: recursively computes a LQ factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_qgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_qlarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_qgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) = q1^h a(j1:m,1:n) [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', 'T', 'U', m2, m1, one,a, lda, t( i1, 1 ), ldt )
+              call stdlib_qgemm( 'N', 'T', m2, m1, n-m1, one, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        one, t( i1, 1 ), ldt)
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', m2, m1, one,t, ldt, t( i1, 1 ), ldt )
+              call stdlib_qgemm( 'N', 'N', m2, n-m1, m1, -one, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        one, a( i1, i1 ), lda )
+              call stdlib_qtrmm( 'R', 'U', 'N', 'U', m2, m1 , one,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )=0
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_qgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', 'T', 'U', m1, m2, one,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_qgemm( 'N', 'T', m1, m2, n-m, one, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, i1 ), ldt )
+              call stdlib_qtrmm( 'L', 'U', 'N', 'N', m1, m2, -one, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', m1, m2, one,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_qgelqt3
+
+     !> DGELS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, or its transpose, using a QR or LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_qgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(qp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(qp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn+max( mn, nrhs ) ) .and. .not.lquery )then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMQR', 'LT', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMLQ', 'LT', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'DORMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn+max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_qlaset( 'FULL', max( m, n ), nrhs, zero, zero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_qlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_qlange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_qgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+                 call stdlib_qormqr( 'LEFT', 'TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_qtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_qtrtrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_qormqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_qgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_qtrtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_qormlq( 'LEFT', 'TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_qormlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_qtrtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=qp)
+           return
+     end subroutine stdlib_qgels
+
+     !> DGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, maxmn, &
+                     maxwrk, minmn, minwrk, mm, mnthr, nlvl, nwork, smlsiz, wlalsd
+           real(qp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: real,int,log,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           mnthr = stdlib_ilaenv( 6, 'DGELSD', ' ', m, n, nrhs, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           smlsiz = stdlib_ilaenv( 9, 'DGELSD', ' ', 0, 0, 0, 0 )
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           minwrk = 1
+           liwork = 1
+           minmn = max( 1, minmn )
+           nlvl = max( int( log( real( minmn,KIND=qp) / real( smlsiz+1,KIND=qp) ) /log( two ),&
+                     KIND=ilp) + 1, 0 )
+           if( info==0 ) then
+              maxwrk = 0
+              liwork = 3*minmn*nlvl + 11*minmn
+              mm = m
+              if( m>=n .and. m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns.
+                 mm = n
+                 maxwrk = max( maxwrk, n+n*stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n,-1, -1 ) )
+                           
+                 maxwrk = max( maxwrk, n+nrhs*stdlib_ilaenv( 1, 'DORMQR', 'LT', m, nrhs, n, -1 ) )
+                           
+              end if
+              if( m>=n ) then
+                 ! path 1 - overdetermined or exactly determined.
+                 maxwrk = max( maxwrk, 3*n+( mm+n )*stdlib_ilaenv( 1, 'DGEBRD', ' ', mm, n, -1, -&
+                           1 ) )
+                 maxwrk = max( maxwrk, 3*n+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', mm, nrhs, n, -&
+                           1 ) )
+                 maxwrk = max( maxwrk, 3*n+( n-1 )*stdlib_ilaenv( 1, 'DORMBR', 'PLN', n, nrhs, n, &
+                           -1 ) )
+                 wlalsd = 9*n+2*n*smlsiz+8*n*nlvl+n*nrhs+(smlsiz+1)**2
+                 maxwrk = max( maxwrk, 3*n+wlalsd )
+                 minwrk = max( 3*n+mm, 3*n+nrhs, 3*n+wlalsd )
+              end if
+              if( n>m ) then
+                 wlalsd = 9*m+2*m*smlsiz+8*m*nlvl+m*nrhs+(smlsiz+1)**2
+                 if( n>=mnthr ) then
+                    ! path 2a - underdetermined, with many more columns
+                    ! than rows.
+                    maxwrk = m + m*stdlib_ilaenv( 1, 'DGELQF', ' ', m, n, -1, -1 )
+                    maxwrk = max( maxwrk, m*m+4*m+2*m*stdlib_ilaenv( 1, 'DGEBRD', ' ', m, m, -1, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, m*m+4*m+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', m, nrhs,&
+                               m, -1 ) )
+                    maxwrk = max( maxwrk, m*m+4*m+( m-1 )*stdlib_ilaenv( 1, 'DORMBR', 'PLN', m, &
+                              nrhs, m, -1 ) )
+                    if( nrhs>1 ) then
+                       maxwrk = max( maxwrk, m*m+m+m*nrhs )
+                    else
+                       maxwrk = max( maxwrk, m*m+2*m )
+                    end if
+                    maxwrk = max( maxwrk, m+nrhs*stdlib_ilaenv( 1, 'DORMLQ', 'LT', n, nrhs, m, -1 &
+                              ) )
+                    maxwrk = max( maxwrk, m*m+4*m+wlalsd )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                    maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                 else
+                    ! path 2 - remaining underdetermined cases.
+                    maxwrk = 3*m + ( n+m )*stdlib_ilaenv( 1, 'DGEBRD', ' ', m, n,-1, -1 )
+                    maxwrk = max( maxwrk, 3*m+nrhs*stdlib_ilaenv( 1, 'DORMBR', 'QLT', m, nrhs, n, &
+                              -1 ) )
+                    maxwrk = max( maxwrk, 3*m+m*stdlib_ilaenv( 1, 'DORMBR', 'PLN', n, nrhs, m, -1 &
+                              ) )
+                    maxwrk = max( maxwrk, 3*m+wlalsd )
+                 end if
+                 minwrk = max( 3*m+nrhs, 3*m+m, 3*m+wlalsd )
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSD', -info )
+              return
+           else if( lquery ) then
+              go to 10
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_qlamch( 'P' )
+           sfmin = stdlib_qlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_qlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_qlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_qlaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_qlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_qlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_qlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure certain entries of b are zero.
+           if( m<n )call stdlib_qlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns.
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_qormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_qlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                 end if
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              nwork = itaup + n
+              ! bidiagonalize r in a.
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_qgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_qlalsd( 'U', smlsiz, n, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_qormbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m, wlalsd ) ) &
+                     then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs, 4*m+m*lda+&
+                        wlalsd ) )ldwork = lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_qgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_qlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_qlaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize l in work(il).
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_qgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_qlalsd( 'U', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_qormbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_qlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_qormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize a.
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_qlalsd( 'L', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_qormbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           return
+     end subroutine stdlib_qgelsd
+
+     !> DGELSS: computes the minimum norm solution to a real linear least
+     !> squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_qgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bdspac, bl, chunk, i, iascl, ibscl, ie, il, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_qgeqrf, lwork_qormqr, lwork_qgebrd, lwork_qormbr, lwork_qorgbr, &
+                     lwork_qormlq, lwork_qgelqf
+           real(qp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'DGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_qgeqrf
+                    call stdlib_qgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_qgeqrf=dum(1)
+                    ! compute space needed for stdlib_qormqr
+                    call stdlib_qormqr( 'L', 'T', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_qormqr=dum(1)
+                    mm = n
+                    maxwrk = max( maxwrk, n + lwork_qgeqrf )
+                    maxwrk = max( maxwrk, n + lwork_qormqr )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute workspace needed for stdlib_qbdsqr
+                    bdspac = max( 1, 5*n )
+                    ! compute space needed for stdlib_qgebrd
+                    call stdlib_qgebrd( mm, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, info &
+                              )
+                    lwork_qgebrd=dum(1)
+                    ! compute space needed for stdlib_qormbr
+                    call stdlib_qormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_qormbr=dum(1)
+                    ! compute space needed for stdlib_qorgbr
+                    call stdlib_qorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_qorgbr=dum(1)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 3*n + lwork_qgebrd )
+                    maxwrk = max( maxwrk, 3*n + lwork_qormbr )
+                    maxwrk = max( maxwrk, 3*n + lwork_qorgbr )
+                    maxwrk = max( maxwrk, bdspac )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = max( 3*n + mm, 3*n + nrhs, bdspac )
+                    maxwrk = max( minwrk, maxwrk )
+                 end if
+                 if( n>m ) then
+                    ! compute workspace needed for stdlib_qbdsqr
+                    bdspac = max( 1, 5*m )
+                    minwrk = max( 3*m+nrhs, 3*m+n, bdspac )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_qgelqf
+                       call stdlib_qgelqf( m, n, a, lda, dum(1), dum(1),-1, info )
+                       lwork_qgelqf=dum(1)
+                       ! compute space needed for stdlib_qgebrd
+                       call stdlib_qgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_qgebrd=dum(1)
+                       ! compute space needed for stdlib_qormbr
+                       call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_qormbr=dum(1)
+                       ! compute space needed for stdlib_qorgbr
+                       call stdlib_qorgbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_qorgbr=dum(1)
+                       ! compute space needed for stdlib_qormlq
+                       call stdlib_qormlq( 'L', 'T', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_qormlq=dum(1)
+                       ! compute total workspace needed
+                       maxwrk = m + lwork_qgelqf
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_qgebrd )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_qormbr )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_qorgbr )
+                       maxwrk = max( maxwrk, m*m + m + bdspac )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_qormlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_qgebrd
+                       call stdlib_qgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_qgebrd=dum(1)
+                       ! compute space needed for stdlib_qormbr
+                       call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_qormbr=dum(1)
+                       ! compute space needed for stdlib_qorgbr
+                       call stdlib_qorgbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_qorgbr=dum(1)
+                       maxwrk = 3*m + lwork_qgebrd
+                       maxwrk = max( maxwrk, 3*m + lwork_qormbr )
+                       maxwrk = max( maxwrk, 3*m + lwork_qorgbr )
+                       maxwrk = max( maxwrk, bdspac )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_qlamch( 'P' )
+           sfmin = stdlib_qlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_qlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_qlaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_qormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_qlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_qgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+              call stdlib_qorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (workspace: need bdspac)
+              call stdlib_qbdsqr( 'U', n, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_qrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_qlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_qlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_qgemm( 'T', 'N', n, bl, n, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_qlacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_qgemv( 'T', n, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_qcopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_qgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_qlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_qlaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_qgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (workspace: need m*m+5*m-1, prefer m*m+4*m+(m-1)*nb)
+              call stdlib_qorgbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of l in work(il) and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need m*m+m+bdspac)
+              call stdlib_qbdsqr( 'U', m, m, 0, nrhs, s, work( ie ), work( il ),ldwork, a, lda, b,&
+                         ldb, work( iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_qrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_qlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = ie
+              ! multiply b by right singular vectors of l in work(il)
+              ! (workspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_qgemm( 'T', 'N', m, nrhs, m, one, work( il ), ldwork,b, ldb, zero, &
+                           work( iwork ), ldb )
+                 call stdlib_qlacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_qgemm( 'T', 'N', m, bl, m, one, work( il ), ldwork,b( 1, i ), ldb,&
+                               zero, work( iwork ), m )
+                    call stdlib_qlacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_qgemv( 'T', m, m, one, work( il ), ldwork, b( 1, 1 ),1, zero, work( &
+                           iwork ), 1 )
+                 call stdlib_qcopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_qlaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_qormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_qormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (workspace: need 4*m, prefer 3*m+m*nb)
+              call stdlib_qorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need bdspac)
+              call stdlib_qbdsqr( 'L', m, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_qrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_qlaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_qgemm( 'T', 'N', n, nrhs, m, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_qlacpy( 'F', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_qgemm( 'T', 'N', n, bl, m, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_qlacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_qgemv( 'T', m, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_qcopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgelss
+
+     !> DGELSY: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by orthogonal transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**T [ inv(T11)*Q1**T*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+
+     subroutine stdlib_qgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkmin, lwkopt, mn, nb, nb1, nb2, &
+                     nb3, nb4
+           real(qp) :: anrm, bignum, bnrm, c1, c2, s1, s2, smax, smaxpr, smin, sminpr, smlnum, &
+                     wsize
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           end if
+           ! figure out optimal block size
+           if( info==0 ) then
+              if( mn==0 .or. nrhs==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', m, n, nrhs, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, nrhs, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = mn + max( 2*mn, n + 1, mn + nrhs )
+                 lwkopt = max( lwkmin,mn + 2*n + nb*( n + 1 ), 2*mn + nb*nrhs )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( mn==0 .or. nrhs==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_qlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_qlange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_qgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, info )
+                     
+           wsize = mn + work( mn+1 )
+           ! workspace: mn+2*n+nb*(n+1).
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = one
+           work( ismax ) = one
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_qlaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_qlaic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_qlaic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_qtzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+           call stdlib_qormqr( 'LEFT', 'TRANSPOSE', m, nrhs, mn, a, lda, work( 1 ),b, ldb, work( &
+                     2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+work( 2*mn+1 ) )
+           ! workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_qtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, one, a, lda,&
+                      b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = zero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**t * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_qormrz( 'LEFT', 'TRANSPOSE', n, nrhs, rank, n-rank, a,lda, work( mn+1 ),&
+                         b, ldb, work( 2*mn+1 ),lwork-2*mn, info )
+           end if
+           ! workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_qcopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qgelsy
+
+     !> DGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide LQ
+     !> factorization (DGELQ)
+
+     pure subroutine stdlib_qgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), t(*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_qgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_qlamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_qgemlq
+
+     !> DGEMLQT: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by DGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_qgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           real(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_qlarfb( 'L', 'T', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_qlarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_qlarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_qlarfb( 'R', 'T', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_qgemlqt
+
+     !> DGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (DGEQR)
+
+     pure subroutine stdlib_qgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), t(*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_qgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_qlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_qgemqr
+
+     !> DGEMQRT: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by DGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_qgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           real(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_qlarfb( 'L', 'T', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_qlarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_qlarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_qlarfb( 'R', 'T', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_qgemqrt
+
+     !> DGEQL2: computes a QL factorization of a real m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_qgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              call stdlib_qlarfg( m-k+i, a( m-k+i, n-k+i ), a( 1, n-k+i ), 1,tau( i ) )
+              ! apply h(i) to a(1:m-k+i,1:n-k+i-1) from the left
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_qlarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1, tau( i ),a, lda, work )
+                        
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_qgeql2
+
+     !> DGEQLF: computes a QL factorization of a real M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_qgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_qgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_qlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_qlarfb( 'LEFT', 'TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-1, &
+                    n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_qgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgeqlf
+
+     !> DGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_qgeqp3( m, n, a, lda, jpvt, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = 3*n + 1
+                 nb = stdlib_ilaenv( inb, 'DGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = 2*n + ( n + 1 )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_qswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_qgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_qgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_qorm2r( 'left', 'transpose', m, n-na, na, a, lda,
+      ! cc  $                   tau, a( 1, na+1 ), lda, work, info )
+                 call stdlib_qormqr( 'LEFT', 'TRANSPOSE', m, n-na, na, a, lda, tau,a( 1, na+1 ), &
+                           lda, work, lwork, info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'DGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'DGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = 2*sn + ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = ( lwork-2*sn ) / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'DGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 work( j ) = stdlib_qnrm2( sm, a( nfxd+1, j ), 1 )
+                 work( n+j ) = work( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_qlaqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , work( j ), work( n+j ),work( 2*n+1 ), work( 2*n+jb+1 ), n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_qlaqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         work( j ), work( n+j ),work( 2*n+1 ) )
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgeqp3
+
+     !> DGEQR: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_qgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'DGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'DGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_qgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_qlatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_qgeqr
+
+     !> DGEQR2: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_qgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_qlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_qlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_qgeqr2
+
+     !> DGEQR2P: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_qgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_qlarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_qlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_qgeqr2p
+
+     !> DGEQRF: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_qgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_qgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_qlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_qlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_qgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgeqrf
+
+     !> DGEQR2P computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_qgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_qgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_qlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_qlarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_qgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgeqrfp
+
+     !> DGEQRT: computes a blocked QR factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_qgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_qgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_qgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the left
+                 call stdlib_qlarfb( 'L', 'T', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_qgeqrt
+
+     !> DGEQRT2: computes a QR factorization of a real M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_qgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_qlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 ! w(1:n-i) := a(i:m,i+1:n)^h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_qgemv( 'T',m-i+1, n-i, one, a( i, i+1 ), lda,a( i, i ), 1, zero, t( &
+                           1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 call stdlib_qger( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), lda &
+                           )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = one
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**t * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_qgemv( 'T', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, zero, t( 1, &
+                        i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_qtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = zero
+           end do
+     end subroutine stdlib_qgeqrt2
+
+     !> DGEQRT3: recursively computes a QR factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_qgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_qlarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_qgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1^h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'L', 'T', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              call stdlib_qgemm( 'T', 'N', n1, n2, m-n1, one, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt)
+              call stdlib_qtrmm( 'L', 'U', 'T', 'N', n1, n2, one,t, ldt, t( 1, j1 ), ldt )
+              call stdlib_qgemm( 'N', 'N', m-n1, n2, n1, -one, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        one, a( j1, j1 ), lda )
+              call stdlib_qtrmm( 'L', 'L', 'N', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_qgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1^h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = (a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'L', 'N', 'U', n1, n2, one,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_qgemm( 'T', 'N', n1, n2, m-n, one, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt )
+              call stdlib_qtrmm( 'L', 'U', 'N', 'N', n1, n2, -one, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', n1, n2, one,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_qgeqrt3
+
+     !> DGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_qgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qgemv( trans, n, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = abs( x( k, j ) )
+                    do i = 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qgetrs( transt, n, 1, af, ldaf, ipiv, work( n+1 ),n, info )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qgerfs
+
+     !> DGERQ2: computes an RQ factorization of a real m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_qgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_qlarfg( n-k+i, a( m-k+i, n-k+i ), a( m-k+i, 1 ), lda,tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_qlarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_qgerq2
+
+     !> DGERQF: computes an RQ factorization of a real M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_qgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_qgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_qlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_qlarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_qgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgerqf
+
+     !> DGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by DGETC2.
+
+     pure subroutine stdlib_qgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, eps, smlnum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+            ! set constant to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_qlaswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_iqamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = ( one / two ) / abs( rhs( i ) )
+              call stdlib_qscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*temp
+           end if
+           do i = n, 1, -1
+              temp = one / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_qlaswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_qgesc2
+
+     !> DGESDD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and right singular
+     !> vectors.  If singular vectors are desired, it uses a
+     !> divide-and-conquer algorithm.
+     !> The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**T, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
+                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
+           integer(ilp) :: lwork_qgebrd_mn, lwork_qgebrd_mm, lwork_qgebrd_nn, lwork_qgelqf_mn, &
+           lwork_qgeqrf_mn, lwork_qorgbr_p_mm, lwork_qorgbr_q_nn, lwork_qorglq_mn, &
+           lwork_qorglq_nn, lwork_qorgqr_mm, lwork_qorgqr_mn, lwork_qormbr_prt_mm, &
+           lwork_qormbr_qln_mm, lwork_qormbr_prt_mn, lwork_qormbr_qln_mn, lwork_qormbr_prt_nn, &
+                     lwork_qormbr_qln_nn
+           real(qp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              bdspac = 0
+              mnthr  = int( minmn*11.0_qp / 6.0_qp,KIND=ilp)
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_qbdsdc
+                 if( wntqn ) then
+                    ! stdlib_qbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_qp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*n
+                 else
+                    bdspac = 3*n*n + 4*n
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_qgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_qgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qgebrd( n, n, dum(1), n, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_qgebrd_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qgeqrf( m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_qgeqrf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qorgbr( 'Q', n, n, n, dum(1), n, dum(1), dum(1), -1,ierr )
+                 lwork_qorgbr_q_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qorgqr( m, m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_qorgqr_mm = int( dum(1),KIND=ilp)
+                 call stdlib_qorgqr( m, n, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_qorgqr_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'P', 'R', 'T', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'Q', 'L', 'N', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_qln_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'Q', 'L', 'N', m, n, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_qln_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'Q', 'L', 'N', m, m, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       wrkbl = n + lwork_qgeqrf_mn
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd_nn )
+                       maxwrk = max( wrkbl, bdspac + n )
+                       minwrk = bdspac + n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_qgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + 2*n*n
+                       minwrk = bdspac + 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_qgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = bdspac + n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_qgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = n*n + max( 3*n + bdspac, n + m )
+                    end if
+                 else
+                    ! path 5 (m >= n, but not much larger)
+                    wrkbl = 3*n + lwork_qgebrd_mn
+                    if( wntqn ) then
+                       ! path 5n (m >= n, jobz='n')
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqo ) then
+                       ! path 5o (m >= n, jobz='o')
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*n + max( m, n*n + bdspac )
+                    else if( wntqs ) then
+                       ! path 5s (m >= n, jobz='s')
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqa ) then
+                       ! path 5a (m >= n, jobz='a')
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_qormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_qbdsdc
+                 if( wntqn ) then
+                    ! stdlib_qbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_qp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*m
+                 else
+                    bdspac = 3*m*m + 4*m
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_qgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_qgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qgebrd( m, m, a, m, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_qgebrd_mm = int( dum(1),KIND=ilp)
+                 call stdlib_qgelqf( m, n, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_qgelqf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_qorglq_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qorglq( m, n, m, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_qorglq_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qorgbr( 'P', m, m, m, a, n, dum(1), dum(1), -1, ierr )
+                 lwork_qorgbr_p_mm = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'P', 'R', 'T', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_prt_mm = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'P', 'R', 'T', m, n, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_prt_mn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'P', 'R', 'T', n, n, m, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_qormbr( 'Q', 'L', 'N', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_qormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       wrkbl = m + lwork_qgelqf_mn
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd_mm )
+                       maxwrk = max( wrkbl, bdspac + m )
+                       minwrk = bdspac + m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_qgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_qorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + 2*m*m
+                       minwrk = bdspac + 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_qgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_qorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = bdspac + m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_qgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_qorglq_nn )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = m*m + max( 3*m + bdspac, m + n )
+                    end if
+                 else
+                    ! path 5t (n > m, but not much larger)
+                    wrkbl = 3*m + lwork_qgebrd_mn
+                    if( wntqn ) then
+                       ! path 5tn (n > m, jobz='n')
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqo ) then
+                       ! path 5to (n > m, jobz='o')
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_mn )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*m + max( n, m*m + bdspac )
+                    else if( wntqs ) then
+                       ! path 5ts (n > m, jobz='s')
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_mn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqa ) then
+                       ! path 5ta (n > m, jobz='a')
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_qormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = stdlib_qroundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = sqrt( stdlib_qlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, dum )
+           if( stdlib_qisnan( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n [tau] + n    [work]
+                    ! workspace: prefer n [tau] + n*nb [work]
+                    call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out below r
+                    call stdlib_qlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_qgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + n
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   n [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz = 'o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is ldwrkr by n
+                    if( lwork >= lda*n + n*n + 3*n + bdspac ) then
+                       ldwrkr = lda
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n - bdspac ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_qlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_qlaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_qgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! work(iu) is n by n
+                    iu = nwork
+                    nwork = iu + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! and vt by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), n, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    ! workspace: prefer m*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    do i = 1, m, ldwrkr
+                       chunk = min( m - i + 1, ldwrkr )
+                       call stdlib_qgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu ), &
+                                 n, zero, work( ir ),ldwrkr )
+                       call stdlib_qlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_qlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_qlaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_qgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagoal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! workspace: need   n*n [r]
+                    call stdlib_qlacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_qgemm( 'N', 'N', m, n, n, one, a, lda, work( ir ),ldwrkr, zero, u,&
+                               ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! workspace: need   n*n [u] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! workspace: need   n*n [u] + n [tau] + m    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce r in a, zeroing out other entries
+                    call stdlib_qlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_qgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! workspace: need   n*n [u]
+                    call stdlib_qgemm( 'N', 'N', m, n, n, one, u, ldu, work( iu ),ldwrku, zero, a,&
+                               lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_qlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 5 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! workspace: need   3*n [e, tauq, taup] + m        [work]
+                 ! workspace: prefer 3*n [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >= n, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5o (m >= n, jobz='o')
+                    iu = nwork
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                       nwork = iu + ldwrku*n
+                       call stdlib_qlaset( 'F', m, n, zero, zero, work( iu ),ldwrku )
+                       ! ir is unused; silence compile warnings
+                       ir = -1
+                    else
+                       ! work( iu ) is n by n
+                       ldwrku = n
+                       nwork = iu + ldwrku*n
+                       ! work(ir) is ldwrkr by n
+                       ir = nwork
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), work( iu ),ldwrku, vt, ldvt, &
+                              dum, idum, work( nwork ),iwork, info )
+                    ! overwrite vt by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! path 5o-fast
+                       ! overwrite work(iu) by left singular vectors of a
+                       ! workspace: need   3*n [e, tauq, taup] + m*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + m*n [u] + n*nb [work]
+                       call stdlib_qormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy left singular vectors of a from work(iu) to a
+                       call stdlib_qlacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 5o-slow
+                       ! generate q in a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                       call stdlib_qorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by left singular vectors of
+                       ! bidiagonal matrix in work(iu), storing result in
+                       ! work(ir) and copying to a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + nb*n [r]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + m*n  [r]
+                       do i = 1, m, ldwrkr
+                          chunk = min( m - i + 1, ldwrkr )
+                          call stdlib_qgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu )&
+                                    , ldwrku, zero,work( ir ), ldwrkr )
+                          call stdlib_qlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_qlaset( 'F', m, n, zero, zero, u, ldu )
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_qlaset( 'F', m, m, zero, zero, u, ldu )
+                    call stdlib_qbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of u to identity matrix
+                    if( m>n ) then
+                       call stdlib_qlaset( 'F', m - n, m - n, zero, one, u(n+1,n+1),ldu )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m [tau] + m [work]
+                    ! workspace: prefer m [tau] + m*nb [work]
+                    call stdlib_qgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out above l
+                    call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_qgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + m
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   m [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ! work(il)  is m by m; it is later resized to m by chunk for gemm
+                    il = ivt + m*m
+                    if( lwork >= m*n + m*m + 3*m + bdspac ) then
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ldwrkl = m
+                       chunk = ( lwork - m*m ) / m
+                    end if
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_qgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_qlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_qlaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_qgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u, and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), m, dum, &
+                              idum, work( nwork ),iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), m,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! workspace: need   m*m [vt] + m*m [l]
+                    ! workspace: prefer m*m [vt] + m*n [l]
+                    ! at this point, l is resized as m by chunk.
+                    do i = 1, n, chunk
+                       blk = min( n - i + 1, chunk )
+                       call stdlib_qgemm( 'N', 'N', m, blk, m, one, work( ivt ), m,a( 1, i ), lda,&
+                                  zero, work( il ), ldwrkl )
+                       call stdlib_qlacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_qgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_qlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_qlaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(iu).
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_qgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of l and vt
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(il) by
+                    ! q in a, storing result in vt
+                    ! workspace: need   m*m [l]
+                    call stdlib_qlacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_qgemm( 'N', 'N', m, n, m, one, work( il ), ldwrkl,a, lda, zero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! workspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    call stdlib_qgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! workspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce l in a, zeroing out other entries
+                    call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_qgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup]+ m    [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup]+ m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! workspace: need   m*m [vt]
+                    call stdlib_qgemm( 'N', 'N', m, n, m, one, work( ivt ), ldwkvt,vt, ldvt, zero,&
+                               a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_qlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 5t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! workspace: need   3*m [e, tauq, taup] + n        [work]
+                 ! workspace: prefer 3*m [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n > m, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_qbdsdc( 'L', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_qlaset( 'F', m, n, zero, zero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                       ! il is unused; silence compile warnings
+                       il = -1
+                    else
+                       ! work( ivt ) is m by m
+                       nwork = ivt + ldwkvt*m
+                       il = nwork
+                       ! work(il) is m by chunk
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + bdspac
+                    call stdlib_qbdsdc( 'L', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! path 5to-fast
+                       ! overwrite work(ivt) by left singular vectors of a
+                       ! workspace: need   3*m [e, tauq, taup] + m*n [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*n [vt] + m*nb [work]
+                       call stdlib_qormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy right singular vectors of a from work(ivt) to a
+                       call stdlib_qlacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 5to-slow
+                       ! generate p**t in a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                       call stdlib_qorgbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by right singular vectors of
+                       ! bidiagonal matrix in work(ivt), storing result in
+                       ! work(il) and copying to a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m*nb [l]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*n  [l]
+                       do i = 1, n, chunk
+                          blk = min( n - i + 1, chunk )
+                          call stdlib_qgemm( 'N', 'N', m, blk, m, one, work( ivt ),ldwkvt, a( 1, &
+                                    i ), lda, zero,work( il ), m )
+                          call stdlib_qlacpy( 'F', m, blk, work( il ), m, a( 1, i ),lda )
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_qlaset( 'F', m, n, zero, zero, vt, ldvt )
+                    call stdlib_qbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_qlaset( 'F', n, n, zero, zero, vt, ldvt )
+                    call stdlib_qbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of vt to identity matrix
+                    if( n>m ) then
+                       call stdlib_qlaset( 'F', n-m, n-m, zero, one, vt(m+1,m+1),ldvt )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + n*nb [work]
+                    call stdlib_qormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_qormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_qroundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_qgesdd
+
+     !> DGESV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_qgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_qgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_qgesv
+
+     !> DGESVD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**T, not V.
+
+     subroutine stdlib_qgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, ir, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_qgeqrf, lwork_qorgqr_n, lwork_qorgqr_m, lwork_qgebrd, &
+                     lwork_qorgbr_p, lwork_qorgbr_q, lwork_qgelqf, lwork_qorglq_n, lwork_qorglq_m
+           real(qp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_qbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'DGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*n
+                 ! compute space needed for stdlib_qgeqrf
+                 call stdlib_qgeqrf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_qgeqrf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qorgqr
+                 call stdlib_qorgqr( m, n, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_qorgqr_n = int( dum(1),KIND=ilp)
+                 call stdlib_qorgqr( m, m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_qorgqr_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qgebrd
+                 call stdlib_qgebrd( n, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_qgebrd = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qorgbr p
+                 call stdlib_qorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_qorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qorgbr q
+                 call stdlib_qorgbr( 'Q', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_qorgbr_q = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_qgeqrf
+                       maxwrk = max( maxwrk, 3*n + lwork_qgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 3*n + lwork_qorgbr_p )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*n, bdspac )
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n + wrkbl, n*n + m*n + n )
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n + wrkbl, n*n + m*n + n )
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_n )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_qgeqrf
+                       wrkbl = max( wrkbl, n + lwork_qorgqr_m )
+                       wrkbl = max( wrkbl, 3*n + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, 3*n + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n + m, bdspac )
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_qgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_qgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*n + lwork_qgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_qorgbr( 'Q', m, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_qorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n + lwork_qorgbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_qorgbr( 'Q', m, m, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_qorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n + lwork_qorgbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                      maxwrk = max( maxwrk, 3*n + lwork_qorgbr_p )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*n + m, bdspac )
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_qbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'DGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*m
+                 ! compute space needed for stdlib_qgelqf
+                 call stdlib_qgelqf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_qgelqf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qorglq
+                 call stdlib_qorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_qorglq_n = int( dum(1),KIND=ilp)
+                 call stdlib_qorglq( m, n, m, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_qorglq_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qgebrd
+                 call stdlib_qgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_qgebrd = int( dum(1),KIND=ilp)
+                  ! compute space needed for stdlib_qorgbr p
+                 call stdlib_qorgbr( 'P', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_qorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_qorgbr q
+                 call stdlib_qorgbr( 'Q', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_qorgbr_q = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_qgelqf
+                       maxwrk = max( maxwrk, 3*m + lwork_qgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 3*m + lwork_qorgbr_q )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*m, bdspac )
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m + wrkbl, m*m + m*n + m )
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m + wrkbl, m*m + m*n + m )
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_m )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_qgelqf
+                       wrkbl = max( wrkbl, m + lwork_qorglq_n )
+                       wrkbl = max( wrkbl, 3*m + lwork_qgebrd )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_p )
+                       wrkbl = max( wrkbl, 3*m + lwork_qorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m + n, bdspac )
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_qgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_qgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*m + lwork_qgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_qorgbr p
+                      call stdlib_qorgbr( 'P', m, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_qorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m + lwork_qorgbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_qorgbr( 'P', n, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_qorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m + lwork_qorgbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 3*m + lwork_qorgbr_q )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*m + n, bdspac )
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = sqrt( stdlib_qlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (workspace: need 2*n, prefer n + n*nb)
+                    call stdlib_qgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_qlaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                    call stdlib_qgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                       call stdlib_qorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    iwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'U', n, ncvt, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                              work( iwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_qlacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + n ) + lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + n ) + n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_qlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_qlaset( 'L', n-1, n-1, zero, zero, work( ir+1 ),ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                       call stdlib_qgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                       call stdlib_qorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (workspace: need n*n + bdspac)
+                       call stdlib_qbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum, 1,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n + 2*n, prefer n*n + m*n + n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_qgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_qlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (workspace: need 3*n + m, prefer 3*n + (m + n)*nb)
+                       call stdlib_qgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (workspace: need 4*n, prefer 3*n + n*nb)
+                       call stdlib_qorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_qbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum, 1,a, lda, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + n ) + lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + n ) + n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_qlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_qlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                       call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                       call stdlib_qgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_qlacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                       call stdlib_qorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need n*n + 4*n-1, prefer n*n + 3*n + (n-1)*nb)
+                       call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (workspace: need n*n + bdspac)
+                       call stdlib_qbdsqr( 'U', n, n, n, 0, s, work( ie ), vt, ldvt,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n + 2*n, prefer n*n + m*n + n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_qgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_qlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need 2*n, prefer n + n*nb)
+                       call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_qlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_qlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need 2*n, prefer n + n*nb)
+                       call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                       call stdlib_qgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                       call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                       call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (workspace: need bdspac)
+                       call stdlib_qbdsqr( 'U', n, n, m, 0, s, work( ie ), vt, ldvt,a, lda, dum, &
+                                 1, work( iwork ), info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, a, lda,work( ir ), ldwrkr, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_qlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n + 4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_qgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n + 4*n, prefer 2*n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n + 4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (workspace: need n*n)
+                          call stdlib_qlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_qlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n + 4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_qlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (workspace: need n*n + n + m, prefer n*n + n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, u, ldu,work( ir ), ldwrkr, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_qlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_qlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n*n + 2*n, prefer 2*n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n*n + n + m, prefer 2*n*n + n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n + 4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_qgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n + 4*n, prefer 2*n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n + 4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_qlacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_qlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_qlaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n + 2*n, prefer n*n + n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n*n + n + m, prefer n*n + n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n + 4*n, prefer n*n + 3*n + n*nb)
+                          call stdlib_qorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n + 4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n + bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_qgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_qlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n + n*nb)
+                          call stdlib_qgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n + m, prefer n + m*nb)
+                          call stdlib_qorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_qlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_qlaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n + 2*n*nb)
+                          call stdlib_qgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n + m, prefer 3*n + m*nb)
+                          call stdlib_qormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                          call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (workspace: need 3*n + m, prefer 3*n + (m + n)*nb)
+                 call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 3*n + ncu, prefer 3*n + ncu*nb)
+                    call stdlib_qlacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_qorgbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                    call stdlib_qlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_qorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n, prefer 3*n + n*nb)
+                    call stdlib_qorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n-1, prefer 3*n + (n-1)*nb)
+                    call stdlib_qorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (workspace: need 2*m, prefer m + m*nb)
+                    call stdlib_qgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                    call stdlib_qgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_qorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    iwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'U', m, 0, nru, 0, s, work( ie ), dum, 1, a,lda, dum, 1, &
+                              work( iwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_qlacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + m ) + lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + m ) + m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_qlacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                       call stdlib_qgebrd( m, m, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (workspace: need m*m + 4*m-1, prefer m*m + 3*m + (m-1)*nb)
+                       call stdlib_qorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m + bdspac)
+                       call stdlib_qbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, dum,&
+                                  1, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m*n + m)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_qgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_qlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (workspace: need 3*m + n, prefer 3*m + (m + n)*nb)
+                       call stdlib_qgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_qorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_qbdsqr( 'L', m, n, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n + m ) + lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n + m ) + m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_qlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_qlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                       call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                       call stdlib_qgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_qlacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (workspace: need m*m + 4*m-1, prefer m*m + 3*m + (m-1)*nb)
+                       call stdlib_qorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                       call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m + bdspac)
+                       call stdlib_qbdsqr( 'U', m, m, m, 0, s, work( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m + 2*m, prefer m*m + m*n + m))
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_qgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_qlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need 2*m, prefer m + m*nb)
+                       call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_qlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_qlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need 2*m, prefer m + m*nb)
+                       call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                       call stdlib_qgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                       call stdlib_qormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need 4*m, prefer 3*m + m*nb)
+                       call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_qbdsqr( 'U', m, n, m, 0, s, work( ie ), a, lda,u, ldu, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + (m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m + 4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_qgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m + 4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m + 4*m, prefer 2*m*m + 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (workspace: need m*m)
+                          call stdlib_qlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, compute left
+                          ! singular vectors of a in a and compute right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m + 4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in vt
+                          ! (workspace: need m*m + m + n, prefer m*m + m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need m*m + 4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_qlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda + m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m*m + 2*m, prefer 2*m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m*m + m + n, prefer 2*m*m + m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m + 4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_qgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m + 4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m + 4*m, prefer 2*m*m + 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_qlacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_qlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n + m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m + 2*m, prefer m*m + m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m*m + m + n, prefer m*m + m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_qlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + (m-1)*nb)
+                          call stdlib_qorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m + 4*m, prefer m*m + 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m + bdspac)
+                          call stdlib_qbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_qgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_qlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m + m*nb)
+                          call stdlib_qgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m + n, prefer m + n*nb)
+                          call stdlib_qorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_qlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_qlaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m + 2*m*nb)
+                          call stdlib_qgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m + n, prefer 3*m + n*nb)
+                          call stdlib_qormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m + m*nb)
+                          call stdlib_qorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_qbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (workspace: need 3*m + n, prefer 3*m + (m + n)*nb)
+                 call stdlib_qgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 4*m-1, prefer 3*m + (m-1)*nb)
+                    call stdlib_qlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_qorgbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 3*m + nrvt, prefer 3*m + nrvt*nb)
+                    call stdlib_qlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_qorgbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m-1, prefer 3*m + (m-1)*nb)
+                    call stdlib_qorgbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m, prefer 3*m + m*nb)
+                    call stdlib_qorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_qbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           end if
+           ! if stdlib_qbdsqr failed to converge, copy unconverged superdiagonals
+           ! to work( 2:minmn )
+           if( info/=0 ) then
+              if( ie>2 ) then
+                 do i = 1, minmn - 1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+              if( ie<2 ) then
+                 do i = minmn - 1, 1, -1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+              if( anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgesvd
+
+     !> DGESVDQ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_qgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,work, lwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lwork
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: u(ldu,*), v(ldv,*), work(*)
+           real(qp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, iwoff, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_qgelqf, lwrk_qgesvd, lwrk_qgesvd2, lwrk_qgeqp3, &
+           lwrk_qgeqrf, lwrk_qormlq, lwrk_qormqr, lwrk_qormqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lworq, lworq2, lworlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(qp) :: big, epsln, rtmp, sconda, sfmin
+           ! Local Arrays
+           real(qp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              if ( conda ) then
+                 iminwrk = max( 1, n + m - 1 + n )
+              else
+                 iminwrk = max( 1, n + m - 1 )
+              end if
+              rminwrk = max( 2, m )
+           else
+              if ( conda ) then
+                 iminwrk = max( 1, n + n )
+              else
+                 iminwrk = max( 1, n )
+              end if
+              rminwrk = 2
+           end if
+           lquery = (liwork == -1 .or. lwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Dgeqp3 Of An M X N Matrix
+              lwqp3 = 3 * n + 1
+              ! Minimal Workspace Length For Stdlib_Dormqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lworq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lworq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Dpocon Of An N X N Matrix
+              lwcon = 3 * n
+              ! Stdlib_Dgesvd Of An N X N Matrix
+              lwsvd = max( 5 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_qgeqp3( m, n, a, lda, iwork, rdummy, rdummy, -1,ierr )
+                  lwrk_qgeqp3 = int( rdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_qormqr( 'L', 'N', m, n, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_qormqr = int( rdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_qormqr( 'L', 'N', m, m, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_qormqr = int( rdummy(1),KIND=ilp)
+                  else
+                      lwrk_qormqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec )) then
+                  ! Minimal And Optimal Sizes Of The Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_qgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                ierr )
+                      lwrk_qgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_qgeqp3, n+lwcon, lwrk_qgesvd )
+                      else
+                         optwrk = max( n+lwrk_qgeqp3, lwrk_qgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lworq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lworq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_qgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     else
+                        call stdlib_qgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     end if
+                     lwrk_qgesvd = int( rdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_qgeqp3, lwcon, lwrk_qgesvd,lwrk_qormqr )
+                     else
+                         optwrk = n + max( lwrk_qgeqp3, lwrk_qgesvd,lwrk_qormqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_qgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      else
+                          call stdlib_qgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      end if
+                      lwrk_qgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_qgeqp3, lwcon, lwrk_qgesvd )
+                      else
+                          optwrk = n + max( lwrk_qgeqp3, lwrk_qgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_qgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_qgesvd
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lworq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lworq2, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_qgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lworlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lworlq, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_qgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_qgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_qgeqp3,lwrk_qgesvd,lwrk_qormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_qgeqrf(n,n/2,u,ldu,rdummy,rdummy,-1,ierr)
+                            lwrk_qgeqrf = int( rdummy(1),KIND=ilp)
+                            call stdlib_qgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, rdummy,&
+                                       -1, ierr )
+                            lwrk_qgesvd2 = int( rdummy(1),KIND=ilp)
+                            call stdlib_qormqr( 'R', 'C', n, n, n/2, u, ldu, rdummy,v, ldv, &
+                                      rdummy, -1, ierr )
+                            lwrk_qormqr2 = int( rdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_qgeqp3, n/2+lwrk_qgeqrf,n/2+lwrk_qgesvd2, n/2+&
+                                      lwrk_qormqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_qgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_qgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_qgeqp3,lwrk_qgesvd,lwrk_qormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_qgelqf(n/2,n,u,ldu,rdummy,rdummy,-1,ierr)
+                           lwrk_qgelqf = int( rdummy(1),KIND=ilp)
+                           call stdlib_qgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, rdummy,&
+                                      -1, ierr )
+                           lwrk_qgesvd2 = int( rdummy(1),KIND=ilp)
+                           call stdlib_qormlq( 'R', 'N', n, n, n/2, u, ldu, rdummy,v, ldv, rdummy,&
+                                     -1,ierr )
+                           lwrk_qormlq = int( rdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_qgeqp3, n/2+lwrk_qgelqf,n/2+lwrk_qgesvd2, n/2+&
+                                     lwrk_qormlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               work(1) = optwrk
+               work(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_qlamch('O')
+           ascaled = .false.
+           iwoff = 1
+           if ( rowprm ) then
+                 iwoff = m
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_izamax(n,a(p,1),lda)) )
+                     ! [[stdlib_qlange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_qlange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = -8
+                         call stdlib_xerbla( 'DGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_iqamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_qlaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_qlaset('G', m, n, zero, one, u, ldu)
+                    if ( wntua ) call stdlib_qlaset('G', m, m, zero, one, u, ldu)
+                    if ( wntva ) call stdlib_qlaset('G', n, n, zero, one, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_qlaset( 'G', n, 1, zero, zero, work, n )
+                        call stdlib_qlaset( 'G', m, n, zero,  one, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=qp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_qlascl('G',0,0,sqrt(real(m,KIND=qp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_qlaswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_qlange( 'M', m, n, a, lda, rdummy )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = -8
+                    call stdlib_xerbla( 'DGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=qp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_qlascl('G',0,0, sqrt(real(m,KIND=qp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_qgeqp3( m, n, a, lda, iwork, work, work(n+1), lwork-n,ierr )
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_qlamch('E')
+           sfmin = stdlib_qlamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=qp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_qlamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_qlacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_qnrm2( p, v(1,p), 1 )
+                       call stdlib_qscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_qpocon( 'U', nr, v, ldv, one, rtmp,work, iwork(n+iwoff), ierr &
+                                  )
+                    else
+                        call stdlib_qpocon( 'U', nr, v, ldv, one, rtmp,work(n+1), iwork(n+iwoff), &
+                                  ierr )
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**t = [a](1:nr,1:n)**t
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**t and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    do q = p + 1, n
+                       a(q,p) = a(p,q)
+                       if ( q <= nr ) a(p,q) = zero
+                    end do
+                 end do
+                 call stdlib_qgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_qlaset( 'L', nr-1,nr-1, zero,zero, a(2,1), lda )
+                 call stdlib_qgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_qgesvd to r**t
+                  ! .. copy r**t into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_qlaset( 'U', nr-1,nr-1, zero,zero, u(1,2), ldu )
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_qgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Dgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_qlacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_qlaset( 'L', nr-1, nr-1, zero, zero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_qgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_qlaset('A', m-nr, nr, zero, zero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_qlaset( 'A',nr,n1-nr,zero,zero,u(1,nr+1), ldu )
+                     call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_qormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_qgesvd to r**t
+                  ! .. copy r**t into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = (a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_qlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_qgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_qlapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_qlaset('G', n, n-nr, zero, zero, v(1,nr+1), ldv)
+                     call stdlib_qgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                               lwork-n, info )
+                     do p = 1, n
+                        do q = p + 1, n
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                     end do
+                     call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Dgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_qlacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_qlaset( 'L', nr-1, nr-1, zero, zero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_qgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_qlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_qlaset('G', n-nr, n, zero,zero, v(nr+1,1), ldv)
+                      call stdlib_qgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                lwork-n, info )
+                      call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the transposed matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_qgesvd to r**t [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_qlaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite [v], the nr right
+                 ! singular vectors of r**t stored in [u](1:nr,1:nr) as transposed
+                    call stdlib_qgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       do q = p + 1, nr
+                          rtmp   = v(q,p)
+                          v(q,p) = v(p,q)
+                          v(p,q) = rtmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_qlapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_qlaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_qlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'dgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_qlaset('U',nr-1,nr-1, zero,zero, v(1,2),ldv)
+                                  
+                        call stdlib_qlaset('A',n,n-nr,zero,zero,v(1,nr+1),ldv)
+                        call stdlib_qgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                                  lwork-n, info )
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = v(q,p)
+                              v(q,p) = v(p,q)
+                              v(p,q) = rtmp
+                           end do
+                        end do
+                        call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = u(q,p)
+                              u(q,p) = u(p,q)
+                              u(p,q) = rtmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_qlaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_qlaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_qlaset('A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**t into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_qlaset('U',nr-1,nr-1,zero,zero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_qgeqrf( n, nr, u(1,nr+1), ldu, work(n+1),work(n+nr+1), lwork-&
+                                  n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = u(p,nr+q)
+                            end do
+                        end do
+                       call stdlib_qlaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_qgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, work(n+nr+1)&
+                                 ,lwork-n-nr, info )
+                       call stdlib_qlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_qlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_qlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_qormqr('R','C', n, n, nr, u(1,nr+1), ldu,work(n+1),v,ldv,work(&
+                                 n+nr+1),lwork-n-nr,ierr)
+                       call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_qlaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_qlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_qgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_qlacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_qlaset( 'L', nr-1,nr-1, zero,zero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_qgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_qlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_qlaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_qlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'dgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_qlacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_qlaset('L', nr-1,nr-1, zero,zero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_qlaset('A', n-nr,n, zero,zero, v(nr+1,1),ldv)
+                       call stdlib_qgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                 lwork-n, info )
+                       call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the transposed matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_qlaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_qlaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_qlaset( 'A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_qlacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_qlaset('L',nr-1,nr-1,zero,zero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_qgelqf( nr, n, u(nr+1,1), ldu, work(n+1),work(n+nr+1), lwork-n-&
+                                 nr, ierr )
+                       call stdlib_qlacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_qlaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_qgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, work(n+nr+&
+                                 1), lwork-n-nr, info )
+                       call stdlib_qlaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_qlaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_qlaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_qormlq('R','N',n,n,nr,u(nr+1,1),ldu,work(n+1),v, ldv, work(n+&
+                                 nr+1),lwork-n-nr,ierr)
+                       call stdlib_qlapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_qlaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_qlaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_qlaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**t or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_qormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_qlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_qlaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_qlascl( 'G',0,0, one,sqrt(real(m,KIND=qp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_qgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_qgesvdq
+
+     !> DGESVJ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^t,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+     !> DGESVJ can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_qgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), v(ldv,*), work(lwork)
+           real(qp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           ! Local Scalars 
+           real(qp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           large, mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, &
+                     temp1, theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Local Arrays 
+           real(qp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( work( 1 )<=one ) ) then
+              info = -12
+           else if( lwork<max( m+n, 6 ) ) then
+              info = -13
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVJ', -info )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^t * a(:,j)|/(||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = work( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=qp) )
+              else
+                 ctol = real( m,KIND=qp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_qlamch() works properly on the target machine.)
+           epsln = stdlib_qlamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_qlamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+           big = stdlib_qlamch( 'OVERFLOW' )
+           ! big         = one    / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=qp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=qp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'DGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_qlaset( 'A', mvl, n, zero, one, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl= one / sqrt( real( m,KIND=qp)*real( n,KIND=qp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_qlassq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_qlassq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_qlassq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'DGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl)
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl= one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_qlaset( 'G', m, n, zero, one, a, lda )
+              work( 1 ) = one
+              work( 2 ) = zero
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_qlascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              work( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 work( 2 ) = one
+              else
+                 work( 2 ) = zero
+              end if
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=qp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( aapp*sqrt( real( n,KIND=qp) ) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=qp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_qlascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl= temp1*skl
+           if( skl/=one ) then
+              call stdlib_qlascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl= one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! a is represented in factored form a = a * diag(work), where diag(work)
+           ! is initialized to identity. work is updated during fast scaled
+           ! rotations.
+           do q = 1, n
+              work( q ) = one
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_qgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_qgesvj. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_qgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,work( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, work( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_qgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,work( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_qgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,work( n2+1 ), sva(&
+                  n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                            ierr )
+                 call stdlib_qgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,work( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_qgsvj0( jobv, m, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, work( n+1 ), lwork-n,ierr )
+                 call stdlib_qgsvj1( jobv, m, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, sfmin,&
+                            tol, 1, work( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_qgsvj0( jobv, n4, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, work( n+1 ), lwork-n,ierr )
+                 call stdlib_qgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, work( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, work( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_qgsvj1( jobv, n2, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, work( n+1 ),lwork-n, ierr )
+                 call stdlib_qgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,work( n2+1 ), sva( n2+1 ),&
+                  mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, ierr )
+                            
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_qswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_qswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = work( p )
+                          work( p ) = work( q )
+                          work( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute stdlib_qnrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_qdot(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_qnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_qnrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_qnrm2( m, a(1,p), 1 ) * work(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_qnrm2( m, a( 1, p ), 1 )*work( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_qlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*work( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_qdot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_qdot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq)/aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_qaxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_qaxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_qcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp, one, m,1, work( n+1 ), &
+                                                lda,ierr )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*work( p ) / work( q )
+                                      call stdlib_qaxpy( m, temp1, work( n+1 ), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_qlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qnrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_qlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qnrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_qlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_qdot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_qdot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq)/aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_qaxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_qaxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_qcopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( p ) / work( q )
+                                         call stdlib_qaxpy( m, temp1, work( n+1 ),1, a( 1, q ), 1 &
+                                                   )
+                                         call stdlib_qlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_qcopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( q ) / work( p )
+                                         call stdlib_qaxpy( m, temp1, work( n+1 ),1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_qlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qnrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_qlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qnrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_qlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qnrm2( m, a( 1, n ), 1 )*work( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_qlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*work( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=qp) )*tol ) .and. ( real( n,&
+                        KIND=qp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = work( p )
+                 work( p ) = work( q )
+                 work( q ) = temp1
+                 call stdlib_qswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_qswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n2
+                 call stdlib_qscal( m, work( p ) / sva( p ), a( 1, p ), 1 )
+              end do
+           end if
+           ! scale the product of jacobi rotations (assemble the fast rotations).
+           if( rsvec ) then
+              if( applv ) then
+                 do p = 1, n
+                    call stdlib_qscal( mvl, work( p ), v( 1, p ), 1 )
+                 end do
+              else
+                 do p = 1, n
+                    temp1 = one / stdlib_qnrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_qscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+              end if
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl) ) ).or. ( ( skl<one ) .and. ( sva( max(&
+                      n2, 1 ) ) >( sfmin / skl) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl= one
+           end if
+           work( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           work( 2 ) = real( n4,KIND=qp)
+           ! n4 is the number of computed nonzero singular values of a.
+           work( 3 ) = real( n2,KIND=qp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           work( 4 ) = real( i,KIND=qp)
+           ! i is the index of the last sweep before declaring convergence.
+           work( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           work( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_qgesvj
+
+     !> DGESVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), c(*), r(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_qgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_qlaqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_qlacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_qgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_qlantr( 'M', 'U', 'N', info, info, af, ldaf,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_qlange( 'M', n, info, a, lda, work ) / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_qlange( norm, n, n, a, lda, work )
+           rpvgrw = stdlib_qlantr( 'M', 'U', 'N', n, n, af, ldaf, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_qlange( 'M', n, n, a, lda, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qgecon( norm, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_qgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           work( 1 ) = rpvgrw
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qgesvx
+
+     !> DGETC2: computes an LU factorization with complete pivoting of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is the Level 2 BLAS algorithm.
+
+     pure subroutine stdlib_qgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(qp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = smlnum
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin.
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_qswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_qswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = smin
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_qger( n-i, n-i, -one, a( i+1, i ), 1, a( i, i+1 ), lda,a( i+1, i+1 ), &
+                        lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = smin
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_qgetc2
+
+     !> DGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_qgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_iqamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=zero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_qswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_qscal( m-j, one / a( j, j ), a( j+1, j ), 1 )
+                    else
+                      do i = 1, m-j
+                         a( j+i, j ) = a( j+i, j ) / a( j, j )
+                      end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_qger( m-j, n-j, -one, a( j+1, j ), 1, a( j, j+1 ), lda,a( j+1, j+1 ),&
+                            lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_qgetf2
+
+     !> DGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_qgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_qgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_qgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_qlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_qlaswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qgetrf
+
+     !> DGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_qgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: sfmin, temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for one row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==zero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for one column case
+              ! compute machine safe minimum
+              sfmin = stdlib_qlamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_iqamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=zero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_qscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_qgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_qlaswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_qtrsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_qgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_qgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_qlaswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_qgetrf2
+
+     !> DGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by DGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_qgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_qtrtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_qtrtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = zero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_qgemv( 'NO TRANSPOSE', n, n-j, -one, a( 1, j+1 ),lda, work( &
+                           j+1 ), 1, one, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = zero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           one, a( 1, j+jb ), lda,work( j+jb ), ldwork, one, a( 1, j ), lda )
+                 call stdlib_qtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,one, work( j )&
+                           , ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_qswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qgetri
+
+     !> DGETRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by DGETRF.
+
+     pure subroutine stdlib_qgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_qlaswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,one, a, lda, b, &
+                        ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a**t * x = b.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'UNIT', n, nrhs, one,a, lda, b, &
+                        ldb )
+              ! apply row interchanges to the solution vectors.
+              call stdlib_qlaswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_qgetrs
+
+     !> DGETSLS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_qgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(qp) :: anrm, bignum, bnrm, smlnum, tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'T' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_qgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_qgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_qgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_qgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_qgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_qgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_qgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_qgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=qp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=qp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_qlaset( 'FULL', max( m, n ), nrhs, zero, zero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_qlaset( 'F', maxmn, nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_qlange( 'M', brow, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_qlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_qgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_qgemqr( 'L' , 'T', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_qtrtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_qtrtrs( 'U', 'T', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_qgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_qgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_qtrtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_qgemlq( 'L', 'T', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_qgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_qtrtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_qlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_qlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_qlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_qlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=qp)
+           return
+     end subroutine stdlib_qgetsls
+
+     !> DGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a real M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in DGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of DGEQRT for more details on the format.
+
+     pure subroutine stdlib_qgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,real,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_qlatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_qorgtsqr_row;
+              ! d) diagonal d for stdlib_qorhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=qp) / real( mb1 - n,&
+                           KIND=qp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_qorgtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_qlatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_qcopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_qorgtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_qorhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_qorhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-one ) then
+                 do j = i, n
+                    a( i, j ) = -one * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_qcopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_qgetsqrhrt
+
+     !> DGGBAK: forms the right or left eigenvectors of a real generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> DGGBAL.
+
+     pure subroutine stdlib_qggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: lscale(*), rscale(*)
+           real(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max,int
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_qscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_qscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_40
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_60
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_80
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_100
+                    call stdlib_qswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_qggbak
+
+     !> DGGBAL: balances a pair of general real matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_qggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: lscale(*), rscale(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sclfac = 1.0e+1_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(qp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log10,max,min,sign
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = one
+           lscale( 1 ) = one
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_qswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_qswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_qswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_qswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 tb = b( i, j )
+                 ta = a( i, j )
+                 if( ta==zero )go to 210
+                 ta = log10( abs( ta ) ) / basl
+                 210 continue
+                 if( tb==zero )go to 220
+                 tb = log10( abs( tb ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=qp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_qdot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_qdot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_qscal( nr, beta, work( ilo ), 1 )
+           call stdlib_qscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_qaxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_qaxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==zero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==zero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=qp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==zero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==zero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=qp)*work( j ) + sum
+           end do
+           sum = stdlib_qdot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_qdot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_qaxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_qaxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_qlamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_iqamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_iqamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = int(lscale( i ) + sign( half, lscale( i ) ),KIND=ilp)
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_iqamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_iqamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = int(rscale( i ) + sign( half, rscale( i ) ),KIND=ilp)
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_qscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_qscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_qscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_qscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_qggbal
+
+     !> DGGES: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> DGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_qgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alphar, &
+               alphai, beta, vsl, ldvsl, vsr,ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_q) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, maxwrk, minwrk
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0 )then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+              else
+                 minwrk = 1
+                 maxwrk = 1
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n space for storing balancing factors)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_qggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_qlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_qgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_qhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 50
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: need 4*n+16 )
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_qtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_qggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_qggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_qlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           50 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qgges
+
+     !> DGGES3: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> DGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_qgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alphar, &
+               alphai, beta, vsl, ldvsl,vsr, ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_q) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           else if( lwork<6*n+16 .and. .not.lquery ) then
+              info = -19
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_qgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 6*n+16, 3*n+int( work ( 1 ),KIND=ilp) )
+              call stdlib_qormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_qorgqr( n, n, n, vsl, ldvsl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              call stdlib_qgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              call stdlib_qlaqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                        beta, vsl, ldvsl, vsr, ldvsr,work, -1, 0, ierr )
+              lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_qtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+                 beta, vsl, ldvsl, vsr, ldvsr,sdim, pvsl, pvsr, dif, work, -1, idum, 1,ierr )
+                           
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_qggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_qlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_qgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk,ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_qlaqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 50
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_qtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_qggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_qggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_qlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           50 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qgges3
+
+     !> DGGESX: computes for a pair of N-by-N real nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the real Schur form (S,T), and,
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**T, (VSL) T (VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_qggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, &
+     alphar, alphai, beta, vsl, ldvsl,vsr, ldvsr, rconde, rcondv, work, lwork, iwork,liwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), rconde(2), rcondv(2), vsl(&
+                     ldvsl,*), vsr(ldvsr,*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_q) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, wantsb, &
+                     wantse, wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, ip, iright, &
+                     irows, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -16
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -18
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 6
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -22
+              else if( liwork<liwmin  .and. .not.lquery ) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n for permutation parameters)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_qggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_qlaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_qgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_qhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 60
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           ! (workspace: if ijob >= 1, need max( 8*(n+1), 2*sdim*(n-sdim) )
+                       ! otherwise, need 8*(n+1) )
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              call stdlib_qtgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+              beta, vsl, ldvsl, vsr, ldvsr,sdim, pl, pr, dif, work( iwrk ), lwork-iwrk+1,iwork, &
+                        liwork, ierr )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-22 ) then
+                  ! not enough real workspace
+                 info = -22
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_qggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_qggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) ) then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_qlascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           60 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qggesx
+
+     !> DGGEV: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_qggev( jobvl, jobvr, n, a, lda, b, ldb, alphar, alphai,beta, vl, ldvl, vr, &
+               ldvr, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, maxwrk, minwrk
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              minwrk = max( 1, 8*n )
+              maxwrk = max( 1, n*( 7 +stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 ) ) )
+              maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, 0 ) ) )
+                        
+              if( ilvl ) then
+                 maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, -1 ) ) )
+                           
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_qggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_qlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_qgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_qgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_qhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           ! (workspace: need 6*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_qtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_qggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_qggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qggev
+
+     !> DGGEV3: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_qggev3( jobvl, jobvr, n, a, lda, b, ldb, alphar,alphai, beta, vl, ldvl, vr,&
+                ldvr, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           else if( lwork<max( 1, 8*n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_qgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max(1, 8*n, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_qormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work, -1,ierr )
+              lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_qorgqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              if( ilv ) then
+                 call stdlib_qgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+                 call stdlib_qlaqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              else
+                 call stdlib_qgghd3( 'N', 'N', n, 1, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, work, -&
+                           1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work ( 1 ),KIND=ilp) )
+                 call stdlib_qlaqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work ( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_qggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_qlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_qgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk, ierr )
+           else
+              call stdlib_qgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_qlaqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_qtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_qggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_qggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qggev3
+
+     !> DGGEVX: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_qggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alphar, alphai, &
+     beta, vl, ldvl, vr, ldvr, ilo,ihi, lscale, rscale, abnrm, bbnrm, rconde,rcondv, work, lwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(qp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), lscale(*), rconde(*), rcondv(*)&
+                     , rscale(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, pair, wantsb, wantse, &
+                     wantsn, wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk, mm
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc,'S' ) .or. &
+                     stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) )then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -14
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 if( noscl .and. .not.ilv ) then
+                    minwrk = 2*n
+                 else
+                    minwrk = 6*n
+                 end if
+                 if( wantse .or. wantsb ) then
+                    minwrk = 10*n
+                 end if
+                 if( wantsv .or. wantsb ) then
+                    minwrk = max( minwrk, 2*n*( n + 4 ) + 16 )
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'DGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'DORMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'DORGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -26
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_qlange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_qlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_qlange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_qlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_qggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,work, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_qlange( '1', n, n, a, lda, work( 1 ) )
+           if( ilascl ) then
+              work( 1 ) = abnrm
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, 1, 1, work( 1 ), 1,ierr )
+              abnrm = work( 1 )
+           end if
+           bbnrm = stdlib_qlange( '1', n, n, b, ldb, work( 1 ) )
+           if( ilbscl ) then
+              work( 1 ) = bbnrm
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, work( 1 ), 1,ierr )
+              bbnrm = work( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_qgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_qormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_qlaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_qlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_qorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_qlaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_qgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_qgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_qhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr, work,lwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 130
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! (workspace: stdlib_qtgevc: need 6*n
+                       ! stdlib_qtgsna: need 2*n*(n+2)+16 if sense = 'v' or 'b',
+                               ! need n otherwise )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_qtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work, ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 130
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_qtgevc) and estimate condition
+                 ! numbers (stdlib_qtgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to recalculate
+                 ! eigenvectors and estimate one condition numbers at a time.
+                 pair = .false.
+                 loop_20: do i = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       cycle loop_20
+                    end if
+                    mm = 1
+                    if( i<n ) then
+                       if( a( i+1, i )/=zero ) then
+                          pair = .true.
+                          mm = 2
+                       end if
+                    end if
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    if( mm==1 ) then
+                       bwork( i ) = .true.
+                    else if( mm==2 ) then
+                       bwork( i ) = .true.
+                       bwork( i+1 ) = .true.
+                    end if
+                    iwrk = mm*n + 1
+                    iwrk1 = iwrk + mm*n
+                    ! compute a pair of left and right eigenvectors.
+                    ! (compute workspace: need up to 4*n + 6*n)
+                    if( wantse .or. wantsb ) then
+                       call stdlib_qtgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, mm, m,work( iwrk1 ), ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 130
+                       end if
+                    end if
+                    call stdlib_qtgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), mm, m, work( iwrk1 ),lwork-iwrk1+1, iwork,&
+                               ierr )
+                 end do loop_20
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_qggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_70: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_70
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_70
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                       vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_70
+           end if
+           if( ilvr ) then
+              call stdlib_qggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_120: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_120
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_120
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                       vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_120
+           end if
+           ! undo scaling if necessary
+           130 continue
+           if( ilascl ) then
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_qggevx
+
+     !> DGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_qggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           real(qp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = zero
+              end do
+              do i = 1, p
+                 y(i) = zero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**t*a = ( r11 ) m,    q**t*b*z**t = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_qggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = work( m+np+1 )
+           ! update left-hand-side vector d = q**t*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_qormqr( 'LEFT', 'TRANSPOSE', n, 1, m, a, lda, work, d,max( 1, n ), work( m+&
+                     np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_qtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_qcopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = zero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_qgemv( 'NO TRANSPOSE', m, n-m, -one, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1, &
+                     one, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_qtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_qcopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**t *y
+           call stdlib_qormrq( 'LEFT', 'TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), ldb, work( &
+                     m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_qggglm
+
+     !> DGGHD3: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then DGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of DGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_qgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(qp) :: c, c1, c2, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = real( lwkopt,KIND=qp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_qlaset( 'ALL', n, n, zero, one, q, ldq )
+           if( initz )call stdlib_qlaset( 'ALL', n, n, zero, one, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_qlaset( 'LOWER', n-1, n-1, zero, zero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = one
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'DGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'DGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small orthogonal factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_qlaset( 'ALL', nblst, nblst, zero, one, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_qlaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_qlartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = c
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       c = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = c*temp - s*work( jj )
+                          work( jj ) = s*temp + c*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = c*temp - s*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + c*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_qlartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = zero
+                          call stdlib_qrot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = c
+                          b( jj+1, j ) = -s
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    ! explicit loop unrolling provides better performance
+                    ! compared to stdlib_qlasr.
+                     ! call stdlib_qlasr( 'right', 'variable', 'backward', ihi-top,
+           ! $                     ihi-j, a( j+2, j ), b( j+2, j ),
+           ! $                     a( top+1, j+1 ), lda )
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       c = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + s2*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + s1*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = c*temp1 + s*temp
+                          a( k, j+i ) = -s*temp1 + c*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          call stdlib_qrot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, a( &
+                                    j+1+i, j ),-b( j+1+i, j ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated orthogonal
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_qgemv( 'TRANSPOSE', nblst, len, one, work,nblst, a( jrow, j+1 )&
+                                 , 1, zero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_qtrmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_qgemv( 'TRANSPOSE', len, nblst-len, one,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, one,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated orthogonal
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_qtrmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_qtrmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_qgemv( 'TRANSPOSE', nnb, len, one,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,one, work( pw ), 1 )
+                          call stdlib_qgemv( 'TRANSPOSE', len, nnb, one,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, one,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated orthogonal matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', nblst,cola, nblst, one, work, &
+                           nblst,a( j, jcol+nnb ), lda, zero, work( pw ),nblst )
+                 call stdlib_qlacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_qorm22( 'LEFT', 'TRANSPOSE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, one, &
+                                 work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, zero, work( pw ),2*nnb )
+                       call stdlib_qlacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated orthogonal matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, q( &
+                              topq, j ), ldq,work, nblst, zero, work( pw ), nh )
+                    call stdlib_qlacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_qorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     q( topq, j ), ldq,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_qlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small orthogonal factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_qlaset( 'ALL', nblst, nblst, zero, one, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_qlaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          c = a( i, j )
+                          a( i, j ) = zero
+                          s = b( i, j )
+                          b( i, j ) = zero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             c = a( i, j )
+                             a( i, j ) = zero
+                             s = b( i, j )
+                             b( i, j ) = zero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = c*temp - s*work( jj )
+                                work( jj ) = s*temp + c*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_qlaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_qlaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated orthogonal matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, a( &
+                              1, j ), lda,work, nblst, zero, work( pw ), top )
+                    call stdlib_qlacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_qorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, a( 1, j ), lda,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_qlacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, b( &
+                              1, j ), ldb,work, nblst, zero, work( pw ), top )
+                    call stdlib_qlacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_qorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, b( 1, j ), ldb,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_qlacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated orthogonal matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, z( &
+                              topq, j ), ldz,work, nblst, zero, work( pw ), nh )
+                    call stdlib_qlacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_qorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     z( topq, j ), ldz,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_qlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_qgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = real( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_qgghd3
+
+     !> DGGHRD: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then DGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_qgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(qp) :: c, s, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_qlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = zero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 temp = a( jrow-1, jcol )
+                 call stdlib_qlartg( temp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = zero
+                 call stdlib_qrot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_qrot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_qrot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c, s )
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 temp = b( jrow, jrow )
+                 call stdlib_qlartg( temp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = zero
+                 call stdlib_qrot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_qrot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_qrot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_qgghrd
+
+     !> DGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_qgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           real(qp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**t = (  0  t12 ) p   z**t*a*q**t = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_qggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = work( p+mn+1 )
+           ! update c = z**t *c = ( c1 ) n-p
+                                ! ( c2 ) m+p-n
+           call stdlib_qormqr( 'LEFT', 'TRANSPOSE', m, 1, mn, a, lda, work( p+1 ),c, max( 1, m ), &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_qtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+              call stdlib_qcopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_qgemv( 'NO TRANSPOSE', n-p, p, -one, a( 1, n-p+1 ), lda,d, 1, one, c, 1 &
+                        )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_qtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_qcopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_qgemv( 'NO TRANSPOSE', nr, n-m, -one, a( n-p+1, m+1 ),lda, d( &
+                        nr+1 ), 1, one, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_qtrmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_qaxpy( nr, -one, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**t*x
+           call stdlib_qormrq( 'LEFT', 'TRANSPOSE', n, 1, p, b, ldb, work( 1 ), x,n, work( p+mn+1 &
+                     ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_qgglse
+
+     !> DGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**T*(inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_qggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'DGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'DGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'DORMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_qgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := q**t*b.
+           call stdlib_qormqr( 'LEFT', 'TRANSPOSE', n, p, min( n, m ), a, lda, taua,b, ldb, work, &
+                     lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_qgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_qggqrf
+
+     !> DGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**T
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_qggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'DGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'DORMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_qgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := b*q**t
+           call stdlib_qormrq( 'RIGHT', 'TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 ), 1 ), &
+                     lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_qgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_qggrqf
+
+     !> DGSVJ0: is called from DGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as DGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_qgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(qp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), sva(n), d(n), v(ldv,*)
+           real(qp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(qp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! -#- row-cyclic jacobi svd algorithm with column pivoting -#-
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! -#- row-cyclic pivot strategy with de rijk's pivoting -#-
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_dgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_dgesvj. for sweeps i=1:swband the procedure
+           ! ......
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           swband = 0
+           pskipped = 0
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_qswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_qswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = d( p )
+                          d( p ) = d( q )
+                          d( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! some blas implementations compute stdlib_qnrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_qdot(m,a(1,p),1,a(1,p),1)), which may result in
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_qnrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_qnrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_qnrm2( m, a(1,p), 1 ) * d(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_qnrm2( m, a( 1, p ), 1 )*d( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_qlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*d( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+                 ! rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_qaxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_qaxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_qcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp, one, m,1, work, lda, &
+                                                ierr )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*d( p ) / d( q )
+                                      call stdlib_qaxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_qlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_qlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! -#- m x 2 jacobi svd -#-
+              ! -#- safe gram matrix computation -#-
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_qaxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_qaxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_qcopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_qaxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_qcopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_qaxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_qlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_qlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qnrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_qlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=qp)*tol ) .and.( real( n,KIND=qp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of iterations.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given tolerance, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d.
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_qswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_qswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_qgsvj0
+
+     !> DGSVJ1: is called from DGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as DGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> DGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_qgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), d(n), sva(n), v(ldv,*)
+           real(qp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, large, mxaapq, &
+           mxsinj, rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, &
+                     thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(qp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=qp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           fastr( 1 ) = zero
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_dgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_dgesvj.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = 1, nblc
+                    jgl = n1 + ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! Safe Gram Matrix Computation 
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_qdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_qcopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_qlascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_qdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs(aqoap-apoaq) / aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_qrotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_qrotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_qaxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_qaxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_qaxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_qaxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_qaxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_qaxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_qaxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_qaxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_qaxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_qcopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_qaxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_qcopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_qlascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_qlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_qaxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_qlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qnrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_qlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qnrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_qlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                 ! skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+            ! if ( notrot >= emptsw )  go to 2011
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+      ! **      if ( notrot >= emptsw )  go to 2011
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **   if ( notrot >= emptsw ) go to 1994
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qnrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_qlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=qp)*tol ) .and.( real( n,KIND=qp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of sweeps.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given threshold, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_qswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_qswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_qgsvj1
+
+     !> DGTCON: estimates the reciprocal of the condition number of a real
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> DGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==zero )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_qgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**t)*inv(u**t).
+                 call stdlib_qgttrs( 'TRANSPOSE', n, 1, dl, d, du, du2, ipiv, work,n, info )
+                           
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_qgtcon
+
+     !> DGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_qgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'T'
+           else
+              transn = 'T'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qlagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work( n+1 ), &
+                        n )
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( du( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( dl( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( du( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( dl( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( dl( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( du( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( dl( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( du( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qgttrs( trans, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qgttrs( transt, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qgttrs( transn, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_qgtrfs
+
+     !> DGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an n by n tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T*X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_qgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           if( nrhs==1 ) then
+              loop_10: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end do loop_10
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           else
+              loop_40: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end do loop_40
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           end if
+           ! back solve with the matrix u from the factorization.
+           if( nrhs<=2 ) then
+              j = 1
+              70 continue
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do i = n - 2, 1, -1
+                 b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                           
+              end do
+              if( j<nrhs ) then
+                 j = j + 1
+                 go to 70
+              end if
+           else
+              do j = 1, nrhs
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_qgtsv
+
+     !> DGTSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B or A**T * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_qgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(qp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_qcopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_qcopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_qcopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_qgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_qlangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,iwork, info )
+                     
+           ! compute the solution vectors x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_qgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qgtsvx
+
+     !> DGTTRF: computes an LU factorization of a real tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_qgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: d(*), dl(*), du(*)
+           real(qp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( d( i )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_qgttrf
+
+     !> DGTTRS: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by DGTTRF.
+
+     pure subroutine stdlib_qgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else
+              itrans = 1
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'DGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_qgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_qgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_qgttrs
+
+     !> DGTTS2: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by DGTTRF.
+
+     pure subroutine stdlib_qgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ip, j
+           real(qp) :: temp
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    ip = ipiv( i )
+                    temp = b( i+1-ip+i, j ) - dl( i )*b( ip, j )
+                    b( i, j ) = b( ip, j )
+                    b( i+1, j ) = temp
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                    ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 ! solve u**t*x = b.
+                 j = 1
+                 70 continue
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t*x = b.
+                 do i = n - 1, 1, -1
+                    ip = ipiv( i )
+                    temp = b( i, j ) - dl( i )*b( i+1, j )
+                    b( i, j ) = b( ip, j )
+                    b( ip, j ) = temp
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**t*x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_qgtts2
+
+     !> DHGEQZ: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by DGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from DGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_qhgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alphar, alphai, &
+               beta, q, ldq, z, ldz, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), work(*)
+           real(qp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: safety = 1.0e+2_qp
+          ! $                     safety = one )
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilpivt, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(qp) :: a11, a12, a1i, a1r, a21, a22, a2i, a2r, ad11, ad11l, ad12, ad12l, ad21, &
+           ad21l, ad22, ad22l, ad32l, an, anorm, ascale, atol, b11, b1a, b1i, b1r, b22, b2a, b2i, &
+           b2r, bn, bnorm, bscale, btol, c, c11i, c11r, c12, c21, c22i, c22r, cl, cq, cr, cz, &
+           eshift, s, s1, s1inv, s2, safmax, safmin, scale, sl, sqi, sqr, sr, szi, szr, t1, tau, &
+           temp, temp2, tempi, tempr, u1, u12, u12l, u2, ulp, vs, w11, w12, w21, w22, wabs, wi, &
+                     wr, wr2
+           ! Local Arrays 
+           real(qp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=qp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_qlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           ulp = stdlib_qlamch( 'E' )*stdlib_qlamch( 'B' )
+           anorm = stdlib_qlanhs( 'F', in, h( ilo, ilo ), ldh, work )
+           bnorm = stdlib_qlanhs( 'F', in, t( ilo, ilo ), ldt, work )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 380
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever.
+              ! row operations modify columns whatever:ilastm.
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = zero
+           maxit = 30*( ihi-ilo+1 )
+           loop_360: do jiter = 1, maxit
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              if( ilast==ilo ) then
+                 ! special case: j=ilast
+                 go to 80
+              else
+                 if( abs( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs( h( ilast, ilast ) ) + abs(&
+                            h( ilast-1, ilast-1 ) )) ) ) then
+                    h( ilast, ilast-1 ) = zero
+                    go to 80
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = zero
+                 go to 70
+              end if
+              ! general case: j<ilast
+              loop_60: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs( h( j, j-1 ) )<=max( safmin, ulp*(abs( h( j, j ) ) + abs( h( j-1, j-1 &
+                              ) )) ) ) then
+                       h( j, j-1 ) = zero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = zero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       temp = abs( h( j, j-1 ) )
+                       temp2 = abs( h( j, j ) )
+                       tempr = max( temp, temp2 )
+                       if( tempr<one .and. tempr/=zero ) then
+                          temp = temp / tempr
+                          temp2 = temp2 / tempr
+                       end if
+                       if( temp*( ascale*abs( h( j+1, j ) ) )<=temp2*( ascale*atol ) )ilazr2 = &
+                                 .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          temp = h( jch, jch )
+                          call stdlib_qlartg( temp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = zero
+                          call stdlib_qrot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_qrot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_qrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 80
+                             else
+                                ifirst = jch + 1
+                                go to 110
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = zero
+                       end do
+                       go to 70
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          temp = t( jch, jch+1 )
+                          call stdlib_qlartg( temp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = zero
+                          if( jch<ilastm-1 )call stdlib_qrot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_qrot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_qrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          temp = h( jch+1, jch )
+                          call stdlib_qlartg( temp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = zero
+                          call stdlib_qrot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_qrot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_qrot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 70
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 110
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_60
+              ! (drop-through is "impossible")
+              info = n + 1
+              go to 420
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              70 continue
+              temp = h( ilast, ilast )
+              call stdlib_qlartg( temp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = zero
+              call stdlib_qrot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_qrot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_qrot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alphar, alphai,
+                                    ! and beta
+                                    80 continue
+              if( t( ilast, ilast )<zero ) then
+                 if( ilschr ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                 else
+                    h( ilast, ilast ) = -h( ilast, ilast )
+                    t( ilast, ilast ) = -t( ilast, ilast )
+                 end if
+                 if( ilz ) then
+                    do j = 1, n
+                       z( j, ilast ) = -z( j, ilast )
+                    end do
+                 end if
+              end if
+              alphar( ilast ) = h( ilast, ilast )
+              alphai( ilast ) = zero
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 380
+              ! reset counters
+              iiter = 0
+              eshift = zero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 350
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast. we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              110 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute single shifts.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10==iiter ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 ! (single shift only.)
+                 if( ( real( maxit,KIND=qp)*safmin )*abs( h( ilast, ilast-1 ) )<abs( t( ilast-1, &
+                           ilast-1 ) ) ) then
+                    eshift = h( ilast, ilast-1 ) /t( ilast-1, ilast-1 )
+                 else
+                    eshift = eshift + one / ( safmin*real( maxit,KIND=qp) )
+                 end if
+                 s1 = one
+                 wr = eshift
+              else
+                 ! shifts based on the generalized eigenvalues of the
+                 ! bottom-right 2x2 block of a and b. the first eigenvalue
+                 ! returned by stdlib_qlag2 is the wilkinson shift (aep p.512_qp),
+                 call stdlib_qlag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,s2, wr, wr2, wi )
+                 if ( abs( (wr/s1)*t( ilast, ilast ) - h( ilast, ilast ) )> abs( (wr2/s2)*t( &
+                           ilast, ilast )- h( ilast, ilast ) ) ) then
+                    temp = wr
+                    wr = wr2
+                    wr2 = temp
+                    temp = s1
+                    s1 = s2
+                    s2 = temp
+                 end if
+                 temp = max( s1, safmin*max( one, abs( wr ), abs( wi ) ) )
+                 if( wi/=zero )go to 200
+              end if
+              ! fiddle with shift to avoid overflow
+              temp = min( ascale, one )*( half*safmax )
+              if( s1>temp ) then
+                 scale = temp / s1
+              else
+                 scale = one
+              end if
+              temp = min( bscale, one )*( half*safmax )
+              if( abs( wr )>temp )scale = min( scale, temp / abs( wr ) )
+              s1 = scale*s1
+              wr = scale*wr
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 temp = abs( s1*h( j, j-1 ) )
+                 temp2 = abs( s1*h( j, j )-wr*t( j, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs( ( ascale*h( j+1, j ) )*temp )<=( ascale*atol )*temp2 )go to 130
+              end do
+              istart = ifirst
+              130 continue
+              ! do an implicit single-shift qz sweep.
+              ! initial q
+              temp = s1*h( istart, istart ) - wr*t( istart, istart )
+              temp2 = s1*h( istart+1, istart )
+              call stdlib_qlartg( temp, temp2, c, s, tempr )
+              ! sweep
+              loop_190: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    temp = h( j, j-1 )
+                    call stdlib_qlartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = zero
+                 end if
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 temp = t( j+1, j+1 )
+                 call stdlib_qlartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, min( j+2, ilast )
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, j
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+              end do loop_190
+              go to 350
+              ! use francis double-shift
+              ! note: the francis double-shift should work with real shifts,
+                    ! but only if the block is at least 3x3.
+                    ! this code may break if this point is reached with
+                    ! a 2x2 block with real eigenvalues.
+                    200 continue
+              if( ifirst+1==ilast ) then
+                 ! special case -- 2x2 block with complex eigenvectors
+                 ! step 1: standardize, that is, rotate so that
+                             ! ( b11  0  )
+                         ! b = (         )  with b11 non-negative.
+                             ! (  0  b22 )
+                 call stdlib_qlasv2( t( ilast-1, ilast-1 ), t( ilast-1, ilast ),t( ilast, ilast ),&
+                            b22, b11, sr, cr, sl, cl )
+                 if( b11<zero ) then
+                    cr = -cr
+                    sr = -sr
+                    b11 = -b11
+                    b22 = -b22
+                 end if
+                 call stdlib_qrot( ilastm+1-ifirst, h( ilast-1, ilast-1 ), ldh,h( ilast, ilast-1 )&
+                           , ldh, cl, sl )
+                 call stdlib_qrot( ilast+1-ifrstm, h( ifrstm, ilast-1 ), 1,h( ifrstm, ilast ), 1, &
+                           cr, sr )
+                 if( ilast<ilastm )call stdlib_qrot( ilastm-ilast, t( ilast-1, ilast+1 ), ldt,t( &
+                           ilast, ilast+1 ), ldt, cl, sl )
+                 if( ifrstm<ilast-1 )call stdlib_qrot( ifirst-ifrstm, t( ifrstm, ilast-1 ), 1,t( &
+                           ifrstm, ilast ), 1, cr, sr )
+                 if( ilq )call stdlib_qrot( n, q( 1, ilast-1 ), 1, q( 1, ilast ), 1, cl,sl )
+                           
+                 if( ilz )call stdlib_qrot( n, z( 1, ilast-1 ), 1, z( 1, ilast ), 1, cr,sr )
+                           
+                 t( ilast-1, ilast-1 ) = b11
+                 t( ilast-1, ilast ) = zero
+                 t( ilast, ilast-1 ) = zero
+                 t( ilast, ilast ) = b22
+                 ! if b22 is negative, negate column ilast
+                 if( b22<zero ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                    if( ilz ) then
+                       do j = 1, n
+                          z( j, ilast ) = -z( j, ilast )
+                       end do
+                    end if
+                    b22 = -b22
+                 end if
+                 ! step 2: compute alphar, alphai, and beta (see refs.)
+                 ! recompute shift
+                 call stdlib_qlag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,temp, wr, temp2, wi )
+                 ! if standardization has perturbed the shift onto real line,
+                 ! do another (real single-shift) qr step.
+                 if( wi==zero )go to 350
+                 s1inv = one / s1
+                 ! do eispack (qzval) computation of alpha and beta
+                 a11 = h( ilast-1, ilast-1 )
+                 a21 = h( ilast, ilast-1 )
+                 a12 = h( ilast-1, ilast )
+                 a22 = h( ilast, ilast )
+                 ! compute complex givens rotation on right
+                 ! (assume some element of c = (sa - wb) > unfl )
+                                  ! __
+                 ! (sa - wb) ( cz   -sz )
+                           ! ( sz    cz )
+                 c11r = s1*a11 - wr*b11
+                 c11i = -wi*b11
+                 c12 = s1*a12
+                 c21 = s1*a21
+                 c22r = s1*a22 - wr*b22
+                 c22i = -wi*b22
+                 if( abs( c11r )+abs( c11i )+abs( c12 )>abs( c21 )+abs( c22r )+abs( c22i ) ) &
+                           then
+                    t1 = stdlib_qlapy3( c12, c11r, c11i )
+                    cz = c12 / t1
+                    szr = -c11r / t1
+                    szi = -c11i / t1
+                 else
+                    cz = stdlib_qlapy2( c22r, c22i )
+                    if( cz<=safmin ) then
+                       cz = zero
+                       szr = one
+                       szi = zero
+                    else
+                       tempr = c22r / cz
+                       tempi = c22i / cz
+                       t1 = stdlib_qlapy2( cz, c21 )
+                       cz = cz / t1
+                       szr = -c21*tempr / t1
+                       szi = c21*tempi / t1
+                    end if
+                 end if
+                 ! compute givens rotation on left
+                 ! (  cq   sq )
+                 ! (  __      )  a or b
+                 ! ( -sq   cq )
+                 an = abs( a11 ) + abs( a12 ) + abs( a21 ) + abs( a22 )
+                 bn = abs( b11 ) + abs( b22 )
+                 wabs = abs( wr ) + abs( wi )
+                 if( s1*an>wabs*bn ) then
+                    cq = cz*b11
+                    sqr = szr*b22
+                    sqi = -szi*b22
+                 else
+                    a1r = cz*a11 + szr*a12
+                    a1i = szi*a12
+                    a2r = cz*a21 + szr*a22
+                    a2i = szi*a22
+                    cq = stdlib_qlapy2( a1r, a1i )
+                    if( cq<=safmin ) then
+                       cq = zero
+                       sqr = one
+                       sqi = zero
+                    else
+                       tempr = a1r / cq
+                       tempi = a1i / cq
+                       sqr = tempr*a2r + tempi*a2i
+                       sqi = tempi*a2r - tempr*a2i
+                    end if
+                 end if
+                 t1 = stdlib_qlapy3( cq, sqr, sqi )
+                 cq = cq / t1
+                 sqr = sqr / t1
+                 sqi = sqi / t1
+                 ! compute diagonal elements of qbz
+                 tempr = sqr*szr - sqi*szi
+                 tempi = sqr*szi + sqi*szr
+                 b1r = cq*cz*b11 + tempr*b22
+                 b1i = tempi*b22
+                 b1a = stdlib_qlapy2( b1r, b1i )
+                 b2r = cq*cz*b22 + tempr*b11
+                 b2i = -tempi*b11
+                 b2a = stdlib_qlapy2( b2r, b2i )
+                 ! normalize so beta > 0, and im( alpha1 ) > 0
+                 beta( ilast-1 ) = b1a
+                 beta( ilast ) = b2a
+                 alphar( ilast-1 ) = ( wr*b1a )*s1inv
+                 alphai( ilast-1 ) = ( wi*b1a )*s1inv
+                 alphar( ilast ) = ( wr*b2a )*s1inv
+                 alphai( ilast ) = -( wi*b2a )*s1inv
+                 ! step 3: go to next block -- exit if finished.
+                 ilast = ifirst - 1
+                 if( ilast<ilo )go to 380
+                 ! reset counters
+                 iiter = 0
+                 eshift = zero
+                 if( .not.ilschr ) then
+                    ilastm = ilast
+                    if( ifrstm>ilast )ifrstm = ilo
+                 end if
+                 go to 350
+              else
+                 ! usual case: 3x3 or larger block, using francis implicit
+                             ! double-shift
+                                          ! 2
+                 ! eigenvalue equation is  w  - c w + d = 0,
+                                               ! -1 2        -1
+                 ! so compute 1st column of  (a b  )  - c a b   + d
+                 ! using the formula in qzit (from eispack)
+                 ! we assume that the block is at least 3x3
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 u12 = t( ilast-1, ilast ) / t( ilast, ilast )
+                 ad11l = ( ascale*h( ifirst, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad21l = ( ascale*h( ifirst+1, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad12l = ( ascale*h( ifirst, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad22l = ( ascale*h( ifirst+1, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad32l = ( ascale*h( ifirst+2, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 u12l = t( ifirst, ifirst+1 ) / t( ifirst+1, ifirst+1 )
+                 v( 1 ) = ( ad11-ad11l )*( ad22-ad11l ) - ad12*ad21 +ad21*u12*ad11l + ( ad12l-&
+                           ad11l*u12l )*ad21l
+                 v( 2 ) = ( ( ad22l-ad11l )-ad21l*u12l-( ad11-ad11l )-( ad22-ad11l )+ad21*u12 )&
+                           *ad21l
+                 v( 3 ) = ad32l*ad21l
+                 istart = ifirst
+                 call stdlib_qlarfg( 3, v( 1 ), v( 2 ), 1, tau )
+                 v( 1 ) = one
+                 ! sweep
+                 loop_290: do j = istart, ilast - 2
+                    ! all but last elements: use 3x3 householder transforms.
+                    ! zero (j-1)st column of a
+                    if( j>istart ) then
+                       v( 1 ) = h( j, j-1 )
+                       v( 2 ) = h( j+1, j-1 )
+                       v( 3 ) = h( j+2, j-1 )
+                       call stdlib_qlarfg( 3, h( j, j-1 ), v( 2 ), 1, tau )
+                       v( 1 ) = one
+                       h( j+1, j-1 ) = zero
+                       h( j+2, j-1 ) = zero
+                    end if
+                    do jc = j, ilastm
+                       temp = tau*( h( j, jc )+v( 2 )*h( j+1, jc )+v( 3 )*h( j+2, jc ) )
+                       h( j, jc ) = h( j, jc ) - temp
+                       h( j+1, jc ) = h( j+1, jc ) - temp*v( 2 )
+                       h( j+2, jc ) = h( j+2, jc ) - temp*v( 3 )
+                       temp2 = tau*( t( j, jc )+v( 2 )*t( j+1, jc )+v( 3 )*t( j+2, jc ) )
+                       t( j, jc ) = t( j, jc ) - temp2
+                       t( j+1, jc ) = t( j+1, jc ) - temp2*v( 2 )
+                       t( j+2, jc ) = t( j+2, jc ) - temp2*v( 3 )
+                    end do
+                    if( ilq ) then
+                       do jr = 1, n
+                          temp = tau*( q( jr, j )+v( 2 )*q( jr, j+1 )+v( 3 )*q( jr, j+2 ) )
+                                    
+                          q( jr, j ) = q( jr, j ) - temp
+                          q( jr, j+1 ) = q( jr, j+1 ) - temp*v( 2 )
+                          q( jr, j+2 ) = q( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    ! zero j-th column of b (see dlagbc for details)
+                    ! swap rows to pivot
+                    ilpivt = .false.
+                    temp = max( abs( t( j+1, j+1 ) ), abs( t( j+1, j+2 ) ) )
+                    temp2 = max( abs( t( j+2, j+1 ) ), abs( t( j+2, j+2 ) ) )
+                    if( max( temp, temp2 )<safmin ) then
+                       scale = zero
+                       u1 = one
+                       u2 = zero
+                       go to 250
+                    else if( temp>=temp2 ) then
+                       w11 = t( j+1, j+1 )
+                       w21 = t( j+2, j+1 )
+                       w12 = t( j+1, j+2 )
+                       w22 = t( j+2, j+2 )
+                       u1 = t( j+1, j )
+                       u2 = t( j+2, j )
+                    else
+                       w21 = t( j+1, j+1 )
+                       w11 = t( j+2, j+1 )
+                       w22 = t( j+1, j+2 )
+                       w12 = t( j+2, j+2 )
+                       u2 = t( j+1, j )
+                       u1 = t( j+2, j )
+                    end if
+                    ! swap columns if nec.
+                    if( abs( w12 )>abs( w11 ) ) then
+                       ilpivt = .true.
+                       temp = w12
+                       temp2 = w22
+                       w12 = w11
+                       w22 = w21
+                       w11 = temp
+                       w21 = temp2
+                    end if
+                    ! lu-factor
+                    temp = w21 / w11
+                    u2 = u2 - temp*u1
+                    w22 = w22 - temp*w12
+                    w21 = zero
+                    ! compute scale
+                    scale = one
+                    if( abs( w22 )<safmin ) then
+                       scale = zero
+                       u2 = one
+                       u1 = -w12 / w11
+                       go to 250
+                    end if
+                    if( abs( w22 )<abs( u2 ) )scale = abs( w22 / u2 )
+                    if( abs( w11 )<abs( u1 ) )scale = min( scale, abs( w11 / u1 ) )
+                    ! solve
+                    u2 = ( scale*u2 ) / w22
+                    u1 = ( scale*u1-w12*u2 ) / w11
+                    250 continue
+                    if( ilpivt ) then
+                       temp = u2
+                       u2 = u1
+                       u1 = temp
+                    end if
+                    ! compute householder vector
+                    t1 = sqrt( scale**2+u1**2+u2**2 )
+                    tau = one + scale / t1
+                    vs = -one / ( scale+t1 )
+                    v( 1 ) = one
+                    v( 2 ) = vs*u1
+                    v( 3 ) = vs*u2
+                    ! apply transformations from the right.
+                    do jr = ifrstm, min( j+3, ilast )
+                       temp = tau*( h( jr, j )+v( 2 )*h( jr, j+1 )+v( 3 )*h( jr, j+2 ) )
+                       h( jr, j ) = h( jr, j ) - temp
+                       h( jr, j+1 ) = h( jr, j+1 ) - temp*v( 2 )
+                       h( jr, j+2 ) = h( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    do jr = ifrstm, j + 2
+                       temp = tau*( t( jr, j )+v( 2 )*t( jr, j+1 )+v( 3 )*t( jr, j+2 ) )
+                       t( jr, j ) = t( jr, j ) - temp
+                       t( jr, j+1 ) = t( jr, j+1 ) - temp*v( 2 )
+                       t( jr, j+2 ) = t( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    if( ilz ) then
+                       do jr = 1, n
+                          temp = tau*( z( jr, j )+v( 2 )*z( jr, j+1 )+v( 3 )*z( jr, j+2 ) )
+                                    
+                          z( jr, j ) = z( jr, j ) - temp
+                          z( jr, j+1 ) = z( jr, j+1 ) - temp*v( 2 )
+                          z( jr, j+2 ) = z( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    t( j+1, j ) = zero
+                    t( j+2, j ) = zero
+                 end do loop_290
+                 ! last elements: use givens rotations
+                 ! rotations from the left
+                 j = ilast - 1
+                 temp = h( j, j-1 )
+                 call stdlib_qlartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                 h( j+1, j-1 ) = zero
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 ! rotations from the right.
+                 temp = t( j+1, j+1 )
+                 call stdlib_qlartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, ilast
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, ilast - 1
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+                 ! end of double-shift code
+              end if
+              go to 350
+              ! end of iteration loop
+              350 continue
+           end do loop_360
+           ! drop-through = non-convergence
+           info = ilast
+           go to 420
+           ! successful completion of all qz steps
+           380 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           420 continue
+           work( 1 ) = real( n,KIND=qp)
+           return
+     end subroutine stdlib_qhgeqz
+
+     !> DHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a real upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_qhsein( side, eigsrc, initv, select, n, h, ldh, wr, wi,vl, ldvl, vr, ldvr, &
+               mm, m, work, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(qp), intent(in) :: h(ldh,*), wi(*)
+           real(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), wr(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, pair, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ksi, ksr, ldwork
+           real(qp) :: bignum, eps3, hnorm, smlnum, ulp, unfl, wki, wkr
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors, and standardize the array select.
+           m = 0
+           pair = .false.
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+                 select( k ) = .false.
+              else
+                 if( wi( k )==zero ) then
+                    if( select( k ) )m = m + 1
+                 else
+                    pair = .true.
+                    if( select( k ) .or. select( k+1 ) ) then
+                       select( k ) = .true.
+                       m = m + 2
+                    end if
+                 end if
+              end if
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -13
+           else if( mm<m ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ldwork = n + 1
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ksr = 1
+           loop_120: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are zero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==zero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==zero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_qlanhs( 'I', kr-kl+1, h( kl, kl ), ldh, work )
+                    if( stdlib_qisnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( hnorm>zero ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wkr = wr( k )
+                 wki = wi( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. abs( wr( i )-wkr )+abs( wi( i )-wki )<eps3 ) &
+                              then
+                       wkr = wkr + eps3
+                       go to 60
+                    end if
+                 end do
+                 wr( k ) = wkr
+                 pair = wki/=zero
+                 if( pair ) then
+                    ksi = ksr + 1
+                 else
+                    ksi = ksr
+                 end if
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_qlaein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wkr, wki, vl( &
+                    kl, ksr ), vl( kl, ksi ),work, ldwork, work( n*n+n+1 ), eps3, smlnum,bignum, &
+                              iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifaill( ksr ) = k
+                       ifaill( ksi ) = k
+                    else
+                       ifaill( ksr ) = 0
+                       ifaill( ksi ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = 1, kl - 1
+                          vl( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_qlaein( .true., noinit, kr, h, ldh, wkr, wki,vr( 1, ksr ), vr( 1, &
+                              ksi ), work, ldwork,work( n*n+n+1 ), eps3, smlnum, bignum,iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifailr( ksr ) = k
+                       ifailr( ksi ) = k
+                    else
+                       ifailr( ksr ) = 0
+                       ifailr( ksi ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = kr + 1, n
+                          vr( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( pair ) then
+                    ksr = ksr + 2
+                 else
+                    ksr = ksr + 1
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_qhsein
+
+     !> DHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_qhseqr( job, compz, n, ilo, ihi, h, ldh, wr, wi, z,ldz, work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_qlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_qlahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           ! Local Arrays 
+           real(qp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: i, kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = real( max( 1, n ),KIND=qp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'DHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_qlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                        work, lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=qp), work( 1 ) )
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_qgebal ====
+              do i = 1, ilo - 1
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              do i = ihi + 1, n
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_qlaset( 'A', n, n, zero, one, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 wr( ilo ) = h( ilo, ilo )
+                 wi( ilo ) = zero
+                 return
+              end if
+              ! ==== stdlib_qlahqr/stdlib_qlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_qlaqr0 for big matrices; stdlib_qlahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_qlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           work, lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_qlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           info )
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_qlahqr failure!  stdlib_qlaqr0 sometimes succeeds
+                    ! .    when stdlib_qlahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_qlaqr0 directly. ====
+                       call stdlib_qlaqr0( wantt, wantz, n, ilo, kbot, h, ldh, wr,wi, ilo, ihi, z,&
+                                  ldz, work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_qlaqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_qlaqr0. ====
+                       call stdlib_qlacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = zero
+                       call stdlib_qlaset( 'A', nl, nl-n, zero, zero, hl( 1, n+1 ),nl )
+                       call stdlib_qlaqr0( wantt, wantz, nl, ilo, kbot, hl, nl, wr,wi, ilo, ihi, &
+                                 z, ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_qlacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_qlaset( 'L', n-2, n-2, zero, zero,&
+                         h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=qp), work( 1 ) )
+           end if
+     end subroutine stdlib_qhseqr
+
+     !> DISNAN: returns .TRUE. if its argument is NaN, and .FALSE.
+     !> otherwise.  To be replaced by the Fortran 2003 intrinsic in the
+     !> future.
+
+     pure logical(lk) function stdlib_qisnan( din )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: din
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_qisnan = stdlib_qlaisnan(din,din)
+           return
+     end function stdlib_qisnan
+
+     !> DLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_qla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_wero = symb_wero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_wero = symb_wero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qla_gbamv
+
+     !> DLA_GBRCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(qp) function stdlib_qla_gbrcond( trans, n, kl, ku, ab, ldab,afb, ldafb, ipiv, cmode, c,&
+               info, work, iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, ldab, ldafb, kl, ku, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: ab(ldab,*), afb(ldafb,*), c(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j, kd, ke
+           real(qp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_qla_gbrcond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_GBRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_qla_gbrcond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                    if ( cmode == 1 ) then
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) * c( j ) )
+                       end do
+                    else if ( cmode == 0 ) then
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) )
+                       end do
+                    else
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) / c( j ) )
+                       end do
+                    end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_qgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_qgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_qgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_qgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_qla_gbrcond = ( one / ainvnm )
+           return
+     end function stdlib_qla_gbrcond
+
+     !> DLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(qp) function stdlib_qla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( abs( ab( kd+i-j, j)), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( abs( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_qla_gbrpvgrw = rpvgrw
+     end function stdlib_qla_gbrpvgrw
+
+     !> DLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_qla_geamv ( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n, trans
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' )) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qla_geamv
+
+     !> DLA_GERCOND: estimates the Skeel condition number of op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(qp) function stdlib_qla_gercond( trans, n, a, lda, af,ldaf, ipiv, cmode, c,info, work, &
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_qla_gercond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_GERCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_qla_gercond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if (notrans) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work(i) = work(i) * work(2*n+i)
+                 end do
+                 if (notrans) then
+                    call stdlib_qgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_qgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if (notrans) then
+                    call stdlib_qgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_qgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_qla_gercond = ( one / ainvnm )
+           return
+     end function stdlib_qla_gercond
+
+     !> DLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(qp) function stdlib_qla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( abs( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( abs( af( i, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_qla_gerpvgrw = rpvgrw
+     end function stdlib_qla_gerpvgrw
+
+     !> DLA_LIN_BERR: computes component-wise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the component-wise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_qla_lin_berr ( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ayb(n,nrhs)
+           real(qp), intent(out) :: berr(nrhs)
+           real(qp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: tmp,safe1
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the sla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= zero) then
+                    tmp = (safe1+abs(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_qp (and if computed by sla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_qla_lin_berr
+
+     !> DLA_PORCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(qp) function stdlib_qla_porcond( uplo, n, a, lda, af, ldaf,cmode, c, info, work,iwork )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(qp), intent(out) :: work(*)
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, tmp
+           logical(lk) :: up
+           ! Array Arguments 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_qla_porcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_PORCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_qla_porcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j ,i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if (up) then
+                    call stdlib_qpotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_qpotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_qpotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_qpotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_qla_porcond = ( one / ainvnm )
+           return
+     end function stdlib_qla_porcond
+
+     !> DLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(qp) function stdlib_qla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_qpotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_qla_porpvgrw = rpvgrw
+     end function stdlib_qla_porpvgrw
+
+     !> DLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_qla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) ) then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'DLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qla_syamv
+
+     !> DLA_SYRCOND: estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(qp) function stdlib_qla_syrcond( uplo, n, a, lda, af, ldaf,ipiv, cmode, c, info, work,&
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           character :: normin
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, smlnum, tmp
+           logical(lk) :: up
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_qla_syrcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLA_SYRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_qla_syrcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ainvnm = zero
+           normin = 'N'
+           kase = 0
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( up ) then
+                    call stdlib_qsytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_qsytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_qsytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_qsytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_qla_syrcond = ( one / ainvnm )
+           return
+     end function stdlib_qla_syrcond
+
+     !> DLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(qp) function stdlib_qla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(qp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_qsytrs.
+           ! calls to stdlib_dswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k-1 ) = max( abs( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k+1 ) = max( abs( af(i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_qla_syrpvgrw = rpvgrw
+     end function stdlib_qla_syrpvgrw
+
+     !> DLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_qla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: x(*), y(*)
+           real(qp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_qla_wwaddw
+
+     !> DLABAD: takes as input the values computed by DLAMCH for underflow and
+     !> overflow, and returns the square root of each of these values if the
+     !> log of LARGE is sufficiently large.  This subroutine is intended to
+     !> identify machines with a large exponent range, such as the Crays, and
+     !> redefine the underflow and overflow limits to be the square roots of
+     !> the values computed by DLAMCH.  This subroutine is needed because
+     !> DLAMCH does not compensate for poor arithmetic in the upper half of
+     !> the exponent range, as is found on a Cray.
+
+     pure subroutine stdlib_qlabad( small, large )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(inout) :: large, small
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: log10,sqrt
+           ! Executable Statements 
+           ! if it looks like we're on a cray, take the square root of
+           ! small and large to avoid overflow and underflow problems.
+           if( log10( large )>2000._qp ) then
+              small = sqrt( small )
+              large = sqrt( large )
+           end if
+           return
+     end subroutine stdlib_qlabad
+
+     !> DLABRD: reduces the first NB rows and columns of a real general
+     !> m by n matrix A to upper or lower bidiagonal form by an orthogonal
+     !> transformation Q**T * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by DGEBRD
+
+     pure subroutine stdlib_qlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), e(*), taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_qgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, one, a( i, i ), 1 )
+                 call stdlib_qgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, x( i, 1 ),ldx, a( 1, i ), 1,&
+                            one, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 call stdlib_qlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_qgemv( 'TRANSPOSE', m-i+1, n-i, one, a( i, i+1 ),lda, a( i, i ), &
+                              1, zero, y( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', m-i+1, i-1, one, a( i, 1 ), lda,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', m-i+1, i-1, one, x( i, 1 ), ldx,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, y( 1, i ), 1,&
+                               one, y( i+1, i ), 1 )
+                    call stdlib_qscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-i, i, -one, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, one, a( i, i+1 ), lda )
+                    call stdlib_qgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, x( i, 1 ), &
+                              ldx, one, a( i, i+1 ), lda )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    call stdlib_qlarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, zero, x( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', n-i, i, one, y( i+1, 1 ), ldy,a( i, i+1 ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i, -one, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, one, x( i+1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, zero, x( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_qscal( m-i, taup( i ), x( i+1, i ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, one, a( i, i ), lda )
+                 call stdlib_qgemv( 'TRANSPOSE', i-1, n-i+1, -one, a( 1, i ), lda,x( i, 1 ), ldx, &
+                           one, a( i, i ), lda )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 call stdlib_qlarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<m ) then
+                    a( i, i ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, n-i+1, one, a( i+1, i ),lda, a( i, i )&
+                              , lda, zero, x( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', n-i+1, i-1, one, y( i, 1 ), ldy,a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', i-1, n-i+1, one, a( 1, i ),lda, a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_qscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    ! update a(i+1:m,i)
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, y( i, 1 ),&
+                               ldy, one, a( i+1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', m-i, i, -one, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, one, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    call stdlib_qlarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_qgemv( 'TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i+1, i ),&
+                               1, zero, y( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', m-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, y( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', m-i, i, one, x( i+1, 1 ), ldx,a( i+1, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', i, n-i, -one, a( 1, i+1 ), lda,y( 1, i ), 1, &
+                              one, y( i+1, i ), 1 )
+                    call stdlib_qscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_qlabrd
+
+     !> DLACN2: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_qlacn2( n, v, x, isgn, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(qp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           integer(ilp), intent(inout) :: isave(3)
+           real(qp), intent(out) :: v(*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(qp) :: altsgn, estold, temp, xs
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,nint
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=qp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_qasum( n, x, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_iqamax( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( isave( 2 ) ) = one
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_qcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_qasum( n, v, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 xs = one
+              else
+                 xs = -one
+              end if
+              if( nint( xs,KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_iqamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=qp) / real( n-1,KIND=qp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_qasum( n, x, 1 ) / real( 3*n,KIND=qp) )
+           if( temp>est ) then
+              call stdlib_qcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_qlacn2
+
+     !> DLACON: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_qlacon( n, v, x, isgn, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(qp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           real(qp), intent(out) :: v(*)
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(qp) :: altsgn, estold, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,nint,sign
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=qp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_qasum( n, x, 1 )
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           j = stdlib_iqamax( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( j ) = one
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_qcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_qasum( n, v, 1 )
+           do i = 1, n
+              if( nint( sign( one, x( i ) ),KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = j
+           j = stdlib_iqamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( j ) ) ) .and. ( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=qp) / real( n-1,KIND=qp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_qasum( n, x, 1 ) / real( 3*n,KIND=qp) )
+           if( temp>est ) then
+              call stdlib_qcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_qlacon
+
+     !> DLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_qlacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_qlacpy
+
+     !> DLADIV: performs complex division in  real arithmetic
+     !> a + i*b
+     !> p + i*q = ---------
+     !> c + i*d
+     !> The algorithm is due to Michael Baudin and Robert L. Smith
+     !> and can be found in the paper
+     !> "A Robust Complex Division in Scilab"
+
+     pure subroutine stdlib_qladiv( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: a, b, c, d
+           real(qp), intent(out) :: p, q
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: bs = 2.0_qp
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, cc, dd, ab, cd, s, ov, un, be, eps
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           aa = a
+           bb = b
+           cc = c
+           dd = d
+           ab = max( abs(a), abs(b) )
+           cd = max( abs(c), abs(d) )
+           s = one
+           ov = stdlib_qlamch( 'OVERFLOW THRESHOLD' )
+           un = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'EPSILON' )
+           be = bs / (eps*eps)
+           if( ab >= half*ov ) then
+              aa = half * aa
+              bb = half * bb
+              s  = two * s
+           end if
+           if( cd >= half*ov ) then
+              cc = half * cc
+              dd = half * dd
+              s  = half * s
+           end if
+           if( ab <= un*bs/eps ) then
+              aa = aa * be
+              bb = bb * be
+              s  = s / be
+           end if
+           if( cd <= un*bs/eps ) then
+              cc = cc * be
+              dd = dd * be
+              s  = s * be
+           end if
+           if( abs( d )<=abs( c ) ) then
+              call stdlib_qladiv1(aa, bb, cc, dd, p, q)
+           else
+              call stdlib_qladiv1(bb, aa, dd, cc, p, q)
+              q = -q
+           end if
+           p = p * s
+           q = q * s
+           return
+     end subroutine stdlib_qladiv
+
+
+     pure subroutine stdlib_qladiv1( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(inout) :: a
+           real(qp), intent(in) :: b, c, d
+           real(qp), intent(out) :: p, q
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: r, t
+           ! Executable Statements 
+           r = d / c
+           t = one / (c + d * r)
+           p = stdlib_qladiv2(a, b, c, d, r, t)
+           a = -a
+           q = stdlib_qladiv2(b, a, c, d, r, t)
+           return
+     end subroutine stdlib_qladiv1
+
+
+     pure real(qp) function stdlib_qladiv2( a, b, c, d, r, t )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: a, b, c, d, r, t
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: br
+           ! Executable Statements 
+           if( r/=zero ) then
+              br = b * r
+              if( br/=zero ) then
+                 stdlib_qladiv2 = (a + br) * t
+              else
+                 stdlib_qladiv2 = a * t + (b * t) * r
+              end if
+           else
+              stdlib_qladiv2 = (a + d * (b / c)) * t
+           end if
+           return
+     end function stdlib_qladiv2
+
+     !> DLAE2:  computes the eigenvalues of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, and RT2
+     !> is the eigenvalue of smaller absolute value.
+
+     pure subroutine stdlib_qlae2( a, b, c, rt1, rt2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: a, b, c
+           real(qp), intent(out) :: rt1, rt2
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: ab, acmn, acmx, adf, df, rt, sm, tb
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+           end if
+           return
+     end subroutine stdlib_qlae2
+
+     !> DLAEBZ: contains the iteration loops which compute and use the
+     !> function N(w), which is the count of eigenvalues of a symmetric
+     !> tridiagonal matrix T less than or equal to its argument  w.  It
+     !> performs a choice of two types of loops:
+     !> IJOB=1, followed by
+     !> IJOB=2: It takes as input a list of intervals and returns a list of
+     !> sufficiently small intervals whose union contains the same
+     !> eigenvalues as the union of the original intervals.
+     !> The input intervals are (AB(j,1),AB(j,2)], j=1,...,MINP.
+     !> The output interval (AB(j,1),AB(j,2)] will contain
+     !> eigenvalues NAB(j,1)+1,...,NAB(j,2), where 1 <= j <= MOUT.
+     !> IJOB=3: It performs a binary search in each input interval
+     !> (AB(j,1),AB(j,2)] for a point  w(j)  such that
+     !> N(w(j))=NVAL(j), and uses  C(j)  as the starting point of
+     !> the search.  If such a w(j) is found, then on output
+     !> AB(j,1)=AB(j,2)=w.  If no such w(j) is found, then on output
+     !> (AB(j,1),AB(j,2)] will be a small interval containing the
+     !> point where N(w) jumps through NVAL(j), unless that point
+     !> lies outside the initial interval.
+     !> Note that the intervals are in all cases half-open intervals,
+     !> i.e., of the form  (a,b] , which includes  b  but not  a .
+     !> To avoid underflow, the matrix should be scaled so that its largest
+     !> element is no greater than  overflow**(1/2) * underflow**(1/4)
+     !> in absolute value.  To assure the most accurate computation
+     !> of small eigenvalues, the matrix should be scaled to be
+     !> not much smaller than that, either.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966
+     !> Note: the arguments are, in general, *not* checked for unreasonable
+     !> values.
+
+     pure subroutine stdlib_qlaebz( ijob, nitmax, n, mmax, minp, nbmin, abstol,reltol, pivmin, d, &
+               e, e2, nval, ab, c, mout,nab, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, minp, mmax, n, nbmin, nitmax
+           integer(ilp), intent(out) :: info, mout
+           real(qp), intent(in) :: abstol, pivmin, reltol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(inout) :: nab(mmax,*), nval(*)
+           real(qp), intent(inout) :: ab(mmax,*), c(*)
+           real(qp), intent(in) :: d(*), e(*), e2(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itmp1, itmp2, j, ji, jit, jp, kf, kfnew, kl, klnew
+           real(qp) :: tmp1, tmp2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! check for errors
+           info = 0
+           if( ijob<1 .or. ijob>3 ) then
+              info = -1
+              return
+           end if
+           ! initialize nab
+           if( ijob==1 ) then
+              ! compute the number of eigenvalues in the initial intervals.
+              mout = 0
+              do ji = 1, minp
+                 do jp = 1, 2
+                    tmp1 = d( 1 ) - ab( ji, jp )
+                    if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                    nab( ji, jp ) = 0
+                    if( tmp1<=zero )nab( ji, jp ) = 1
+                    do j = 2, n
+                       tmp1 = d( j ) - e2( j-1 ) / tmp1 - ab( ji, jp )
+                       if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                       if( tmp1<=zero )nab( ji, jp ) = nab( ji, jp ) + 1
+                    end do
+                 end do
+                 mout = mout + nab( ji, 2 ) - nab( ji, 1 )
+              end do
+              return
+           end if
+           ! initialize for loop
+           ! kf and kl have the following meaning:
+              ! intervals 1,...,kf-1 have converged.
+              ! intervals kf,...,kl  still need to be refined.
+           kf = 1
+           kl = minp
+           ! if ijob=2, initialize c.
+           ! if ijob=3, use the user-supplied starting point.
+           if( ijob==2 ) then
+              do ji = 1, minp
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+           end if
+           ! iteration loop
+           loop_130: do jit = 1, nitmax
+              ! loop over intervals
+              if( kl-kf+1>=nbmin .and. nbmin>0 ) then
+                 ! begin of parallel version of the loop
+                 do ji = kf, kl
+                    ! compute n(c), the number of eigenvalues less than c
+                    work( ji ) = d( 1 ) - c( ji )
+                    iwork( ji ) = 0
+                    if( work( ji )<=pivmin ) then
+                       iwork( ji ) = 1
+                       work( ji ) = min( work( ji ), -pivmin )
+                    end if
+                    do j = 2, n
+                       work( ji ) = d( j ) - e2( j-1 ) / work( ji ) - c( ji )
+                       if( work( ji )<=pivmin ) then
+                          iwork( ji ) = iwork( ji ) + 1
+                          work( ji ) = min( work( ji ), -pivmin )
+                       end if
+                    end do
+                 end do
+                 if( ijob<=2 ) then
+                    ! ijob=2: choose all intervals containing eigenvalues.
+                    klnew = kl
+                    loop_70: do ji = kf, kl
+                       ! insure that n(w) is monotone
+                       iwork( ji ) = min( nab( ji, 2 ),max( nab( ji, 1 ), iwork( ji ) ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( iwork( ji )==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = c( ji )
+                       else if( iwork( ji )==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = c( ji )
+                       else
+                          klnew = klnew + 1
+                          if( klnew<=mmax ) then
+                             ! eigenvalue in both intervals -- add upper to
+                             ! queue.
+                             ab( klnew, 2 ) = ab( ji, 2 )
+                             nab( klnew, 2 ) = nab( ji, 2 )
+                             ab( klnew, 1 ) = c( ji )
+                             nab( klnew, 1 ) = iwork( ji )
+                             ab( ji, 2 ) = c( ji )
+                             nab( ji, 2 ) = iwork( ji )
+                          else
+                             info = mmax + 1
+                          end if
+                       end if
+                    end do loop_70
+                    if( info/=0 )return
+                    kl = klnew
+                 else
+                    ! ijob=3: binary search.  keep only the interval containing
+                            ! w   s.t. n(w) = nval
+                    do ji = kf, kl
+                       if( iwork( ji )<=nval( ji ) ) then
+                          ab( ji, 1 ) = c( ji )
+                          nab( ji, 1 ) = iwork( ji )
+                       end if
+                       if( iwork( ji )>=nval( ji ) ) then
+                          ab( ji, 2 ) = c( ji )
+                          nab( ji, 2 ) = iwork( ji )
+                       end if
+                    end do
+                 end if
+              else
+                 ! end of parallel version of the loop
+                 ! begin of serial version of the loop
+                 klnew = kl
+                 loop_100: do ji = kf, kl
+                    ! compute n(w), the number of eigenvalues less than w
+                    tmp1 = c( ji )
+                    tmp2 = d( 1 ) - tmp1
+                    itmp1 = 0
+                    if( tmp2<=pivmin ) then
+                       itmp1 = 1
+                       tmp2 = min( tmp2, -pivmin )
+                    end if
+                    do j = 2, n
+                       tmp2 = d( j ) - e2( j-1 ) / tmp2 - tmp1
+                       if( tmp2<=pivmin ) then
+                          itmp1 = itmp1 + 1
+                          tmp2 = min( tmp2, -pivmin )
+                       end if
+                    end do
+                    if( ijob<=2 ) then
+                       ! ijob=2: choose all intervals containing eigenvalues.
+                       ! insure that n(w) is monotone
+                       itmp1 = min( nab( ji, 2 ),max( nab( ji, 1 ), itmp1 ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( itmp1==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = tmp1
+                       else if( itmp1==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = tmp1
+                       else if( klnew<mmax ) then
+                          ! eigenvalue in both intervals -- add upper to queue.
+                          klnew = klnew + 1
+                          ab( klnew, 2 ) = ab( ji, 2 )
+                          nab( klnew, 2 ) = nab( ji, 2 )
+                          ab( klnew, 1 ) = tmp1
+                          nab( klnew, 1 ) = itmp1
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       else
+                          info = mmax + 1
+                          return
+                       end if
+                    else
+                       ! ijob=3: binary search.  keep only the interval
+                               ! containing  w  s.t. n(w) = nval
+                       if( itmp1<=nval( ji ) ) then
+                          ab( ji, 1 ) = tmp1
+                          nab( ji, 1 ) = itmp1
+                       end if
+                       if( itmp1>=nval( ji ) ) then
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       end if
+                    end if
+                 end do loop_100
+                 kl = klnew
+              end if
+              ! check for convergence
+              kfnew = kf
+              loop_110: do ji = kf, kl
+                 tmp1 = abs( ab( ji, 2 )-ab( ji, 1 ) )
+                 tmp2 = max( abs( ab( ji, 2 ) ), abs( ab( ji, 1 ) ) )
+                 if( tmp1<max( abstol, pivmin, reltol*tmp2 ) .or.nab( ji, 1 )>=nab( ji, 2 ) ) &
+                           then
+                    ! converged -- swap with position kfnew,
+                                 ! then increment kfnew
+                    if( ji>kfnew ) then
+                       tmp1 = ab( ji, 1 )
+                       tmp2 = ab( ji, 2 )
+                       itmp1 = nab( ji, 1 )
+                       itmp2 = nab( ji, 2 )
+                       ab( ji, 1 ) = ab( kfnew, 1 )
+                       ab( ji, 2 ) = ab( kfnew, 2 )
+                       nab( ji, 1 ) = nab( kfnew, 1 )
+                       nab( ji, 2 ) = nab( kfnew, 2 )
+                       ab( kfnew, 1 ) = tmp1
+                       ab( kfnew, 2 ) = tmp2
+                       nab( kfnew, 1 ) = itmp1
+                       nab( kfnew, 2 ) = itmp2
+                       if( ijob==3 ) then
+                          itmp1 = nval( ji )
+                          nval( ji ) = nval( kfnew )
+                          nval( kfnew ) = itmp1
+                       end if
+                    end if
+                    kfnew = kfnew + 1
+                 end if
+              end do loop_110
+              kf = kfnew
+              ! choose midpoints
+              do ji = kf, kl
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+              ! if no more intervals to refine, quit.
+              if( kf>kl )go to 140
+           end do loop_130
+           ! converged
+           140 continue
+           info = max( kl+1-kf, 0 )
+           mout = kl
+           return
+     end subroutine stdlib_qlaebz
+
+     !> DLAED0: computes all eigenvalues and corresponding eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+
+     pure subroutine stdlib_qlaed0( icompq, qsiz, n, d, e, q, ldq, qstore, ldqs,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldq, ldqs, n, qsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*), q(ldq,*)
+           real(qp), intent(out) :: qstore(ldqs,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, subpbs, &
+                     tlvls
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>2 ) then
+              info = -1
+           else if( ( icompq==1 ) .and. ( qsiz<max( 0, n ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( ldqs<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'DLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           if( icompq/=2 ) then
+              ! set up workspaces for eigenvalues only/accumulate new vectors
+              ! routine
+              temp = log( real( n,KIND=qp) ) / log( two )
+              lgn = int( temp,KIND=ilp)
+              if( 2**lgn<n )lgn = lgn + 1
+              if( 2**lgn<n )lgn = lgn + 1
+              iprmpt = indxq + n + 1
+              iperm = iprmpt + n*lgn
+              iqptr = iperm + n*lgn
+              igivpt = iqptr + n + 2
+              igivcl = igivpt + n*lgn
+              igivnm = 1
+              iq = igivnm + 2*n*lgn
+              iwrem = iq + n**2 + 1
+              ! initialize pointers
+              do i = 0, subpbs
+                 iwork( iprmpt+i ) = 1
+                 iwork( igivpt+i ) = 1
+              end do
+              iwork( iqptr ) = 1
+           end if
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           loop_70: do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              if( icompq==2 ) then
+                 call stdlib_qsteqr( 'I', matsiz, d( submat ), e( submat ),q( submat, submat ), &
+                           ldq, work, info )
+                 if( info/=0 )go to 130
+              else
+                 call stdlib_qsteqr( 'I', matsiz, d( submat ), e( submat ),work( iq-1+iwork( &
+                           iqptr+curr ) ), matsiz, work,info )
+                 if( info/=0 )go to 130
+                 if( icompq==1 ) then
+                    call stdlib_qgemm( 'N', 'N', qsiz, matsiz, matsiz, one,q( 1, submat ), ldq, &
+                    work( iq-1+iwork( iqptr+curr ) ), matsiz, zero, qstore( 1, submat ),ldqs )
+                              
+                 end if
+                 iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+                 curr = curr + 1
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do loop_70
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              loop_90: do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.
+           ! stdlib_qlaed1 is used only for the full eigensystem of a tridiagonal
+           ! matrix.
+           ! stdlib_qlaed7 handles the cases in which eigenvalues only or eigenvalues
+           ! and eigenvectors of a full symmetric matrix (which was reduced to
+           ! tridiagonal form) are desired.
+                 if( icompq==2 ) then
+                    call stdlib_qlaed1( matsiz, d( submat ), q( submat, submat ),ldq, iwork( &
+                    indxq+submat ),e( submat+msd2-1 ), msd2, work,iwork( subpbs+1 ), info )
+                              
+                 else
+                    call stdlib_qlaed7( icompq, matsiz, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                    qstore( 1, submat ), ldqs,iwork( indxq+submat ), e( submat+msd2-1 ),msd2, &
+                    work( iq ), iwork( iqptr ),iwork( iprmpt ), iwork( iperm ),iwork( igivpt ), &
+                    iwork( igivcl ),work( igivnm ), work( iwrem ),iwork( subpbs+1 ), info )
+                              
+                 end if
+                 if( info/=0 )go to 130
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do loop_90
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           if( icompq==1 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_qcopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+              end do
+              call stdlib_qcopy( n, work, 1, d, 1 )
+           else if( icompq==2 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_qcopy( n, q( 1, j ), 1, work( n*i+1 ), 1 )
+              end do
+              call stdlib_qcopy( n, work, 1, d, 1 )
+              call stdlib_qlacpy( 'A', n, n, work( n+1 ), n, q, ldq )
+           else
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+              end do
+              call stdlib_qcopy( n, work, 1, d, 1 )
+           end if
+           go to 140
+           130 continue
+           info = submat*( n+1 ) + submat + matsiz - 1
+           140 continue
+           return
+     end subroutine stdlib_qlaed0
+
+     !> DLAED1: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix.  This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and eigenvectors of a tridiagonal matrix.  DLAED7 handles
+     !> the case in which eigenvalues only or eigenvalues and eigenvectors
+     !> of a full symmetric matrix (which was reduced to tridiagonal form)
+     !> are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**T*u, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by DLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_qlaed1( n, d, q, ldq, indxq, rho, cutpnt, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: indxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), q(ldq,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, n1, n2, &
+                     zpp1
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ldq<max( 1, n ) ) then
+              info = -4
+           else if( min( 1, n / 2 )>cutpnt .or. ( n / 2 )<cutpnt ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED1', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are integer pointers which indicate
+           ! the portion of the workspace
+           ! used by a particular array in stdlib_qlaed2 and stdlib_qlaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           call stdlib_qcopy( cutpnt, q( cutpnt, 1 ), ldq, work( iz ), 1 )
+           zpp1 = cutpnt + 1
+           call stdlib_qcopy( n-cutpnt, q( zpp1, zpp1 ), ldq, work( iz+cutpnt ), 1 )
+           ! deflate eigenvalues.
+           call stdlib_qlaed2( k, n, cutpnt, d, q, ldq, indxq, rho, work( iz ),work( idlmda ), &
+           work( iw ), work( iq2 ),iwork( indx ), iwork( indxc ), iwork( indxp ),iwork( coltyp ), &
+                     info )
+           if( info/=0 )go to 20
+           ! solve secular equation.
+           if( k/=0 ) then
+              is = ( iwork( coltyp )+iwork( coltyp+1 ) )*cutpnt +( iwork( coltyp+1 )+iwork( &
+                        coltyp+2 ) )*( n-cutpnt ) + iq2
+              call stdlib_qlaed3( k, n, cutpnt, d, q, ldq, rho, work( idlmda ),work( iq2 ), iwork(&
+                         indxc ), iwork( coltyp ),work( iw ), work( is ), info )
+              if( info/=0 )go to 20
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_qlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           20 continue
+           return
+     end subroutine stdlib_qlaed1
+
+     !> DLAED2: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_qlaed2( k, n, n1, d, q, ldq, indxq, rho, z, dlamda, w,q2, indx, indxc,&
+                indxp, coltyp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldq, n, n1
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), indx(*), indxc(*), indxp(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(qp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(qp), intent(out) :: dlamda(*), q2(*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: mone = -1.0_qp
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, imax, iq1, iq2, j, jmax, js, k2, n1p1, n2, nj, pj
+           real(qp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( min( 1, ( n / 2 ) )>n1 .or. ( n / 2 )<n1 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_qscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1.  since z is the concatenation of
+           ! two normalized vectors, norm2(z) = sqrt(2).
+           t = one / sqrt( two )
+           call stdlib_qscal( n, t, z, 1 )
+           ! rho = abs( norm(z)**2 * rho )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = n1p1, n
+              indxq( i ) = indxq( i ) + n1
+           end do
+           ! re-integrate the deflated parts from the last pass
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+           end do
+           call stdlib_qlamrg( n1, n2, dlamda, 1, 1, indxc )
+           do i = 1, n
+              indx( i ) = indxq( indxc( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_iqamax( n, z, 1 )
+           jmax = stdlib_iqamax( n, d, 1 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           tol = eight*eps*max( abs( d( jmax ) ), abs( z( imax ) ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              iq2 = 1
+              do j = 1, n
+                 i = indx( j )
+                 call stdlib_qcopy( n, q( 1, i ), 1, q2( iq2 ), 1 )
+                 dlamda( j ) = d( i )
+                 iq2 = iq2 + n
+              end do
+              call stdlib_qlacpy( 'A', n, n, q2, n, q, ldq )
+              call stdlib_qcopy( n, dlamda, 1, d, 1 )
+              go to 190
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           do i = 1, n1
+              coltyp( i ) = 1
+           end do
+           do i = n1p1, n
+              coltyp( i ) = 3
+           end do
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              nj = indx( j )
+              if( rho*abs( z( nj ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 coltyp( nj ) = 4
+                 indxp( k2 ) = nj
+                 if( j==n )go to 100
+              else
+                 pj = nj
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           nj = indx( j )
+           if( j>n )go to 100
+           if( rho*abs( z( nj ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              coltyp( nj ) = 4
+              indxp( k2 ) = nj
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( pj )
+              c = z( nj )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_qlapy2( c, s )
+              t = d( nj ) - d( pj )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( nj ) = tau
+                 z( pj ) = zero
+                 if( coltyp( nj )/=coltyp( pj ) )coltyp( nj ) = 2
+                 coltyp( pj ) = 4
+                 call stdlib_qrot( n, q( 1, pj ), 1, q( 1, nj ), 1, c, s )
+                 t = d( pj )*c**2 + d( nj )*s**2
+                 d( nj ) = d( pj )*s**2 + d( nj )*c**2
+                 d( pj ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( pj )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = pj
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = pj
+                    end if
+                 else
+                    indxp( k2+i-1 ) = pj
+                 end if
+                 pj = nj
+              else
+                 k = k + 1
+                 dlamda( k ) = d( pj )
+                 w( k ) = z( pj )
+                 indxp( k ) = pj
+                 pj = nj
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           dlamda( k ) = d( pj )
+           w( k ) = z( pj )
+           indxp( k ) = pj
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four uniform groups (although one or more of these groups may be
+           ! empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 1, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 1
+           psm( 2 ) = 1 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           k = n - ctot( 4 )
+           ! fill out the indxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's.
+           do j = 1, n
+              js = indxp( j )
+              ct = coltyp( js )
+              indx( psm( ct ) ) = js
+              indxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           i = 1
+           iq1 = 1
+           iq2 = 1 + ( ctot( 1 )+ctot( 2 ) )*n1
+           do j = 1, ctot( 1 )
+              js = indx( i )
+              call stdlib_qcopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+           end do
+           do j = 1, ctot( 2 )
+              js = indx( i )
+              call stdlib_qcopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              call stdlib_qcopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+              iq2 = iq2 + n2
+           end do
+           do j = 1, ctot( 3 )
+              js = indx( i )
+              call stdlib_qcopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq2 = iq2 + n2
+           end do
+           iq1 = iq2
+           do j = 1, ctot( 4 )
+              js = indx( i )
+              call stdlib_qcopy( n, q( 1, js ), 1, q2( iq2 ), 1 )
+              iq2 = iq2 + n
+              z( i ) = d( js )
+              i = i + 1
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_qlacpy( 'A', n, ctot( 4 ), q2( iq1 ), n,q( 1, k+1 ), ldq )
+              call stdlib_qcopy( n-k, z( k+1 ), 1, d( k+1 ), 1 )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_qlaed3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           190 continue
+           return
+     end subroutine stdlib_qlaed2
+
+     !> DLAED3: finds the roots of the secular equation, as defined by the
+     !> values in D, W, and RHO, between 1 and K.  It makes the
+     !> appropriate calls to DLAED4 and then updates the eigenvectors by
+     !> multiplying the matrix of eigenvectors of the pair of eigensystems
+     !> being combined by the matrix of eigenvectors of the K-by-K system
+     !> which is solved here.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_qlaed3( k, n, n1, d, q, ldq, rho, dlamda, q2, indx,ctot, w, s, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, n, n1
+           real(qp), intent(in) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), indx(*)
+           real(qp), intent(out) :: d(*), q(ldq,*), s(*)
+           real(qp), intent(inout) :: dlamda(*), w(*)
+           real(qp), intent(in) :: q2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, iq2, j, n12, n2, n23
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( n<k ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dlamda( i ) = stdlib_qlamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = 1, k
+              call stdlib_qlaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 )go to 110
+           if( k==2 ) then
+              do j = 1, k
+                 w( 1 ) = q( 1, j )
+                 w( 2 ) = q( 2, j )
+                 ii = indx( 1 )
+                 q( 1, j ) = w( ii )
+                 ii = indx( 2 )
+                 q( 2, j ) = w( ii )
+              end do
+              go to 110
+           end if
+           ! compute updated w.
+           call stdlib_qcopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_qcopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 s( i ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_qnrm2( k, s, 1 )
+              do i = 1, k
+                 ii = indx( i )
+                 q( i, j ) = s( ii ) / temp
+              end do
+           end do
+           ! compute the updated eigenvectors.
+           110 continue
+           n2 = n - n1
+           n12 = ctot( 1 ) + ctot( 2 )
+           n23 = ctot( 2 ) + ctot( 3 )
+           call stdlib_qlacpy( 'A', n23, k, q( ctot( 1 )+1, 1 ), ldq, s, n23 )
+           iq2 = n1*n12 + 1
+           if( n23/=0 ) then
+              call stdlib_qgemm( 'N', 'N', n2, k, n23, one, q2( iq2 ), n2, s, n23,zero, q( n1+1, &
+                        1 ), ldq )
+           else
+              call stdlib_qlaset( 'A', n2, k, zero, zero, q( n1+1, 1 ), ldq )
+           end if
+           call stdlib_qlacpy( 'A', n12, k, q, ldq, s, n12 )
+           if( n12/=0 ) then
+              call stdlib_qgemm( 'N', 'N', n1, k, n12, one, q2, n1, s, n12, zero, q,ldq )
+           else
+              call stdlib_qlaset( 'A', n1, k, zero, zero, q( 1, 1 ), ldq )
+           end if
+           120 continue
+           return
+     end subroutine stdlib_qlaed3
+
+     !> This subroutine computes the I-th updated eigenvalue of a symmetric
+     !> rank-one modification to a diagonal matrix whose elements are
+     !> given in the array d, and that
+     !> D(i) < D(j)  for  i < j
+     !> and that RHO > 0.  This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D )  +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_qlaed4( n, i, d, z, delta, rho, dlam, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(out) :: dlam
+           real(qp), intent(in) :: rho
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), z(*)
+           real(qp), intent(out) :: delta(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(qp) :: a, b, c, del, dltlb, dltub, dphi, dpsi, dw, eps, erretm, eta, midpt, phi, &
+                     prew, psi, rhoinv, tau, temp, temp1, w
+           ! Local Arrays 
+           real(qp) :: zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+               ! presumably, i=1 upon entry
+              dlam = d( 1 ) + rho*z( 1 )*z( 1 )
+              delta( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_qlaed5( i, d, z, delta, rho, dlam )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_qlamch( 'EPSILON' )
+           rhoinv = one / rho
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              midpt = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / delta( ii ) +z( n )*z( n ) / delta( n )
+              if( w<=zero ) then
+                 temp = z( n-1 )*z( n-1 ) / ( d( n )-d( n-1 )+rho ) +z( n )*z( n ) / rho
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    del = d( n ) - d( n-1 )
+                    a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*del
+                    if( a<zero ) then
+                       tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                 end if
+                 ! it can be proved that
+                     ! d(n)+rho/2 <= lambda(n) < d(n)+tau <= d(n)+rho
+                 dltlb = midpt
+                 dltub = rho
+              else
+                 del = d( n ) - d( n-1 )
+                 a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*del
+                 if( a<zero ) then
+                    tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 ! it can be proved that
+                     ! d(n) < d(n)+tau < lambda(n) < d(n)+rho/2
+                 dltlb = zero
+                 dltub = midpt
+              end if
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 dlam = d( i ) + tau
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              c = w - delta( n-1 )*dpsi - delta( n )*dphi
+              a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+              b = delta( n-1 )*delta( n )*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                ! eta = b/a
+                 ! eta = rho - tau
+                 eta = dltub - tau
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              tau = tau + eta
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    dlam = d( i ) + tau
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 c = w - delta( n-1 )*dpsi - delta( n )*dphi
+                 a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+                 b = delta( n-1 )*delta( n )*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 temp = z( n ) / delta( n )
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                           
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              dlam = d( i ) + tau
+              go to 250
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              del = d( ip1 ) - d( i )
+              midpt = del / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / delta( i ) +z( ip1 )*z( ip1 ) / delta( ip1 )
+              if( w>zero ) then
+                 ! d(i)< the ith eigenvalue < (d(i)+d(i+1))/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 a = c*del + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*del
+                 if( a>zero ) then
+                    tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = zero
+                 dltub = midpt
+              else
+                 ! (d(i)+d(i+1))/2 <= the ith eigenvalue < d(i+1)
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 a = c*del - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*del
+                 if( a<zero ) then
+                    tau = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = -midpt
+                 dltub = zero
+              end if
+              if( orgati ) then
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( i ) ) - tau
+                 end do
+              else
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( ip1 ) ) - tau
+                 end do
+              end if
+              if( orgati ) then
+                 ii = i
+              else
+                 ii = i + 1
+              end if
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                        *dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 if( orgati ) then
+                    dlam = d( i ) + tau
+                 else
+                    dlam = d( ip1 ) + tau
+                 end if
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 if( orgati ) then
+                    c = w - delta( ip1 )*dw - ( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                              **2
+                 else
+                    c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                              **2
+                 end if
+                 a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                 b = delta( i )*delta( ip1 )*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + delta( i )*delta( i )*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / delta( iim1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                 else
+                    temp1 = z( iip1 ) / delta( iip1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )*temp1
+                    zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 call stdlib_qlaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                 if( info/=0 )go to 250
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              prew = w
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau+eta )&
+                        *dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              tau = tau + eta
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_240: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    if( orgati ) then
+                       dlam = d( i ) + tau
+                    else
+                       dlam = d( ip1 ) + tau
+                    end if
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - delta( ip1 )*dw -( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                                    **2
+                       else
+                          c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                                    **2
+                       end if
+                    else
+                       temp = z( ii ) / delta( ii )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - delta( i )*dpsi - delta( ip1 )*dphi
+                    end if
+                    a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                    b = delta( i )*delta( ip1 )*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                                          
+                             else
+                                a = z( ip1 )*z( ip1 ) +delta( i )*delta( i )*( dpsi+dphi )
+                             end if
+                          else
+                             a = delta( i )*delta( i )*dpsi +delta( ip1 )*delta( ip1 )&
+                                       *dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - delta( iim1 )*dpsi - delta( iip1 )*dphi
+                       zz( 1 ) = delta( iim1 )*delta( iim1 )*dpsi
+                       zz( 3 ) = delta( iip1 )*delta( iip1 )*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / delta( iim1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )&
+                                    *temp1
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                       else
+                          temp1 = z( iip1 ) / delta( iip1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )&
+                                    *temp1
+                          zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    call stdlib_qlaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                    if( info/=0 )go to 250
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 prew = w
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / delta( j )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 temp = z( ii ) / delta( ii )
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                           *dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_240
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              if( orgati ) then
+                 dlam = d( i ) + tau
+              else
+                 dlam = d( ip1 ) + tau
+              end if
+           end if
+           250 continue
+           return
+     end subroutine stdlib_qlaed4
+
+     !> This subroutine computes the I-th eigenvalue of a symmetric rank-one
+     !> modification of a 2-by-2 diagonal matrix
+     !> diag( D )  +  RHO * Z * transpose(Z) .
+     !> The diagonal elements in the array D are assumed to satisfy
+     !> D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_qlaed5( i, d, z, delta, rho, dlam )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(qp), intent(out) :: dlam
+           real(qp), intent(in) :: rho
+           ! Array Arguments 
+           real(qp), intent(in) :: d(2), z(2)
+           real(qp), intent(out) :: delta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: b, c, del, tau, temp, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           if( i==1 ) then
+              w = one + two*rho*( z( 2 )*z( 2 )-z( 1 )*z( 1 ) ) / del
+              if( w>zero ) then
+                 b = del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*del
+                 ! b > zero, always
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 dlam = d( 1 ) + tau
+                 delta( 1 ) = -z( 1 ) / tau
+                 delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*del
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 dlam = d( 2 ) + tau
+                 delta( 1 ) = -z( 1 ) / ( del+tau )
+                 delta( 2 ) = -z( 2 ) / tau
+              end if
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           else
+           ! now i=2
+              b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*del
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              dlam = d( 2 ) + tau
+              delta( 1 ) = -z( 1 ) / ( del+tau )
+              delta( 2 ) = -z( 2 ) / tau
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_qlaed5
+
+     !> DLAED6: computes the positive or negative root (closest to the origin)
+     !> of
+     !> z(1)        z(2)        z(3)
+     !> f(x) =   rho + --------- + ---------- + ---------
+     !> d(1)-x      d(2)-x      d(3)-x
+     !> It is assumed that
+     !> if ORGATI = .true. the root is between d(2) and d(3);
+     !> otherwise it is between d(1) and d(2)
+     !> This routine will be called by DLAED4 when necessary. In most cases,
+     !> the root sought is the smallest in magnitude, though it might not be
+     !> in some extremely rare situations.
+
+     pure subroutine stdlib_qlaed6( kniter, orgati, rho, d, z, finit, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: orgati
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kniter
+           real(qp), intent(in) :: finit, rho
+           real(qp), intent(out) :: tau
+           ! Array Arguments 
+           real(qp), intent(in) :: d(3), z(3)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           
+           
+           ! Local Arrays 
+           real(qp) :: dscale(3), zscale(3)
+           ! Local Scalars 
+           logical(lk) :: scale
+           integer(ilp) :: i, iter, niter
+           real(qp) :: a, b, base, c, ddf, df, eps, erretm, eta, f, fc, sclfac, sclinv, small1, &
+                     small2, sminv1, sminv2, temp, temp1, temp2, temp3, temp4, lbd, ubd
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           if( orgati ) then
+              lbd = d(2)
+              ubd = d(3)
+           else
+              lbd = d(1)
+              ubd = d(2)
+           end if
+           if( finit < zero )then
+              lbd = zero
+           else
+              ubd = zero
+           end if
+           niter = 1
+           tau = zero
+           if( kniter==2 ) then
+              if( orgati ) then
+                 temp = ( d( 3 )-d( 2 ) ) / two
+                 c = rho + z( 1 ) / ( ( d( 1 )-d( 2 ) )-temp )
+                 a = c*( d( 2 )+d( 3 ) ) + z( 2 ) + z( 3 )
+                 b = c*d( 2 )*d( 3 ) + z( 2 )*d( 3 ) + z( 3 )*d( 2 )
+              else
+                 temp = ( d( 1 )-d( 2 ) ) / two
+                 c = rho + z( 3 ) / ( ( d( 3 )-d( 2 ) )-temp )
+                 a = c*( d( 1 )+d( 2 ) ) + z( 1 ) + z( 2 )
+                 b = c*d( 1 )*d( 2 ) + z( 1 )*d( 2 ) + z( 2 )*d( 1 )
+              end if
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 tau = b / a
+              else if( a<=zero ) then
+                 tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( tau < lbd .or. tau > ubd )tau = ( lbd+ubd )/two
+              if( d(1)==tau .or. d(2)==tau .or. d(3)==tau ) then
+                 tau = zero
+              else
+                 temp = finit + tau*z(1)/( d(1)*( d( 1 )-tau ) ) +tau*z(2)/( d(2)*( d( 2 )-tau ) )&
+                            +tau*z(3)/( d(3)*( d( 3 )-tau ) )
+                 if( temp <= zero )then
+                    lbd = tau
+                 else
+                    ubd = tau
+                 end if
+                 if( abs( finit )<=abs( temp ) )tau = zero
+              end if
+           end if
+           ! get machine parameters for possible scaling to avoid overflow
+           ! modified by sven: parameters small1, sminv1, small2,
+           ! sminv2, eps are not saved anymore between one call to the
+           ! others but recomputed at each call
+           eps = stdlib_qlamch( 'EPSILON' )
+           base = stdlib_qlamch( 'BASE' )
+           small1 = base**( int( log( stdlib_qlamch( 'SAFMIN' ) ) / log( base ) /three,KIND=ilp) )
+                     
+           sminv1 = one / small1
+           small2 = small1*small1
+           sminv2 = sminv1*sminv1
+           ! determine if scaling of inputs necessary to avoid overflow
+           ! when computing 1/temp**3
+           if( orgati ) then
+              temp = min( abs( d( 2 )-tau ), abs( d( 3 )-tau ) )
+           else
+              temp = min( abs( d( 1 )-tau ), abs( d( 2 )-tau ) )
+           end if
+           scale = .false.
+           if( temp<=small1 ) then
+              scale = .true.
+              if( temp<=small2 ) then
+              ! scale up by power of radix nearest 1/safmin**(2/3)
+                 sclfac = sminv2
+                 sclinv = small2
+              else
+              ! scale up by power of radix nearest 1/safmin**(1/3)
+                 sclfac = sminv1
+                 sclinv = small1
+              end if
+              ! scaling up safe because d, z, tau scaled elsewhere to be o(1)
+              do i = 1, 3
+                 dscale( i ) = d( i )*sclfac
+                 zscale( i ) = z( i )*sclfac
+              end do
+              tau = tau*sclfac
+              lbd = lbd*sclfac
+              ubd = ubd*sclfac
+           else
+              ! copy d and z to dscale and zscale
+              do i = 1, 3
+                 dscale( i ) = d( i )
+                 zscale( i ) = z( i )
+              end do
+           end if
+           fc = zero
+           df = zero
+           ddf = zero
+           do i = 1, 3
+              temp = one / ( dscale( i )-tau )
+              temp1 = zscale( i )*temp
+              temp2 = temp1*temp
+              temp3 = temp2*temp
+              fc = fc + temp1 / dscale( i )
+              df = df + temp2
+              ddf = ddf + temp3
+           end do
+           f = finit + tau*fc
+           if( abs( f )<=zero )go to 60
+           if( f <= zero )then
+              lbd = tau
+           else
+              ubd = tau
+           end if
+              ! iteration begins -- use gragg-thornton-warner cubic convergent
+                                  ! scheme
+           ! it is not hard to see that
+                 ! 1) iterations will go up monotonically
+                    ! if finit < 0;
+                 ! 2) iterations will go down monotonically
+                    ! if finit > 0.
+           iter = niter + 1
+           loop_50: do niter = iter, maxit
+              if( orgati ) then
+                 temp1 = dscale( 2 ) - tau
+                 temp2 = dscale( 3 ) - tau
+              else
+                 temp1 = dscale( 1 ) - tau
+                 temp2 = dscale( 2 ) - tau
+              end if
+              a = ( temp1+temp2 )*f - temp1*temp2*df
+              b = temp1*temp2*f
+              c = f - ( temp1+temp2 )*df + temp1*temp2*ddf
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 eta = b / a
+              else if( a<=zero ) then
+                 eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( f*eta>=zero ) then
+                 eta = -f / df
+              end if
+              tau = tau + eta
+              if( tau < lbd .or. tau > ubd )tau = ( lbd + ubd )/two
+              fc = zero
+              erretm = zero
+              df = zero
+              ddf = zero
+              do i = 1, 3
+                 if ( ( dscale( i )-tau )/=zero ) then
+                    temp = one / ( dscale( i )-tau )
+                    temp1 = zscale( i )*temp
+                    temp2 = temp1*temp
+                    temp3 = temp2*temp
+                    temp4 = temp1 / dscale( i )
+                    fc = fc + temp4
+                    erretm = erretm + abs( temp4 )
+                    df = df + temp2
+                    ddf = ddf + temp3
+                 else
+                    go to 60
+                 end if
+              end do
+              f = finit + tau*fc
+              erretm = eight*( abs( finit )+abs( tau )*erretm ) +abs( tau )*df
+              if( ( abs( f )<=four*eps*erretm ) .or.( (ubd-lbd)<=four*eps*abs(tau) )  )go to &
+                        60
+              if( f <= zero )then
+                 lbd = tau
+              else
+                 ubd = tau
+              end if
+           end do loop_50
+           info = 1
+           60 continue
+           ! undo scaling
+           if( scale )tau = tau*sclinv
+           return
+     end subroutine stdlib_qlaed6
+
+     !> DLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense symmetric matrix
+     !> that has been reduced to tridiagonal form.  DLAED1 handles
+     !> the case in which all eigenvalues and eigenvectors of a symmetric
+     !> tridiagonal matrix are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**Tu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED8.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by DLAED9).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_qlaed7( icompq, n, qsiz, tlvls, curlvl, curpbm, d, q,ldq, indxq, rho, &
+               cutpnt, qstore, qptr, prmptr,perm, givptr, givcol, givnum, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, icompq, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(qp), intent(inout) :: d(*), givnum(2,*), q(ldq,*), qstore(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, ldq2, &
+                     n1, n2, ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_qlaed8 and stdlib_qlaed9.
+           if( icompq==1 ) then
+              ldq2 = qsiz
+           else
+              ldq2 = n
+           end if
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           is = iq2 + n*ldq2
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_qlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, work( iz ),work( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_qlaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho, cutpnt,work( iz ), work(&
+            idlmda ), work( iq2 ), ldq2,work( iw ), perm( prmptr( curr ) ), givptr( curr+1 ),&
+            givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), iwork( indxp ),iwork( indx ),&
+                       info )
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_qlaed9( k, 1, k, n, d, work( is ), k, rho, work( idlmda ),work( iw ), &
+                        qstore( qptr( curr ) ), k, info )
+              if( info/=0 )go to 30
+              if( icompq==1 ) then
+                 call stdlib_qgemm( 'N', 'N', qsiz, k, k, one, work( iq2 ), ldq2,qstore( qptr( &
+                           curr ) ), k, zero, q, ldq )
+              end if
+              qptr( curr+1 ) = qptr( curr ) + k**2
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_qlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           30 continue
+           return
+     end subroutine stdlib_qlaed7
+
+     !> DLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_qlaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho,cutpnt, z, dlamda, &
+               q2, ldq2, w, perm, givptr,givcol, givnum, indxp, indx, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, icompq, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(qp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(qp), intent(out) :: dlamda(*), givnum(2,*), q2(ldq2,*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: mone = -1.0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(qp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -3
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -4
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -10
+           else if( ldq2<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_qscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_qscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_qlamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_iqamax( n, z, 1 )
+           jmax = stdlib_iqamax( n, d, 1 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              if( icompq==0 ) then
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                 end do
+              else
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                    call stdlib_qcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+                 end do
+                 call stdlib_qlacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ),ldq )
+              end if
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 110
+              else
+                 jlam = j
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           if( j>n )go to 100
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_qlapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 if( icompq==1 ) then
+                    call stdlib_qrot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j &
+                              ) ) ), 1, c, s )
+                 end if
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           110 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           if( icompq==0 ) then
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+              end do
+           else
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+                 call stdlib_qcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+           end if
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              if( icompq==0 ) then
+                 call stdlib_qcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              else
+                 call stdlib_qcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+                 call stdlib_qlacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2,q( 1, k+1 ), ldq )
+              end if
+           end if
+           return
+     end subroutine stdlib_qlaed8
+
+     !> DLAED9: finds the roots of the secular equation, as defined by the
+     !> values in D, Z, and RHO, between KSTART and KSTOP.  It makes the
+     !> appropriate calls to DLAED4 and then stores the new matrix of
+     !> eigenvectors for use in calculating the next level of Z vectors.
+
+     pure subroutine stdlib_qlaed9( k, kstart, kstop, n, d, q, ldq, rho, dlamda, w,s, lds, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, kstart, kstop, ldq, lds, n
+           real(qp), intent(in) :: rho
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), q(ldq,*), s(lds,*)
+           real(qp), intent(inout) :: dlamda(*), w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( kstart<1 .or. kstart>max( 1, k ) ) then
+              info = -2
+           else if( max( 1, kstop )<kstart .or. kstop>max( 1, k ) )then
+              info = -3
+           else if( n<k ) then
+              info = -4
+           else if( ldq<max( 1, k ) ) then
+              info = -7
+           else if( lds<max( 1, k ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAED9', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, n
+              dlamda( i ) = stdlib_qlamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = kstart, kstop
+              call stdlib_qlaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 .or. k==2 ) then
+              do i = 1, k
+                 do j = 1, k
+                    s( j, i ) = q( j, i )
+                 end do
+              end do
+              go to 120
+           end if
+           ! compute updated w.
+           call stdlib_qcopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_qcopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i, 1 ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 q( i, j ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_qnrm2( k, q( 1, j ), 1 )
+              do i = 1, k
+                 s( i, j ) = q( i, j ) / temp
+              end do
+           end do
+           120 continue
+           return
+     end subroutine stdlib_qlaed9
+
+     !> DLAEDA: computes the Z vector corresponding to the merge step in the
+     !> CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+     !> problem.
+
+     pure subroutine stdlib_qlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum,&
+                q, qptr, z, ztemp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, n, tlvls
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+           real(qp), intent(in) :: givnum(2,*), q(*)
+           real(qp), intent(out) :: z(*), ztemp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bsiz1, bsiz2, curr, i, k, mid, psiz1, psiz2, ptr, zptr1
+           ! Intrinsic Functions 
+           intrinsic :: real,int,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAEDA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine location of first number in second half.
+           mid = n / 2 + 1
+           ! gather last/first rows of appropriate eigenblocks into center of z
+           ptr = 1
+           ! determine location of lowest level subproblem in the full storage
+           ! scheme
+           curr = ptr + curpbm*2**curlvl + 2**( curlvl-1 ) - 1
+           ! determine size of these matrices.  we add half to the value of
+           ! the sqrt in case the machine underestimates one of these square
+           ! roots.
+           bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=qp) ),KIND=ilp)
+           bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=qp) ),KIND=ilp)
+                     
+           do k = 1, mid - bsiz1 - 1
+              z( k ) = zero
+           end do
+           call stdlib_qcopy( bsiz1, q( qptr( curr )+bsiz1-1 ), bsiz1,z( mid-bsiz1 ), 1 )
+           call stdlib_qcopy( bsiz2, q( qptr( curr+1 ) ), bsiz2, z( mid ), 1 )
+           do k = mid + bsiz2, n
+              z( k ) = zero
+           end do
+           ! loop through remaining levels 1 -> curlvl applying the givens
+           ! rotations and permutation and then multiplying the center matrices
+           ! against the current z.
+           ptr = 2**tlvls + 1
+           loop_70: do k = 1, curlvl - 1
+              curr = ptr + curpbm*2**( curlvl-k ) + 2**( curlvl-k-1 ) - 1
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              zptr1 = mid - psiz1
+             ! apply givens at curr and curr+1
+              do i = givptr( curr ), givptr( curr+1 ) - 1
+                 call stdlib_qrot( 1, z( zptr1+givcol( 1, i )-1 ), 1,z( zptr1+givcol( 2, i )-1 ), &
+                           1, givnum( 1, i ),givnum( 2, i ) )
+              end do
+              do i = givptr( curr+1 ), givptr( curr+2 ) - 1
+                 call stdlib_qrot( 1, z( mid-1+givcol( 1, i ) ), 1,z( mid-1+givcol( 2, i ) ), 1, &
+                           givnum( 1, i ),givnum( 2, i ) )
+              end do
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              do i = 0, psiz1 - 1
+                 ztemp( i+1 ) = z( zptr1+perm( prmptr( curr )+i )-1 )
+              end do
+              do i = 0, psiz2 - 1
+                 ztemp( psiz1+i+1 ) = z( mid+perm( prmptr( curr+1 )+i )-1 )
+              end do
+              ! multiply blocks at curr and curr+1
+              ! determine size of these matrices.  we add half to the value of
+              ! the sqrt in case the machine underestimates one of these
+              ! square roots.
+              bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=qp) ),KIND=ilp)
+                        
+              bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=qp) ),KIND=ilp)
+                        
+              if( bsiz1>0 ) then
+                 call stdlib_qgemv( 'T', bsiz1, bsiz1, one, q( qptr( curr ) ),bsiz1, ztemp( 1 ), &
+                           1, zero, z( zptr1 ), 1 )
+              end if
+              call stdlib_qcopy( psiz1-bsiz1, ztemp( bsiz1+1 ), 1, z( zptr1+bsiz1 ),1 )
+              if( bsiz2>0 ) then
+                 call stdlib_qgemv( 'T', bsiz2, bsiz2, one, q( qptr( curr+1 ) ),bsiz2, ztemp( &
+                           psiz1+1 ), 1, zero, z( mid ), 1 )
+              end if
+              call stdlib_qcopy( psiz2-bsiz2, ztemp( psiz1+bsiz2+1 ), 1,z( mid+bsiz2 ), 1 )
+                        
+              ptr = ptr + 2**( tlvls-k )
+           end do loop_70
+           return
+     end subroutine stdlib_qlaeda
+
+     !> DLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue (WR,WI) of a real upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_qlaein( rightv, noinit, n, h, ldh, wr, wi, vr, vi, b,ldb, work, eps3, &
+               smlnum, bignum, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(qp), intent(in) :: bignum, eps3, smlnum, wi, wr
+           ! Array Arguments 
+           real(qp), intent(out) :: b(ldb,*), work(*)
+           real(qp), intent(in) :: h(ldh,*)
+           real(qp), intent(inout) :: vi(*), vr(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: tenth = 1.0e-1_qp
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, i1, i2, i3, ierr, its, j
+           real(qp) :: absbii, absbjj, ei, ej, growto, norm, nrmsml, rec, rootn, scale, temp, &
+                     vcrit, vmax, vnorm, w, w1, x, xi, xr, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=qp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - (wr,wi)*i (except that the subdiagonal elements and
+           ! the imaginary parts of the diagonal elements are not stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - wr
+           end do
+           if( wi==zero ) then
+              ! real eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 vnorm = stdlib_qnrm2( n, vr, 1 )
+                 call stdlib_qscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), vr,1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do i = 1, n - 1
+                    ei = h( i+1, i )
+                    if( abs( b( i, i ) )<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       x = b( i, i ) / ei
+                       b( i, i ) = ei
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( i, i )==zero )b( i, i ) = eps3
+                       x = ei / b( i, i )
+                       if( x/=zero ) then
+                          do j = i + 1, n
+                             b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( n, n )==zero )b( n, n ) = eps3
+                 trans = 'N'
+              else
+                 ! ul decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    if( abs( b( j, j ) )<abs( ej ) ) then
+                       ! interchange columns and eliminate.
+                       x = b( j, j ) / ej
+                       b( j, j ) = ej
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( j, j )==zero )b( j, j ) = eps3
+                       x = ej / b( j, j )
+                       if( x/=zero ) then
+                          do i = 1, j - 1
+                             b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( 1, 1 )==zero )b( 1, 1 ) = eps3
+                 trans = 'T'
+              end if
+              normin = 'N'
+              do its = 1, n
+                 ! solve u*x = scale*v for a right eigenvector
+                   ! or u**t*x = scale*v for a left eigenvector,
+                 ! overwriting x on v.
+                 call stdlib_qlatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb,vr, scale, work,&
+                            ierr )
+                 normin = 'Y'
+                 ! test for sufficient growth in the norm of v.
+                 vnorm = stdlib_qasum( n, vr, 1 )
+                 if( vnorm>=growto*scale )go to 120
+                 ! choose new orthogonal starting vector and try again.
+                 temp = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 do i = 2, n
+                    vr( i ) = temp
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do
+              ! failure to find eigenvector in n iterations.
+              info = 1
+              120 continue
+              ! normalize eigenvector.
+              i = stdlib_iqamax( n, vr, 1 )
+              call stdlib_qscal( n, one / abs( vr( i ) ), vr, 1 )
+           else
+              ! complex eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                    vi( i ) = zero
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 norm = stdlib_qlapy2( stdlib_qnrm2( n, vr, 1 ), stdlib_qnrm2( n, vi, 1 ) )
+                           
+                 rec = ( eps3*rootn ) / max( norm, nrmsml )
+                 call stdlib_qscal( n, rec, vr, 1 )
+                 call stdlib_qscal( n, rec, vi, 1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( 2, 1 ) = -wi
+                 do i = 2, n
+                    b( i+1, 1 ) = zero
+                 end do
+                 loop_170: do i = 1, n - 1
+                    absbii = stdlib_qlapy2( b( i, i ), b( i+1, i ) )
+                    ei = h( i+1, i )
+                    if( absbii<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       xr = b( i, i ) / ei
+                       xi = b( i+1, i ) / ei
+                       b( i, i ) = ei
+                       b( i+1, i ) = zero
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - xr*temp
+                          b( j+1, i+1 ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( i+2, i ) = -wi
+                       b( i+1, i+1 ) = b( i+1, i+1 ) - xi*wi
+                       b( i+2, i+1 ) = b( i+2, i+1 ) + xr*wi
+                    else
+                       ! eliminate without interchanging rows.
+                       if( absbii==zero ) then
+                          b( i, i ) = eps3
+                          b( i+1, i ) = zero
+                          absbii = eps3
+                       end if
+                       ei = ( ei / absbii ) / absbii
+                       xr = b( i, i )*ei
+                       xi = -b( i+1, i )*ei
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j+1, i+1 ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( i+2, i+1 ) = b( i+2, i+1 ) - wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of i-th row.
+                    work( i ) = stdlib_qasum( n-i, b( i, i+1 ), ldb ) +stdlib_qasum( n-i, b( i+2, &
+                              i ), 1 )
+                 end do loop_170
+                 if( b( n, n )==zero .and. b( n+1, n )==zero )b( n, n ) = eps3
+                 work( n ) = zero
+                 i1 = n
+                 i2 = 1
+                 i3 = -1
+              else
+                 ! ul decomposition with partial pivoting of conjg(b),
+                 ! replacing zero pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( n+1, n ) = wi
+                 do j = 1, n - 1
+                    b( n+1, j ) = zero
+                 end do
+                 loop_210: do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    absbjj = stdlib_qlapy2( b( j, j ), b( j+1, j ) )
+                    if( absbjj<abs( ej ) ) then
+                       ! interchange columns and eliminate
+                       xr = b( j, j ) / ej
+                       xi = b( j+1, j ) / ej
+                       b( j, j ) = ej
+                       b( j+1, j ) = zero
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - xr*temp
+                          b( j, i ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( j+1, j-1 ) = wi
+                       b( j-1, j-1 ) = b( j-1, j-1 ) + xi*wi
+                       b( j, j-1 ) = b( j, j-1 ) - xr*wi
+                    else
+                       ! eliminate without interchange.
+                       if( absbjj==zero ) then
+                          b( j, j ) = eps3
+                          b( j+1, j ) = zero
+                          absbjj = eps3
+                       end if
+                       ej = ( ej / absbjj ) / absbjj
+                       xr = b( j, j )*ej
+                       xi = -b( j+1, j )*ej
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j, i ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( j, j-1 ) = b( j, j-1 ) + wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of j-th column.
+                    work( j ) = stdlib_qasum( j-1, b( 1, j ), 1 ) +stdlib_qasum( j-1, b( j+1, 1 ),&
+                               ldb )
+                 end do loop_210
+                 if( b( 1, 1 )==zero .and. b( 2, 1 )==zero )b( 1, 1 ) = eps3
+                 work( 1 ) = zero
+                 i1 = 1
+                 i2 = n
+                 i3 = 1
+              end if
+              loop_270: do its = 1, n
+                 scale = one
+                 vmax = one
+                 vcrit = bignum
+                 ! solve u*(xr,xi) = scale*(vr,vi) for a right eigenvector,
+                   ! or u**t*(xr,xi) = scale*(vr,vi) for a left eigenvector,
+                 ! overwriting (xr,xi) on (vr,vi).
+                 loop_250: do i = i1, i2, i3
+                    if( work( i )>vcrit ) then
+                       rec = one / vmax
+                       call stdlib_qscal( n, rec, vr, 1 )
+                       call stdlib_qscal( n, rec, vi, 1 )
+                       scale = scale*rec
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                    xr = vr( i )
+                    xi = vi( i )
+                    if( rightv ) then
+                       do j = i + 1, n
+                          xr = xr - b( i, j )*vr( j ) + b( j+1, i )*vi( j )
+                          xi = xi - b( i, j )*vi( j ) - b( j+1, i )*vr( j )
+                       end do
+                    else
+                       do j = 1, i - 1
+                          xr = xr - b( j, i )*vr( j ) + b( i+1, j )*vi( j )
+                          xi = xi - b( j, i )*vi( j ) - b( i+1, j )*vr( j )
+                       end do
+                    end if
+                    w = abs( b( i, i ) ) + abs( b( i+1, i ) )
+                    if( w>smlnum ) then
+                       if( w<one ) then
+                          w1 = abs( xr ) + abs( xi )
+                          if( w1>w*bignum ) then
+                             rec = one / w1
+                             call stdlib_qscal( n, rec, vr, 1 )
+                             call stdlib_qscal( n, rec, vi, 1 )
+                             xr = vr( i )
+                             xi = vi( i )
+                             scale = scale*rec
+                             vmax = vmax*rec
+                          end if
+                       end if
+                       ! divide by diagonal element of b.
+                       call stdlib_qladiv( xr, xi, b( i, i ), b( i+1, i ), vr( i ),vi( i ) )
+                                 
+                       vmax = max( abs( vr( i ) )+abs( vi( i ) ), vmax )
+                       vcrit = bignum / vmax
+                    else
+                       do j = 1, n
+                          vr( j ) = zero
+                          vi( j ) = zero
+                       end do
+                       vr( i ) = one
+                       vi( i ) = one
+                       scale = zero
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                 end do loop_250
+                 ! test for sufficient growth in the norm of (vr,vi).
+                 vnorm = stdlib_qasum( n, vr, 1 ) + stdlib_qasum( n, vi, 1 )
+                 if( vnorm>=growto*scale )go to 280
+                 ! choose a new orthogonal starting vector and try again.
+                 y = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 vi( 1 ) = zero
+                 do i = 2, n
+                    vr( i ) = y
+                    vi( i ) = zero
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do loop_270
+              ! failure to find eigenvector in n iterations
+              info = 1
+              280 continue
+              ! normalize eigenvector.
+              vnorm = zero
+              do i = 1, n
+                 vnorm = max( vnorm, abs( vr( i ) )+abs( vi( i ) ) )
+              end do
+              call stdlib_qscal( n, one / vnorm, vr, 1 )
+              call stdlib_qscal( n, one / vnorm, vi, 1 )
+           end if
+           return
+     end subroutine stdlib_qlaein
+
+     !> DLAEV2: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  SN1 ] [  A   B  ] [ CS1 -SN1 ]  =  [ RT1  0  ]
+     !> [-SN1  CS1 ] [  B   C  ] [ SN1  CS1 ]     [  0  RT2 ].
+
+     pure subroutine stdlib_qlaev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: a, b, c
+           real(qp), intent(out) :: cs1, rt1, rt2, sn1
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: sgn1, sgn2
+           real(qp) :: ab, acmn, acmx, acs, adf, cs, ct, df, rt, sm, tb, tn
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              sgn1 = -1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              sgn1 = 1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+              sgn1 = 1
+           end if
+           ! compute the eigenvector
+           if( df>=zero ) then
+              cs = df + rt
+              sgn2 = 1
+           else
+              cs = df - rt
+              sgn2 = -1
+           end if
+           acs = abs( cs )
+           if( acs>ab ) then
+              ct = -tb / cs
+              sn1 = one / sqrt( one+ct*ct )
+              cs1 = ct*sn1
+           else
+              if( ab==zero ) then
+                 cs1 = one
+                 sn1 = zero
+              else
+                 tn = -cs / tb
+                 cs1 = one / sqrt( one+tn*tn )
+                 sn1 = tn*cs1
+              end if
+           end if
+           if( sgn1==sgn2 ) then
+              tn = cs1
+              cs1 = -sn1
+              sn1 = tn
+           end if
+           return
+     end subroutine stdlib_qlaev2
+
+     !> DLAEXC: swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+     !> an upper quasi-triangular matrix T by an orthogonal similarity
+     !> transformation.
+     !> T must be in Schur canonical form, that is, block upper triangular
+     !> with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+     !> has its diagonal elements equal and its off-diagonal elements of
+     !> opposite sign.
+
+     subroutine stdlib_qlaexc( wantq, n, t, ldt, q, ldq, j1, n1, n2, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, ldq, ldt, n, n1, n2
+           ! Array Arguments 
+           real(qp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldd = 4
+           integer(ilp), parameter :: ldx = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, j2, j3, j4, k, nd
+           real(qp) :: cs, dnorm, eps, scale, smlnum, sn, t11, t22, t33, tau, tau1, tau2, temp, &
+                     thresh, wi1, wi2, wr1, wr2, xnorm
+           ! Local Arrays 
+           real(qp) :: d(ldd,4), u(3), u1(3), u2(3), x(ldx,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 .or. n1==0 .or. n2==0 )return
+           if( j1+n1>n )return
+           j2 = j1 + 1
+           j3 = j1 + 2
+           j4 = j1 + 3
+           if( n1==1 .and. n2==1 ) then
+              ! swap two 1-by-1 blocks.
+              t11 = t( j1, j1 )
+              t22 = t( j2, j2 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_qlartg( t( j1, j2 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( j3<=n )call stdlib_qrot( n-j1-1, t( j1, j3 ), ldt, t( j2, j3 ), ldt, cs,sn )
+                        
+              call stdlib_qrot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+              t( j1, j1 ) = t22
+              t( j2, j2 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_qrot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+           else
+              ! swapping involves at least one 2-by-2 block.
+              ! copy the diagonal block of order n1+n2 to the local array d
+              ! and compute its norm.
+              nd = n1 + n2
+              call stdlib_qlacpy( 'FULL', nd, nd, t( j1, j1 ), ldt, d, ldd )
+              dnorm = stdlib_qlange( 'MAX', nd, nd, d, ldd, work )
+              ! compute machine-dependent threshold for test for accepting
+              ! swap.
+              eps = stdlib_qlamch( 'P' )
+              smlnum = stdlib_qlamch( 'S' ) / eps
+              thresh = max( ten*eps*dnorm, smlnum )
+              ! solve t11*x - x*t22 = scale*t12 for x.
+              call stdlib_qlasy2( .false., .false., -1, n1, n2, d, ldd,d( n1+1, n1+1 ), ldd, d( 1,&
+                         n1+1 ), ldd, scale, x,ldx, xnorm, ierr )
+              ! swap the adjacent diagonal blocks.
+              k = n1 + n1 + n2 - 3
+              go to ( 10, 20, 30 )k
+              10 continue
+              ! n1 = 1, n2 = 2: generate elementary reflector h so that:
+              ! ( scale, x11, x12 ) h = ( 0, 0, * )
+              u( 1 ) = scale
+              u( 2 ) = x( 1, 1 )
+              u( 3 ) = x( 1, 2 )
+              call stdlib_qlarfg( 3, u( 3 ), u, 1, tau )
+              u( 3 ) = one
+              t11 = t( j1, j1 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_qlarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_qlarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 3,3 )-t11 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_qlarfx( 'L', 3, n-j1+1, u, tau, t( j1, j1 ), ldt, work )
+              call stdlib_qlarfx( 'R', j2, 3, u, tau, t( 1, j1 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j3, j3 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_qlarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              20 continue
+              ! n1 = 2, n2 = 1: generate elementary reflector h so that:
+              ! h (  -x11 ) = ( * )
+                ! (  -x21 ) = ( 0 )
+                ! ( scale ) = ( 0 )
+              u( 1 ) = -x( 1, 1 )
+              u( 2 ) = -x( 2, 1 )
+              u( 3 ) = scale
+              call stdlib_qlarfg( 3, u( 1 ), u( 2 ), 1, tau )
+              u( 1 ) = one
+              t33 = t( j3, j3 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_qlarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_qlarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 2, 1 ) ), abs( d( 3, 1 ) ), abs( d( 1,1 )-t33 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_qlarfx( 'R', j3, 3, u, tau, t( 1, j1 ), ldt, work )
+              call stdlib_qlarfx( 'L', 3, n-j1, u, tau, t( j1, j2 ), ldt, work )
+              t( j1, j1 ) = t33
+              t( j2, j1 ) = zero
+              t( j3, j1 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_qlarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              30 continue
+              ! n1 = 2, n2 = 2: generate elementary reflectors h(1) and h(2) so
+              ! that:
+              ! h(2) h(1) (  -x11  -x12 ) = (  *  * )
+                        ! (  -x21  -x22 )   (  0  * )
+                        ! ( scale    0  )   (  0  0 )
+                        ! (    0  scale )   (  0  0 )
+              u1( 1 ) = -x( 1, 1 )
+              u1( 2 ) = -x( 2, 1 )
+              u1( 3 ) = scale
+              call stdlib_qlarfg( 3, u1( 1 ), u1( 2 ), 1, tau1 )
+              u1( 1 ) = one
+              temp = -tau1*( x( 1, 2 )+u1( 2 )*x( 2, 2 ) )
+              u2( 1 ) = -temp*u1( 2 ) - x( 2, 2 )
+              u2( 2 ) = -temp*u1( 3 )
+              u2( 3 ) = scale
+              call stdlib_qlarfg( 3, u2( 1 ), u2( 2 ), 1, tau2 )
+              u2( 1 ) = one
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_qlarfx( 'L', 3, 4, u1, tau1, d, ldd, work )
+              call stdlib_qlarfx( 'R', 4, 3, u1, tau1, d, ldd, work )
+              call stdlib_qlarfx( 'L', 3, 4, u2, tau2, d( 2, 1 ), ldd, work )
+              call stdlib_qlarfx( 'R', 4, 3, u2, tau2, d( 1, 2 ), ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 4, 1 ) ),abs( d( 4, 2 ) ) )&
+                        >thresh )go to 50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_qlarfx( 'L', 3, n-j1+1, u1, tau1, t( j1, j1 ), ldt, work )
+              call stdlib_qlarfx( 'R', j4, 3, u1, tau1, t( 1, j1 ), ldt, work )
+              call stdlib_qlarfx( 'L', 3, n-j1+1, u2, tau2, t( j2, j1 ), ldt, work )
+              call stdlib_qlarfx( 'R', j4, 3, u2, tau2, t( 1, j2 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j4, j1 ) = zero
+              t( j4, j2 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_qlarfx( 'R', n, 3, u1, tau1, q( 1, j1 ), ldq, work )
+                 call stdlib_qlarfx( 'R', n, 3, u2, tau2, q( 1, j2 ), ldq, work )
+              end if
+              40 continue
+              if( n2==2 ) then
+                 ! standardize new 2-by-2 block t11
+                 call stdlib_qlanv2( t( j1, j1 ), t( j1, j2 ), t( j2, j1 ),t( j2, j2 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 call stdlib_qrot( n-j1-1, t( j1, j1+2 ), ldt, t( j2, j1+2 ), ldt,cs, sn )
+                 call stdlib_qrot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+                 if( wantq )call stdlib_qrot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+              if( n1==2 ) then
+                 ! standardize new 2-by-2 block t22
+                 j3 = j1 + n2
+                 j4 = j3 + 1
+                 call stdlib_qlanv2( t( j3, j3 ), t( j3, j4 ), t( j4, j3 ),t( j4, j4 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 if( j3+2<=n )call stdlib_qrot( n-j3-1, t( j3, j3+2 ), ldt, t( j4, j3+2 ),ldt, cs,&
+                            sn )
+                 call stdlib_qrot( j3-1, t( 1, j3 ), 1, t( 1, j4 ), 1, cs, sn )
+                 if( wantq )call stdlib_qrot( n, q( 1, j3 ), 1, q( 1, j4 ), 1, cs, sn )
+              end if
+           end if
+           return
+           ! exit with info = 1 if swap was rejected.
+           50 continue
+           info = 1
+           return
+     end subroutine stdlib_qlaexc
+
+     !> DLAG2: computes the eigenvalues of a 2 x 2 generalized eigenvalue
+     !> problem  A - w B, with scaling as necessary to avoid over-/underflow.
+     !> The scaling factor "s" results in a modified eigenvalue equation
+     !> s A - w B
+     !> where  s  is a non-negative scaling factor chosen so that  w,  w B,
+     !> and  s A  do not overflow and, if possible, do not underflow, either.
+
+     pure subroutine stdlib_qlag2( a, lda, b, ldb, safmin, scale1, scale2, wr1,wr2, wi )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(qp), intent(in) :: safmin
+           real(qp), intent(out) :: scale1, scale2, wi, wr1, wr2
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: fuzzy1 = one+1.0e-5_qp
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: a11, a12, a21, a22, abi22, anorm, as11, as12, as22, ascale, b11, b12, b22, &
+           binv11, binv22, bmin, bnorm, bscale, bsize, c1, c2, c3, c4, c5, diff, discr, pp, qq, r,&
+            rtmax, rtmin, s1, s2, safmax, shift, ss, sum, wabs, wbig, wdet, wscale, wsize, &
+                      wsmall
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           rtmin = sqrt( safmin )
+           rtmax = one / rtmin
+           safmax = one / safmin
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a11 = ascale*a( 1, 1 )
+           a21 = ascale*a( 2, 1 )
+           a12 = ascale*a( 1, 2 )
+           a22 = ascale*a( 2, 2 )
+           ! perturb b if necessary to insure non-singularity
+           b11 = b( 1, 1 )
+           b12 = b( 1, 2 )
+           b22 = b( 2, 2 )
+           bmin = rtmin*max( abs( b11 ), abs( b12 ), abs( b22 ), rtmin )
+           if( abs( b11 )<bmin )b11 = sign( bmin, b11 )
+           if( abs( b22 )<bmin )b22 = sign( bmin, b22 )
+           ! scale b
+           bnorm = max( abs( b11 ), abs( b12 )+abs( b22 ), safmin )
+           bsize = max( abs( b11 ), abs( b22 ) )
+           bscale = one / bsize
+           b11 = b11*bscale
+           b12 = b12*bscale
+           b22 = b22*bscale
+           ! compute larger eigenvalue by method described by c. van loan
+           ! ( as is a shifted by -shift*b )
+           binv11 = one / b11
+           binv22 = one / b22
+           s1 = a11*binv11
+           s2 = a22*binv22
+           if( abs( s1 )<=abs( s2 ) ) then
+              as12 = a12 - s1*b12
+              as22 = a22 - s1*b22
+              ss = a21*( binv11*binv22 )
+              abi22 = as22*binv22 - ss*b12
+              pp = half*abi22
+              shift = s1
+           else
+              as12 = a12 - s2*b12
+              as11 = a11 - s2*b11
+              ss = a21*( binv11*binv22 )
+              abi22 = -ss*b12
+              pp = half*( as11*binv11+abi22 )
+              shift = s2
+           end if
+           qq = ss*as12
+           if( abs( pp*rtmin )>=one ) then
+              discr = ( rtmin*pp )**2 + qq*safmin
+              r = sqrt( abs( discr ) )*rtmax
+           else
+              if( pp**2+abs( qq )<=safmin ) then
+                 discr = ( rtmax*pp )**2 + qq*safmax
+                 r = sqrt( abs( discr ) )*rtmin
+              else
+                 discr = pp**2 + qq
+                 r = sqrt( abs( discr ) )
+              end if
+           end if
+           ! note: the test of r in the following if is to cover the case when
+                 ! discr is small and negative and is flushed to zero during
+                 ! the calculation of r.  on machines which have a consistent
+                 ! flush-to-zero threshold and handle numbers above that
+                 ! threshold correctly, it would not be necessary.
+           if( discr>=zero .or. r==zero ) then
+              sum = pp + sign( r, pp )
+              diff = pp - sign( r, pp )
+              wbig = shift + sum
+              ! compute smaller eigenvalue
+              wsmall = shift + diff
+              if( half*abs( wbig )>max( abs( wsmall ), safmin ) ) then
+                 wdet = ( a11*a22-a12*a21 )*( binv11*binv22 )
+                 wsmall = wdet / wbig
+              end if
+              ! choose (real) eigenvalue closest to 2,2 element of a*b**(-1)
+              ! for wr1.
+              if( pp>abi22 ) then
+                 wr1 = min( wbig, wsmall )
+                 wr2 = max( wbig, wsmall )
+              else
+                 wr1 = max( wbig, wsmall )
+                 wr2 = min( wbig, wsmall )
+              end if
+              wi = zero
+           else
+              ! complex eigenvalues
+              wr1 = shift + pp
+              wr2 = wr1
+              wi = r
+           end if
+           ! further scaling to avoid underflow and overflow in computing
+           ! scale1 and overflow in computing w*b.
+           ! this scale factor (wscale) is bounded from above using c1 and c2,
+           ! and from below using c3 and c4.
+              ! c1 implements the condition  s a  must never overflow.
+              ! c2 implements the condition  w b  must never overflow.
+              ! c3, with c2,
+                 ! implement the condition that s a - w b must never overflow.
+              ! c4 implements the condition  s    should not underflow.
+              ! c5 implements the condition  max(s,|w|) should be at least 2.
+           c1 = bsize*( safmin*max( one, ascale ) )
+           c2 = safmin*max( one, bnorm )
+           c3 = bsize*safmin
+           if( ascale<=one .and. bsize<=one ) then
+              c4 = min( one, ( ascale / safmin )*bsize )
+           else
+              c4 = one
+           end if
+           if( ascale<=one .or. bsize<=one ) then
+              c5 = min( one, ascale*bsize )
+           else
+              c5 = one
+           end if
+           ! scale first eigenvalue
+           wabs = abs( wr1 ) + abs( wi )
+           wsize = max( safmin, c1, fuzzy1*( wabs*c2+c3 ),min( c4, half*max( wabs, c5 ) ) )
+                     
+           if( wsize/=one ) then
+              wscale = one / wsize
+              if( wsize>one ) then
+                 scale1 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+              else
+                 scale1 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+              end if
+              wr1 = wr1*wscale
+              if( wi/=zero ) then
+                 wi = wi*wscale
+                 wr2 = wr1
+                 scale2 = scale1
+              end if
+           else
+              scale1 = ascale*bsize
+              scale2 = scale1
+           end if
+           ! scale second eigenvalue (if real)
+           if( wi==zero ) then
+              wsize = max( safmin, c1, fuzzy1*( abs( wr2 )*c2+c3 ),min( c4, half*max( abs( wr2 ), &
+                        c5 ) ) )
+              if( wsize/=one ) then
+                 wscale = one / wsize
+                 if( wsize>one ) then
+                    scale2 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+                 else
+                    scale2 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+                 end if
+                 wr2 = wr2*wscale
+              else
+                 scale2 = ascale*bsize
+              end if
+           end if
+           return
+     end subroutine stdlib_qlag2
+
+     !> DLAG2S: converts a DOUBLE PRECISION matrix, SA, to a SINGLE
+     !> PRECISION matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> DLAG2S checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_qlag2s( m, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: sa(ldsa,*)
+           real(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: rmax
+           ! Executable Statements 
+           rmax = stdlib_dlamch( 'O' )
+           do j = 1, n
+              do i = 1, m
+                 if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) ) then
+                    info = 1
+                    go to 30
+                 end if
+                 sa( i, j ) = a( i, j )
+              end do
+           end do
+           info = 0
+           30 continue
+           return
+     end subroutine stdlib_qlag2s
+
+     !> DLAGS2: computes 2-by-2 orthogonal matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**T *A*Q = U**T *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**T*B*Q = V**T *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**T *A*Q = U**T *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**T*B*Q = V**T*( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> The rows of the transformed A and B are parallel, where
+     !> U = (  CSU  SNU ), V = (  CSV SNV ), Q = (  CSQ   SNQ )
+     !> ( -SNU  CSU )      ( -SNV CSV )      ( -SNQ   CSQ )
+     !> Z**T denotes the transpose of Z.
+
+     pure subroutine stdlib_qlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(qp), intent(in) :: a1, a2, a3, b1, b2, b3
+           real(qp), intent(out) :: csq, csu, csv, snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: a, aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, b, c, csl, csr, &
+           d, r, s1, s2, snl, snr, ua11, ua11r, ua12, ua21, ua22, ua22r, vb11, vb11r, vb12, vb21, &
+                     vb22, vb22r
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_qlasv2( a, b, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + snr*b3
+                 aua12 = abs( csl )*abs( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**t *a and v**t *b
+                 if( ( abs( ua11r )+abs( ua12 ) )/=zero ) then
+                    if( aua12 / ( abs( ua11r )+abs( ua12 ) )<=avb12 /( abs( vb11r )+abs( vb12 ) ) &
+                              ) then
+                       call stdlib_qlartg( -ua11r, ua12, csq, snq, r )
+                    else
+                       call stdlib_qlartg( -vb11r, vb12, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_qlartg( -vb11r, vb12, csq, snq, r )
+                 end if
+                 csu = csl
+                 snu = -snl
+                 csv = csr
+                 snv = -snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snl*a1
+                 ua22 = -snl*a2 + csl*a3
+                 vb21 = -snr*b1
+                 vb22 = -snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua21 )+abs( ua22 ) )/=zero ) then
+                    if( aua22 / ( abs( ua21 )+abs( ua22 ) )<=avb22 /( abs( vb21 )+abs( vb22 ) ) ) &
+                              then
+                       call stdlib_qlartg( -ua21, ua22, csq, snq, r )
+                    else
+                       call stdlib_qlartg( -vb21, vb22, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_qlartg( -vb21, vb22, csq, snq, r )
+                 end if
+                 csu = snl
+                 snu = csl
+                 csv = snr
+                 snv = csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_qlasv2( a, c, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs( b2 )
+                 ! zero (2,1) elements of u**t *a and v**t *b.
+                 if( ( abs( ua21 )+abs( ua22r ) )/=zero ) then
+                    if( aua21 / ( abs( ua21 )+abs( ua22r ) )<=avb21 /( abs( vb21 )+abs( vb22r ) ) &
+                              ) then
+                       call stdlib_qlartg( ua22r, ua21, csq, snq, r )
+                    else
+                       call stdlib_qlartg( vb22r, vb21, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_qlartg( vb22r, vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -snr
+                 csv = csl
+                 snv = -snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11 = csr*a1 + snr*a2
+                 ua12 = snr*a3
+                 vb11 = csl*b1 + snl*b2
+                 vb12 = snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs( b2 )
+                 ! zero (1,1) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua11 )+abs( ua12 ) )/=zero ) then
+                    if( aua11 / ( abs( ua11 )+abs( ua12 ) )<=avb11 /( abs( vb11 )+abs( vb12 ) ) ) &
+                              then
+                       call stdlib_qlartg( ua12, ua11, csq, snq, r )
+                    else
+                       call stdlib_qlartg( vb12, vb11, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_qlartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = csr
+                 csv = snl
+                 snv = csl
+              end if
+           end if
+           return
+     end subroutine stdlib_qlags2
+
+     !> DLAGTF: factorizes the matrix (T - lambda*I), where T is an n by n
+     !> tridiagonal matrix and lambda is a scalar, as
+     !> T - lambda*I = PLU,
+     !> where P is a permutation matrix, L is a unit lower tridiagonal matrix
+     !> with at most one non-zero sub-diagonal elements per column and U is
+     !> an upper triangular matrix with at most two non-zero super-diagonal
+     !> elements per column.
+     !> The factorization is obtained by Gaussian elimination with partial
+     !> pivoting and implicit row scaling.
+     !> The parameter LAMBDA is included in the routine so that DLAGTF may
+     !> be used, in conjunction with DLAGTS, to obtain eigenvectors of T by
+     !> inverse iteration.
+
+     pure subroutine stdlib_qlagtf( n, a, lambda, b, c, tol, d, in, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: lambda, tol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: in(*)
+           real(qp), intent(inout) :: a(*), b(*), c(*)
+           real(qp), intent(out) :: d(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(qp) :: eps, mult, piv1, piv2, scale1, scale2, temp, tl
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLAGTF', -info )
+              return
+           end if
+           if( n==0 )return
+           a( 1 ) = a( 1 ) - lambda
+           in( n ) = 0
+           if( n==1 ) then
+              if( a( 1 )==zero )in( 1 ) = 1
+              return
+           end if
+           eps = stdlib_qlamch( 'EPSILON' )
+           tl = max( tol, eps )
+           scale1 = abs( a( 1 ) ) + abs( b( 1 ) )
+           loop_10: do k = 1, n - 1
+              a( k+1 ) = a( k+1 ) - lambda
+              scale2 = abs( c( k ) ) + abs( a( k+1 ) )
+              if( k<( n-1 ) )scale2 = scale2 + abs( b( k+1 ) )
+              if( a( k )==zero ) then
+                 piv1 = zero
+              else
+                 piv1 = abs( a( k ) ) / scale1
+              end if
+              if( c( k )==zero ) then
+                 in( k ) = 0
+                 piv2 = zero
+                 scale1 = scale2
+                 if( k<( n-1 ) )d( k ) = zero
+              else
+                 piv2 = abs( c( k ) ) / scale2
+                 if( piv2<=piv1 ) then
+                    in( k ) = 0
+                    scale1 = scale2
+                    c( k ) = c( k ) / a( k )
+                    a( k+1 ) = a( k+1 ) - c( k )*b( k )
+                    if( k<( n-1 ) )d( k ) = zero
+                 else
+                    in( k ) = 1
+                    mult = a( k ) / c( k )
+                    a( k ) = c( k )
+                    temp = a( k+1 )
+                    a( k+1 ) = b( k ) - mult*temp
+                    if( k<( n-1 ) ) then
+                       d( k ) = b( k+1 )
+                       b( k+1 ) = -mult*d( k )
+                    end if
+                    b( k ) = temp
+                    c( k ) = mult
+                 end if
+              end if
+              if( ( max( piv1, piv2 )<=tl ) .and. ( in( n )==0 ) )in( n ) = k
+           end do loop_10
+           if( ( abs( a( n ) )<=scale1*tl ) .and. ( in( n )==0 ) )in( n ) = n
+           return
+     end subroutine stdlib_qlagtf
+
+     !> DLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_qlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b + a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b - a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qlagtm
+
+     !> DLAGTS: may be used to solve one of the systems of equations
+     !> (T - lambda*I)*x = y   or   (T - lambda*I)**T*x = y,
+     !> where T is an n by n tridiagonal matrix, for x, following the
+     !> factorization of (T - lambda*I) as
+     !> (T - lambda*I) = P*L*U ,
+     !> by routine DLAGTF. The choice of equation to be solved is
+     !> controlled by the argument JOB, and in each case there is an option
+     !> to perturb zero or very small diagonal elements of U, this option
+     !> being intended for use in applications such as inverse iteration.
+
+     pure subroutine stdlib_qlagts( job, n, a, b, c, d, in, y, tol, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: job, n
+           real(qp), intent(inout) :: tol
+           ! Array Arguments 
+           integer(ilp), intent(in) :: in(*)
+           real(qp), intent(in) :: a(*), b(*), c(*), d(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(qp) :: absak, ak, bignum, eps, pert, sfmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign
+           ! Executable Statements 
+           info = 0
+           if( ( abs( job )>2 ) .or. ( job==0 ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAGTS', -info )
+              return
+           end if
+           if( n==0 )return
+           eps = stdlib_qlamch( 'EPSILON' )
+           sfmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           bignum = one / sfmin
+           if( job<0 ) then
+              if( tol<=zero ) then
+                 tol = abs( a( 1 ) )
+                 if( n>1 )tol = max( tol, abs( a( 2 ) ), abs( b( 1 ) ) )
+                 do k = 3, n
+                    tol = max( tol, abs( a( k ) ), abs( b( k-1 ) ),abs( d( k-2 ) ) )
+                 end do
+                 tol = tol*eps
+                 if( tol==zero )tol = eps
+              end if
+           end if
+           if( abs( job )==1 ) then
+              do k = 2, n
+                 if( in( k-1 )==0 ) then
+                    y( k ) = y( k ) - c( k-1 )*y( k-1 )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+              if( job==1 ) then
+                 loop_30: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_30
+              else
+                 loop_50: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    40 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 40
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 40
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_50
+              end if
+           else
+              ! come to here if  job = 2 or -2
+              if( job==2 ) then
+                 loop_60: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_60
+              else
+                 loop_80: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    70 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 70
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 70
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_80
+              end if
+              do k = n, 2, -1
+                 if( in( k-1 )==0 ) then
+                    y( k-1 ) = y( k-1 ) - c( k-1 )*y( k )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+           end if
+     end subroutine stdlib_qlagts
+
+     !> DLAGV2: computes the Generalized Schur factorization of a real 2-by-2
+     !> matrix pencil (A,B) where B is upper triangular. This routine
+     !> computes orthogonal (rotation) matrices given by CSL, SNL and CSR,
+     !> SNR such that
+     !> 1) if the pencil (A,B) has two real eigenvalues (include 0/0 or 1/0
+     !> types), then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [  0  a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11 b12 ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ],
+     !> 2) if the pencil (A,B) has a pair of complex conjugate eigenvalues,
+     !> then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [ a21 a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11  0  ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ]
+     !> where b11 >= b22 > 0.
+
+     pure subroutine stdlib_qlagv2( a, lda, b, ldb, alphar, alphai, beta, csl, snl,csr, snr )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(qp), intent(out) :: csl, csr, snl, snr
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: alphai(2), alphar(2), beta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: anorm, ascale, bnorm, bscale, h1, h2, h3, qq, r, rr, safmin, scale1, &
+                     scale2, t, ulp, wi, wr1, wr2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           safmin = stdlib_qlamch( 'S' )
+           ulp = stdlib_qlamch( 'P' )
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a( 1, 1 ) = ascale*a( 1, 1 )
+           a( 1, 2 ) = ascale*a( 1, 2 )
+           a( 2, 1 ) = ascale*a( 2, 1 )
+           a( 2, 2 ) = ascale*a( 2, 2 )
+           ! scale b
+           bnorm = max( abs( b( 1, 1 ) ), abs( b( 1, 2 ) )+abs( b( 2, 2 ) ),safmin )
+           bscale = one / bnorm
+           b( 1, 1 ) = bscale*b( 1, 1 )
+           b( 1, 2 ) = bscale*b( 1, 2 )
+           b( 2, 2 ) = bscale*b( 2, 2 )
+           ! check if a can be deflated
+           if( abs( a( 2, 1 ) )<=ulp ) then
+              csl = one
+              snl = zero
+              csr = one
+              snr = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           ! check if b is singular
+           else if( abs( b( 1, 1 ) )<=ulp ) then
+              call stdlib_qlartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+              csr = one
+              snr = zero
+              call stdlib_qrot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+              call stdlib_qrot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+              a( 2, 1 ) = zero
+              b( 1, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           else if( abs( b( 2, 2 ) )<=ulp ) then
+              call stdlib_qlartg( a( 2, 2 ), a( 2, 1 ), csr, snr, t )
+              snr = -snr
+              call stdlib_qrot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+              call stdlib_qrot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+              csl = one
+              snl = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              b( 2, 2 ) = zero
+              wi = zero
+           else
+              ! b is nonsingular, first compute the eigenvalues of (a,b)
+              call stdlib_qlag2( a, lda, b, ldb, safmin, scale1, scale2, wr1, wr2,wi )
+              if( wi==zero ) then
+                 ! two real eigenvalues, compute s*a-w*b
+                 h1 = scale1*a( 1, 1 ) - wr1*b( 1, 1 )
+                 h2 = scale1*a( 1, 2 ) - wr1*b( 1, 2 )
+                 h3 = scale1*a( 2, 2 ) - wr1*b( 2, 2 )
+                 rr = stdlib_qlapy2( h1, h2 )
+                 qq = stdlib_qlapy2( scale1*a( 2, 1 ), h3 )
+                 if( rr>qq ) then
+                    ! find right rotation matrix to zero 1,1 element of
+                    ! (sa - wb)
+                    call stdlib_qlartg( h2, h1, csr, snr, t )
+                 else
+                    ! find right rotation matrix to zero 2,1 element of
+                    ! (sa - wb)
+                    call stdlib_qlartg( h3, scale1*a( 2, 1 ), csr, snr, t )
+                 end if
+                 snr = -snr
+                 call stdlib_qrot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_qrot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 ! compute inf norms of a and b
+                 h1 = max( abs( a( 1, 1 ) )+abs( a( 1, 2 ) ),abs( a( 2, 1 ) )+abs( a( 2, 2 ) ) )
+                           
+                 h2 = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 ) ) )
+                           
+                 if( ( scale1*h1 )>=abs( wr1 )*h2 ) then
+                    ! find left rotation matrix q to zero out b(2,1)
+                    call stdlib_qlartg( b( 1, 1 ), b( 2, 1 ), csl, snl, r )
+                 else
+                    ! find left rotation matrix q to zero out a(2,1)
+                    call stdlib_qlartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+                 end if
+                 call stdlib_qrot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_qrot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 a( 2, 1 ) = zero
+                 b( 2, 1 ) = zero
+              else
+                 ! a pair of complex conjugate eigenvalues
+                 ! first compute the svd of the matrix b
+                 call stdlib_qlasv2( b( 1, 1 ), b( 1, 2 ), b( 2, 2 ), r, t, snr,csr, snl, csl )
+                           
+                 ! form (a,b) := q(a,b)z**t where q is left rotation matrix and
+                 ! z is right rotation matrix computed from stdlib_qlasv2
+                 call stdlib_qrot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_qrot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 call stdlib_qrot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_qrot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 b( 2, 1 ) = zero
+                 b( 1, 2 ) = zero
+              end if
+           end if
+           ! unscaling
+           a( 1, 1 ) = anorm*a( 1, 1 )
+           a( 2, 1 ) = anorm*a( 2, 1 )
+           a( 1, 2 ) = anorm*a( 1, 2 )
+           a( 2, 2 ) = anorm*a( 2, 2 )
+           b( 1, 1 ) = bnorm*b( 1, 1 )
+           b( 2, 1 ) = bnorm*b( 2, 1 )
+           b( 1, 2 ) = bnorm*b( 1, 2 )
+           b( 2, 2 ) = bnorm*b( 2, 2 )
+           if( wi==zero ) then
+              alphar( 1 ) = a( 1, 1 )
+              alphar( 2 ) = a( 2, 2 )
+              alphai( 1 ) = zero
+              alphai( 2 ) = zero
+              beta( 1 ) = b( 1, 1 )
+              beta( 2 ) = b( 2, 2 )
+           else
+              alphar( 1 ) = anorm*wr1 / scale1 / bnorm
+              alphai( 1 ) = anorm*wi / scale1 / bnorm
+              alphar( 2 ) = alphar( 1 )
+              alphai( 2 ) = -alphai( 1 )
+              beta( 1 ) = one
+              beta( 2 ) = one
+           end if
+           return
+     end subroutine stdlib_qlagv2
+
+     !> DLAHQR: is an auxiliary routine called by DHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by DHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_qlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: wi(*), wr(*)
+        ! =========================================================
+           ! Parameters 
+           real(qp), parameter :: dat1 = 3.0_qp/4.0_qp
+           real(qp), parameter :: dat2 = -0.4375_qp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: aa, ab, ba, bb, cs, det, h11, h12, h21, h21s, h22, rt1i, rt1r, rt2i, rt2r, &
+                     rtdisc, s, safmax, safmin, smlnum, sn, sum, t1, t2, t3, tr, tst, ulp, v2, v3
+           integer(ilp) :: i, i1, i2, its, itmax, j, k, l, m, nh, nr, nz, kdefl
+           ! Local Arrays 
+           real(qp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              wr( ilo ) = h( ilo, ilo )
+              wi( ilo ) = zero
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = zero
+              h( j+3, j ) = zero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = zero
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=qp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1 or 2. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           20 continue
+           l = ilo
+           if( i<ilo )go to 160
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 or 2 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           loop_140: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( abs( h( k, k-1 ) )<=smlnum )go to 40
+                 tst = abs( h( k-1, k-1 ) ) + abs( h( k, k ) )
+                 if( tst==zero ) then
+                    if( k-2>=ilo )tst = tst + abs( h( k-1, k-2 ) )
+                    if( k+1<=ihi )tst = tst + abs( h( k+1, k ) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation  criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some cases.  ====
+                 if( abs( h( k, k-1 ) )<=ulp*tst ) then
+                    ab = max( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    ba = min( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    aa = max( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 40
+                 end if
+              end do
+              40 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = zero
+              end if
+              ! exit from loop if a submatrix of order 1 or 2 has split off.
+              if( l>=i-1 )go to 150
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                 h11 = dat1*s + h( i, i )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( l+1, l ) ) + abs( h( l+2, l+1 ) )
+                 h11 = dat1*s + h( l, l )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else
+                 ! prepare to use francis' double shift
+                 ! (i.e. 2nd degree generalized rayleigh quotient)
+                 h11 = h( i-1, i-1 )
+                 h21 = h( i, i-1 )
+                 h12 = h( i-1, i )
+                 h22 = h( i, i )
+              end if
+              s = abs( h11 ) + abs( h12 ) + abs( h21 ) + abs( h22 )
+              if( s==zero ) then
+                 rt1r = zero
+                 rt1i = zero
+                 rt2r = zero
+                 rt2i = zero
+              else
+                 h11 = h11 / s
+                 h21 = h21 / s
+                 h12 = h12 / s
+                 h22 = h22 / s
+                 tr = ( h11+h22 ) / two
+                 det = ( h11-tr )*( h22-tr ) - h12*h21
+                 rtdisc = sqrt( abs( det ) )
+                 if( det>=zero ) then
+                    ! ==== complex conjugate shifts ====
+                    rt1r = tr*s
+                    rt2r = rt1r
+                    rt1i = rtdisc*s
+                    rt2i = -rt1i
+                 else
+                    ! ==== realshifts (use only one of them,KIND=qp)  ====
+                    rt1r = tr + rtdisc
+                    rt2r = tr - rtdisc
+                    if( abs( rt1r-h22 )<=abs( rt2r-h22 ) ) then
+                       rt1r = rt1r*s
+                       rt2r = rt1r
+                    else
+                       rt2r = rt2r*s
+                       rt1r = rt2r
+                    end if
+                    rt1i = zero
+                    rt2i = zero
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 2, l, -1
+                 ! determine the effect of starting the double-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.  (the following uses scaling to avoid
+                 ! overflows and most underflows.)
+                 h21s = h( m+1, m )
+                 s = abs( h( m, m )-rt2r ) + abs( rt2i ) + abs( h21s )
+                 h21s = h( m+1, m ) / s
+                 v( 1 ) = h21s*h( m, m+1 ) + ( h( m, m )-rt1r )*( ( h( m, m )-rt2r ) / s ) - &
+                           rt1i*( rt2i / s )
+                 v( 2 ) = h21s*( h( m, m )+h( m+1, m+1 )-rt1r-rt2r )
+                 v( 3 ) = h21s*h( m+2, m+1 )
+                 s = abs( v( 1 ) ) + abs( v( 2 ) ) + abs( v( 3 ) )
+                 v( 1 ) = v( 1 ) / s
+                 v( 2 ) = v( 2 ) / s
+                 v( 3 ) = v( 3 ) / s
+                 if( m==l )go to 60
+                 if( abs( h( m, m-1 ) )*( abs( v( 2 ) )+abs( v( 3 ) ) )<=ulp*abs( v( 1 ) )*( abs( &
+                           h( m-1, m-1 ) )+abs( h( m,m ) )+abs( h( m+1, m+1 ) ) ) )go to 60
+              end do
+              60 continue
+              ! double-shift qr step
+              loop_130: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge one step toward the bottom of the active
+                 ! submatrix. nr is the order of g.
+                 nr = min( 3, i-k+1 )
+                 if( k>m )call stdlib_qcopy( nr, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_qlarfg( nr, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = zero
+                    if( k<i-1 )h( k+2, k-1 ) = zero
+                 else if( m>l ) then
+                     ! ==== use the following instead of
+                     ! .    h( k, k-1 ) = -h( k, k-1 ) to
+                     ! .    avoid a bug when v(2) and v(3)
+                     ! .    underflow. ====
+                    h( k, k-1 ) = h( k, k-1 )*( one-t1 )
+                 end if
+                 v2 = v( 2 )
+                 t2 = t1*v2
+                 if( nr==3 ) then
+                    v3 = v( 3 )
+                    t3 = t1*v3
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j ) + v3*h( k+2, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                       h( k+2, j ) = h( k+2, j ) - sum*t3
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, min( k+3, i )
+                       sum = h( j, k ) + v2*h( j, k+1 ) + v3*h( j, k+2 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                       h( j, k+2 ) = h( j, k+2 ) - sum*t3
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 ) + v3*z( j, k+2 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                          z( j, k+2 ) = z( j, k+2 ) - sum*t3
+                       end do
+                    end if
+                 else if( nr==2 ) then
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, i
+                       sum = h( j, k ) + v2*h( j, k+1 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                       end do
+                    end if
+                 end if
+              end do loop_130
+           end do loop_140
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           150 continue
+           if( l==i ) then
+              ! h(i,i-1) is negligible: one eigenvalue has converged.
+              wr( i ) = h( i, i )
+              wi( i ) = zero
+           else if( l==i-1 ) then
+              ! h(i-1,i-2) is negligible: a pair of eigenvalues have converged.
+              ! transform the 2-by-2 submatrix to standard schur form,
+              ! and compute and store the eigenvalues.
+              call stdlib_qlanv2( h( i-1, i-1 ), h( i-1, i ), h( i, i-1 ),h( i, i ), wr( i-1 ), &
+                        wi( i-1 ), wr( i ), wi( i ),cs, sn )
+              if( wantt ) then
+                 ! apply the transformation to the rest of h.
+                 if( i2>i )call stdlib_qrot( i2-i, h( i-1, i+1 ), ldh, h( i, i+1 ), ldh,cs, sn )
+                           
+                 call stdlib_qrot( i-i1-1, h( i1, i-1 ), 1, h( i1, i ), 1, cs, sn )
+              end if
+              if( wantz ) then
+                 ! apply the transformation to z.
+                 call stdlib_qrot( nz, z( iloz, i-1 ), 1, z( iloz, i ), 1, cs, sn )
+              end if
+           end if
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 20
+           160 continue
+           return
+     end subroutine stdlib_qlahqr
+
+     !> DLAHR2: reduces the first NB columns of A real general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an orthogonal similarity transformation
+     !> Q**T * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*V**T, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by DGEHRD.
+
+     pure subroutine stdlib_qlahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**t
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-k, i-1, -one, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, one, a( k+1, i ), 1 )
+                 ! apply i - v * t**t * v**t to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**t * b1
+                 call stdlib_qcopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_qtrmv( 'LOWER', 'TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, t( 1, nb ),&
+                            1 )
+                 ! w := w + v2**t * b2
+                 call stdlib_qgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ),lda, a( k+i, i ), &
+                           1, one, t( 1, nb ), 1 )
+                 ! w := t**t * w
+                 call stdlib_qtrmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, nb ), 1 )
+                           
+                 ! b2 := b2 - v2*w
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -one,a( k+i, 1 ),lda, t( 1, nb )&
+                           , 1, one, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_qtrmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_qaxpy( i-1, -one, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_qlarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = one
+              ! compute  y(k+1:n,i)
+              call stdlib_qgemv( 'NO TRANSPOSE', n-k, n-k-i+1,one, a( k+1, i+1 ),lda, a( k+i, i ),&
+                         1, zero, y( k+1, i ), 1 )
+              call stdlib_qgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ), lda,a( k+i, i ), 1, &
+                        zero, t( 1, i ), 1 )
+              call stdlib_qgemv( 'NO TRANSPOSE', n-k, i-1, -one,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        one, y( k+1, i ), 1 )
+              call stdlib_qscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_qscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_qtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_qlacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,one, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, one,a( 1, &
+                     2+nb ), lda, a( k+1+nb, 1 ), lda, one, y,ldy )
+           call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,one, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_qlahr2
+
+     !> DLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then DLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L       0  ]
+     !> Lhat = [ w**T gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**T and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ alpha ]
+     !> [ gamma ]
+     !> where  alpha =  x**T*w.
+
+     pure subroutine stdlib_qlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(qp), intent(out) :: c, s, sestpr
+           real(qp), intent(in) :: gamma, sest
+           ! Array Arguments 
+           real(qp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: absalp, absest, absgam, alpha, b, cosine, eps, norma, s1, s2, sine, t, &
+                     test, tmp, zeta1, zeta2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           eps = stdlib_qlamch( 'EPSILON' )
+           alpha = stdlib_qdot( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = sqrt( s*s+c*c )
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = s2*s
+                    c = ( gamma / s2 ) / s
+                    s = sign( one, alpha ) / s
+                 else
+                    tmp = s2 / s1
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = s1*c
+                    s = ( alpha / s1 ) / c
+                    c = sign( one, gamma ) / c
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = c / ( b+sqrt( b*b+c ) )
+                 else
+                    t = sqrt( b*b+c ) - b
+                 end if
+                 sine = -zeta1 / t
+                 cosine = -zeta2 / ( one+t )
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -gamma
+                    cosine = alpha
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = sqrt( s*s+c*c )
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / c )
+                    s = -( gamma / s2 ) / c
+                    c = sign( one, alpha ) / c
+                 else
+                    tmp = s2 / s1
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = absest / s
+                    c = ( alpha / s1 ) / s
+                    s = -sign( one, gamma ) / s
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 norma = max( one+zeta1*zeta1+abs( zeta1*zeta2 ),abs( zeta1*zeta2 )+zeta2*zeta2 )
+                           
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = c / ( b+sqrt( abs( b*b-c ) ) )
+                    sine = zeta1 / ( one-t )
+                    cosine = -zeta2 / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = -c / ( b+sqrt( b*b+c ) )
+                    else
+                       t = b - sqrt( b*b+c )
+                    end if
+                    sine = -zeta1 / t
+                    cosine = -zeta2 / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_qlaic1
+
+     !> This routine is not for general use.  It exists solely to avoid
+     !> over-optimization in DISNAN.
+     !> DLAISNAN: checks for NaNs by comparing its two arguments for
+     !> inequality.  NaN is the only floating-point value where NaN != NaN
+     !> returns .TRUE.  To check for NaNs, pass the same variable as both
+     !> arguments.
+     !> A compiler must assume that the two arguments are
+     !> not the same variable, and the test will not be optimized away.
+     !> Interprocedural or whole-program optimization may delete this
+     !> test.  The ISNAN functions will be replaced by the correct
+     !> Fortran 03 intrinsic once the intrinsic is widely available.
+
+     pure logical(lk) function stdlib_qlaisnan( din1, din2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: din1, din2
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_qlaisnan = (din1/=din2)
+           return
+     end function stdlib_qlaisnan
+
+     !> DLALN2: solves a system of the form  (ca A - w D ) X = s B
+     !> or (ca A**T - w D) X = s B   with possible scaling ("s") and
+     !> perturbation of A.  (A**T means A-transpose.)
+     !> A is an NA x NA real matrix, ca is a real scalar, D is an NA x NA
+     !> real diagonal matrix, w is a real or complex value, and X and B are
+     !> NA x 1 matrices -- real if w is real, complex if w is complex.  NA
+     !> may be 1 or 2.
+     !> If w is complex, X and B are represented as NA x 2 matrices,
+     !> the first column of each being the real part and the second
+     !> being the imaginary part.
+     !> "s" is a scaling factor (<= 1), computed by DLALN2, which is
+     !> so chosen that X can be computed without overflow.  X is further
+     !> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
+     !> than overflow.
+     !> If both singular values of (ca A - w D) are less than SMIN,
+     !> SMIN*identity will be used instead of (ca A - w D).  If only one
+     !> singular value is less than SMIN, one element of (ca A - w D) will be
+     !> perturbed enough to make the smallest singular value roughly SMIN.
+     !> If both singular values are at least SMIN, (ca A - w D) will not be
+     !> perturbed.  In any case, the perturbation will be at most some small
+     !> multiple of max( SMIN, ulp*norm(ca A - w D) ).  The singular values
+     !> are computed by infinity-norm approximations, and thus will only be
+     !> correct to a factor of 2 or so.
+     !> Note: all input quantities are assumed to be smaller than overflow
+     !> by a reasonable factor.  (See BIGNUM.)
+
+     pure subroutine stdlib_qlaln2( ltrans, na, nw, smin, ca, a, lda, d1, d2, b,ldb, wr, wi, x, &
+               ldx, scale, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltrans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, na, nw
+           real(qp), intent(in) :: ca, d1, d2, smin, wi, wr
+           real(qp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: icmax, j
+           real(qp) :: bbnd, bi1, bi2, bignum, bnorm, br1, br2, ci21, ci22, cmax, cnorm, cr21, &
+           cr22, csi, csr, li21, lr21, smini, smlnum, temp, u22abs, ui11, ui11r, ui12, ui12s, &
+                     ui22, ur11, ur11r, ur12, ur12s, ur22, xi1, xi2, xr1, xr2
+           ! Local Arrays 
+           logical(lk) :: rswap(4), zswap(4)
+           integer(ilp) :: ipivot(4,4)
+           real(qp) :: ci(2,2), civ(4), cr(2,2), crv(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Equivalences 
+           equivalence        ( ci( 1, 1 ), civ( 1 ) ),( cr( 1, 1 ), crv( 1 ) )
+           ! Data Statements 
+           zswap = [.false.,.false.,.true.,.true.]
+           rswap = [.false.,.true.,.false.,.true.]
+           ipivot = reshape([1,2,3,4,2,1,4,3,3,4,1,2,4,3,2,1],[4,4])
+           ! Executable Statements 
+           ! compute bignum
+           smlnum = two*stdlib_qlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           smini = max( smin, smlnum )
+           ! don't check for input errors
+           info = 0
+           ! standard initializations
+           scale = one
+           if( na==1 ) then
+              ! 1 x 1  (i.e., scalar) system   c x = b
+              if( nw==1 ) then
+                 ! real 1x1 system.
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 cnorm = abs( csr )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 x( 1, 1 ) = ( b( 1, 1 )*scale ) / csr
+                 xnorm = abs( x( 1, 1 ) )
+              else
+                 ! complex 1x1 system (w is complex)
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 csi = -wi*d1
+                 cnorm = abs( csr ) + abs( csi )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    csi = zero
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) ) + abs( b( 1, 2 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 call stdlib_qladiv( scale*b( 1, 1 ), scale*b( 1, 2 ), csr, csi,x( 1, 1 ), x( 1, &
+                           2 ) )
+                 xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+              end if
+           else
+              ! 2x2 system
+              ! compute the realpart of  c = ca a - w d  (or  ca a**t - w d,KIND=qp)
+              cr( 1, 1 ) = ca*a( 1, 1 ) - wr*d1
+              cr( 2, 2 ) = ca*a( 2, 2 ) - wr*d2
+              if( ltrans ) then
+                 cr( 1, 2 ) = ca*a( 2, 1 )
+                 cr( 2, 1 ) = ca*a( 1, 2 )
+              else
+                 cr( 2, 1 ) = ca*a( 2, 1 )
+                 cr( 1, 2 ) = ca*a( 1, 2 )
+              end if
+              if( nw==1 ) then
+                 ! real2x2 system  (w is real,KIND=qp)
+                 ! find the largest element in c
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )>cmax ) then
+                       cmax = abs( crv( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) ), abs( b( 2, 1 ) ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ur11r = one / ur11
+                 lr21 = ur11r*cr21
+                 ur22 = cr22 - ur12*lr21
+                 ! if smaller pivot < smini, use smini
+                 if( abs( ur22 )<smini ) then
+                    ur22 = smini
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br1 = b( 2, 1 )
+                    br2 = b( 1, 1 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                 end if
+                 br2 = br2 - lr21*br1
+                 bbnd = max( abs( br1*( ur22*ur11r ) ), abs( br2 ) )
+                 if( bbnd>one .and. abs( ur22 )<one ) then
+                    if( bbnd>=bignum*abs( ur22 ) )scale = one / bbnd
+                 end if
+                 xr2 = ( br2*scale ) / ur22
+                 xr1 = ( scale*br1 )*ur11r - xr2*( ur11r*ur12 )
+                 if( zswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                 end if
+                 xnorm = max( abs( xr1 ), abs( xr2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              else
+                 ! complex 2x2 system  (w is complex)
+                 ! find the largest element in c
+                 ci( 1, 1 ) = -wi*d1
+                 ci( 2, 1 ) = zero
+                 ci( 1, 2 ) = zero
+                 ci( 2, 2 ) = -wi*d2
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )+abs( civ( j ) )>cmax ) then
+                       cmax = abs( crv( j ) ) + abs( civ( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 )&
+                               ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    x( 1, 2 ) = temp*b( 1, 2 )
+                    x( 2, 2 ) = temp*b( 2, 2 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 ui11 = civ( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ci21 = civ( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 ui12 = civ( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ci22 = civ( ipivot( 4, icmax ) )
+                 if( icmax==1 .or. icmax==4 ) then
+                    ! code when off-diagonals of pivoted c are real
+                    if( abs( ur11 )>abs( ui11 ) ) then
+                       temp = ui11 / ur11
+                       ur11r = one / ( ur11*( one+temp**2 ) )
+                       ui11r = -temp*ur11r
+                    else
+                       temp = ur11 / ui11
+                       ui11r = -one / ( ui11*( one+temp**2 ) )
+                       ur11r = -temp*ui11r
+                    end if
+                    lr21 = cr21*ur11r
+                    li21 = cr21*ui11r
+                    ur12s = ur12*ur11r
+                    ui12s = ur12*ui11r
+                    ur22 = cr22 - ur12*lr21
+                    ui22 = ci22 - ur12*li21
+                 else
+                    ! code when diagonals of pivoted c are real
+                    ur11r = one / ur11
+                    ui11r = zero
+                    lr21 = cr21*ur11r
+                    li21 = ci21*ur11r
+                    ur12s = ur12*ur11r
+                    ui12s = ui12*ur11r
+                    ur22 = cr22 - ur12*lr21 + ui12*li21
+                    ui22 = -ur12*li21 - ui12*lr21
+                 end if
+                 u22abs = abs( ur22 ) + abs( ui22 )
+                 ! if smaller pivot < smini, use smini
+                 if( u22abs<smini ) then
+                    ur22 = smini
+                    ui22 = zero
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br2 = b( 1, 1 )
+                    br1 = b( 2, 1 )
+                    bi2 = b( 1, 2 )
+                    bi1 = b( 2, 2 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                    bi1 = b( 1, 2 )
+                    bi2 = b( 2, 2 )
+                 end if
+                 br2 = br2 - lr21*br1 + li21*bi1
+                 bi2 = bi2 - li21*br1 - lr21*bi1
+                 bbnd = max( ( abs( br1 )+abs( bi1 ) )*( u22abs*( abs( ur11r )+abs( ui11r ) ) ),&
+                           abs( br2 )+abs( bi2 ) )
+                 if( bbnd>one .and. u22abs<one ) then
+                    if( bbnd>=bignum*u22abs ) then
+                       scale = one / bbnd
+                       br1 = scale*br1
+                       bi1 = scale*bi1
+                       br2 = scale*br2
+                       bi2 = scale*bi2
+                    end if
+                 end if
+                 call stdlib_qladiv( br2, bi2, ur22, ui22, xr2, xi2 )
+                 xr1 = ur11r*br1 - ui11r*bi1 - ur12s*xr2 + ui12s*xi2
+                 xi1 = ui11r*br1 + ur11r*bi1 - ui12s*xr2 - ur12s*xi2
+                 if( zswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                    x( 1, 2 ) = xi2
+                    x( 2, 2 ) = xi1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                    x( 1, 2 ) = xi1
+                    x( 2, 2 ) = xi2
+                 end if
+                 xnorm = max( abs( xr1 )+abs( xi1 ), abs( xr2 )+abs( xi2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       x( 1, 2 ) = temp*x( 1, 2 )
+                       x( 2, 2 ) = temp*x( 2, 2 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qlaln2
+
+     !> DLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_qlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(out) :: bx(ldbx,*), work(*)
+           real(qp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, m, n, nlp1
+           real(qp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_qrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_qcopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_qcopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_qcopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_qscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_50: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       work( j ) = zero
+                    else
+                       work( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_qlamc3( poles( i, 2 ), dsigj &
+                                    )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_qlamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    work( 1 ) = negone
+                    temp = stdlib_qnrm2( k, work, 1 )
+                    call stdlib_qgemv( 'T', k, nrhs, one, bx, ldbx, work, 1, zero,b( j, 1 ), ldb )
+                              
+                    call stdlib_qlascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_50
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_qlacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_qcopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       work( j ) = zero
+                    else
+                       work( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_qlamc3( dsigj, -poles( i+1,2 ) )-difr( i, &
+                                    1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_qlamc3( dsigj, -poles( i,2 ) )-difl( i ) )&
+                                     /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    call stdlib_qgemv( 'T', k, nrhs, one, b, ldb, work, 1, zero,bx( j, 1 ), ldbx )
+                              
+                 end do
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_qcopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_qrot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_qlacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb, bx( k+1, 1 &
+                        ),ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_qcopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_qcopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_qcopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_qrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_qlals0
+
+     !> DLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, DLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by DLALSA.
+
+     pure subroutine stdlib_qlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(out) :: bx(ldbx,*), work(*)
+           real(qp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, lf, ll, lvl, lvl2, nd, ndb1, ndiml, ndimr, &
+                     nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_qlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 50.
+           if( icompq==1 ) then
+              go to 50
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_qlasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_qgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,b( nlf, 1 ), ldb, &
+                        zero, bx( nlf, 1 ), ldbx )
+              call stdlib_qgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,b( nrf, 1 ), ldb, &
+                        zero, bx( nrf, 1 ), ldbx )
+           end do
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_qcopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_qlals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), work,info )
+              end do
+           end do
+           go to 90
+           ! icompq = 1: applying back the right singular vector factors.
+           50 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_70: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_qlals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), work,info )
+              end do
+           end do loop_70
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_qlasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_qgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,b( nlf, 1 ), &
+                        ldb, zero, bx( nlf, 1 ), ldbx )
+              call stdlib_qgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,b( nrf, 1 ), &
+                        ldb, zero, bx( nrf, 1 ), ldbx )
+           end do
+           90 continue
+           return
+     end subroutine stdlib_qlalsa
+
+     !> DLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_qlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: b(ldb,*), d(*), e(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           iwk, j, k, nlvl, nm1, nsize, nsub, nwork, perm, poles, s, sizei, smlszp, sqre, st, st1,&
+                      u, vt, z
+           real(qp) :: cs, eps, orgnrm, r, rcnd, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLALSD', -info )
+              return
+           end if
+           eps = stdlib_qlamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_qlaset( 'A', 1, nrhs, zero, zero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_qlascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_qrot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    work( i*2-1 ) = cs
+                    work( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = work( j*2-1 )
+                       sn = work( j*2 )
+                       call stdlib_qrot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_qlanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_qlaset( 'A', n, nrhs, zero, zero, b, ldb )
+              return
+           end if
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              nwork = 1 + n*n
+              call stdlib_qlaset( 'A', n, n, zero, one, work, n )
+              call stdlib_qlasdq( 'U', 0, n, n, 0, nrhs, d, e, work, n, work, n, b,ldb, work( &
+                        nwork ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              tol = rcnd*abs( d( stdlib_iqamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_qlaset( 'A', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_qlascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, work, n, b, ldb, zero,work( nwork ), &
+                        n )
+              call stdlib_qlacpy( 'A', n, nrhs, work( nwork ), n, b, ldb )
+              ! unscale.
+              call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_qlasrt( 'D', n, d, info )
+              call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=qp) / real( smlsiz+1,KIND=qp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           bx = givnum + 2*nlvl*n
+           nwork = bx + n*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_60: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_qcopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_qcopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_qlasdq.
+                    call stdlib_qlaset( 'A', nsize, nsize, zero, one,work( vt+st1 ), n )
+                    call stdlib_qlasdq( 'U', 0, nsize, nsize, 0, nrhs, d( st ),e( st ), work( vt+&
+                              st1 ), n, work( nwork ),n, b( st, 1 ), ldb, work( nwork ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    call stdlib_qlacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_qlasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), work( u+st1 &
+                    ), n, work( vt+st1 ),iwork( k+st1 ), work( difl+st1 ),work( difr+st1 ), work( &
+                    z+st1 ),work( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), n, iwork( &
+                    perm+st1 ),work( givnum+st1 ), work( c+st1 ),work( s+st1 ), work( nwork ), &
+                              iwork( iwk ),info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_qlalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( &
+                     difr+st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                     givcol+st1 ), n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+&
+                               st1 ), work( nwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_60
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_iqamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_qlaset( 'A', 1, nrhs, zero, zero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_qlascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_qcopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 call stdlib_qgemm( 'T', 'N', nsize, nrhs, nsize, one,work( vt+st1 ), n, work( &
+                           bxst ), n, zero,b( st, 1 ), ldb )
+              else
+                 call stdlib_qlalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( difr+&
+                  st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( givcol+st1 ),&
+                   n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+st1 ), work( &
+                             nwork ),iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do
+           ! unscale and sort the singular values.
+           call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_qlasrt( 'D', n, d, info )
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_qlalsd
+
+     !> DLAMCH: determines quad precision machine parameters.
+
+     pure real(qp) function stdlib_qlamch( cmach )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: cmach
+       ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: rnd, eps, sfmin, small, rmach
+           ! Intrinsic Functions 
+           intrinsic :: digits,epsilon,huge,maxexponent,minexponent,radix,tiny
+           ! Executable Statements 
+           ! assume rounding, not chopping. always.
+           rnd = one
+           if( one==rnd ) then
+              eps = epsilon(zero) * 0.5
+           else
+              eps = epsilon(zero)
+           end if
+           if( stdlib_lsame( cmach, 'E' ) ) then
+              rmach = eps
+           else if( stdlib_lsame( cmach, 'S' ) ) then
+              sfmin = tiny(zero)
+              small = one / huge(zero)
+              if( small>=sfmin ) then
+                 ! use small plus a bit, to avoid the possibility of rounding
+                 ! causing overflow when computing  1/sfmin.
+                 sfmin = small*( one+eps )
+              end if
+              rmach = sfmin
+           else if( stdlib_lsame( cmach, 'B' ) ) then
+              rmach = radix(zero)
+           else if( stdlib_lsame( cmach, 'P' ) ) then
+              rmach = eps * radix(zero)
+           else if( stdlib_lsame( cmach, 'N' ) ) then
+              rmach = digits(zero)
+           else if( stdlib_lsame( cmach, 'R' ) ) then
+              rmach = rnd
+           else if( stdlib_lsame( cmach, 'M' ) ) then
+              rmach = minexponent(zero)
+           else if( stdlib_lsame( cmach, 'U' ) ) then
+              rmach = tiny(zero)
+           else if( stdlib_lsame( cmach, 'L' ) ) then
+              rmach = maxexponent(zero)
+           else if( stdlib_lsame( cmach, 'O' ) ) then
+              rmach = huge(zero)
+           else
+              rmach = zero
+           end if
+           stdlib_qlamch = rmach
+           return
+     end function stdlib_qlamch
+
+
+     pure real(qp) function stdlib_qlamc3( a, b )
+        ! -- lapack auxiliary routine --
+           ! univ. of tennessee, univ. of california berkeley and nag ltd..
+           ! Scalar Arguments 
+           real(qp), intent(in) :: a, b
+       ! =====================================================================
+           ! Executable Statements 
+           stdlib_qlamc3 = a + b
+           return
+     end function stdlib_qlamc3
+
+     !> DLAMRG: will create a permutation list which will merge the elements
+     !> of A (which is composed of two independently sorted sets) into a
+     !> single set which is sorted in ascending order.
+
+     pure subroutine stdlib_qlamrg( n1, n2, a, dtrd1, dtrd2, index )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dtrd1, dtrd2, n1, n2
+           ! Array Arguments 
+           integer(ilp), intent(out) :: index(*)
+           real(qp), intent(in) :: a(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ind1, ind2, n1sv, n2sv
+           ! Executable Statements 
+           n1sv = n1
+           n2sv = n2
+           if( dtrd1>0 ) then
+              ind1 = 1
+           else
+              ind1 = n1
+           end if
+           if( dtrd2>0 ) then
+              ind2 = 1 + n1
+           else
+              ind2 = n1 + n2
+           end if
+           i = 1
+           ! while ( (n1sv > 0)
+           10 continue
+           if( n1sv>0 .and. n2sv>0 ) then
+              if( a( ind1 )<=a( ind2 ) ) then
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + dtrd1
+                 n1sv = n1sv - 1
+              else
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + dtrd2
+                 n2sv = n2sv - 1
+              end if
+              go to 10
+           end if
+           ! end while
+           if( n1sv==0 ) then
+              do n1sv = 1, n2sv
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + dtrd2
+              end do
+           else
+           ! n2sv == 0
+              do n2sv = 1, n1sv
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + dtrd1
+              end do
+           end if
+           return
+     end subroutine stdlib_qlamrg
+
+     !> DLAMSWLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (DLASWLQ)
+
+     pure subroutine stdlib_qlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), t(ldt,*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, ctr, lw
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_qgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_qtpmlqt('L','T',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_qtpmlqt('L','T',nb-k , n, k, 0,mb, a(1,i), lda,t(1, ctr*k+1),ldt, c(&
+                           1,1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_qgemlqt('L','T',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(nb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_qgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_qtpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_qtpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_qtpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1,ctr *k+1), ldt, &
+                           c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                  ctr = ctr - 1
+                  call stdlib_qtpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1,ctr*k+1), ldt, &
+                            c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_qgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ctr = 1
+              ii=n-kk+1
+              call stdlib_qgemlqt('R','T',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_qtpmlqt('R','T',m , nb-k, k, 0,mb, a(1,i), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_qtpmlqt('R','T',m , kk, k, 0,mb, a(1,ii), lda,t(1,ctr*k+1),ldt, c(1,1),&
+                          ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_qlamswlq
+
+     !> DLAMTSQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (DLATSQR)
+
+     pure subroutine stdlib_qlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), t(ldt,*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = mb * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_qgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_qtpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1,ctr*k+1),ldt , c(1,&
+                          1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_qtpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr*k+1),ldt, c(1,&
+                          1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_qgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_qgemqrt('L','T',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_qtpmqrt('L','T',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_qtpmqrt('L','T',kk , n, k, 0,nb, a(ii,1), lda,t(1,ctr * k + 1), ldt, c(&
+                         1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_qtpmqrt('R','T',m , kk, k, 0, nb, a(ii,1), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_qtpmqrt('R','T',m , mb-k, k, 0,nb, a(i,1), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_qgemqrt('R','T',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_qgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_qtpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_qtpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_qlamtsqr
+
+     !> DLANEG: computes the Sturm count, the number of negative pivots
+     !> encountered while factoring tridiagonal T - sigma I = L D L^T.
+     !> This implementation works directly on the factors without forming
+     !> the tridiagonal matrix T.  The Sturm count is also the number of
+     !> eigenvalues of T less than sigma.
+     !> This routine is called from DLARRB.
+     !> The current routine does not use the PIVMIN parameter but rather
+     !> requires IEEE-754 propagation of Infinities and NaNs.  This
+     !> routine also has no input range restrictions but does require
+     !> default exception handling such that x/0 produces Inf when x is
+     !> non-zero, and Inf/Inf produces NaN.  For more information, see:
+     !> Marques, Riedy, and Voemel, "Benefits of IEEE-754 Features in
+     !> Modern Symmetric Tridiagonal Eigensolvers," SIAM Journal on
+     !> Scientific Computing, v28, n5, 2006.  DOI 10.1137/050641624
+     !> (Tech report version in LAWN 172 with the same title.)
+
+     pure integer(ilp) function stdlib_qlaneg( n, d, lld, sigma, pivmin, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, r
+           real(qp), intent(in) :: pivmin, sigma
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), lld(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blklen = 128
+           
+           ! some architectures propagate infinities and nans very slowly, so
+           ! the code computes counts in blklen chunks.  then a nan can
+           ! propagate at most blklen columns before being detected.  this is
+           ! not a general tuning parameter; it needs only to be just large
+           ! enough that the overhead is tiny in common cases.
+           
+           ! Local Scalars 
+           integer(ilp) :: bj, j, neg1, neg2, negcnt
+           real(qp) :: bsav, dminus, dplus, gamma, p, t, tmp
+           logical(lk) :: sawnan
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           negcnt = 0
+           ! i) upper part: l d l^t - sigma i = l+ d+ l+^t
+           t = -sigma
+           loop_210: do bj = 1, r-1, blklen
+              neg1 = 0
+              bsav = t
+              do j = bj, min(bj+blklen-1, r-1)
+                 dplus = d( j ) + t
+                 if( dplus<zero ) neg1 = neg1 + 1
+                 tmp = t / dplus
+                 t = tmp * lld( j ) - sigma
+              end do
+              sawnan = stdlib_qisnan( t )
+           ! run a slower version of the above loop if a nan is detected.
+           ! a nan should occur only with a zero pivot after an infinite
+           ! pivot.  in that case, substituting 1 for t/dplus is the
+           ! correct limit.
+              if( sawnan ) then
+                 neg1 = 0
+                 t = bsav
+                 do j = bj, min(bj+blklen-1, r-1)
+                    dplus = d( j ) + t
+                    if( dplus<zero ) neg1 = neg1 + 1
+                    tmp = t / dplus
+                    if (stdlib_qisnan(tmp)) tmp = one
+                    t = tmp * lld(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg1
+           end do loop_210
+           ! ii) lower part: l d l^t - sigma i = u- d- u-^t
+           p = d( n ) - sigma
+           do bj = n-1, r, -blklen
+              neg2 = 0
+              bsav = p
+              do j = bj, max(bj-blklen+1, r), -1
+                 dminus = lld( j ) + p
+                 if( dminus<zero ) neg2 = neg2 + 1
+                 tmp = p / dminus
+                 p = tmp * d( j ) - sigma
+              end do
+              sawnan = stdlib_qisnan( p )
+           ! as above, run a slower version that substitutes 1 for inf/inf.
+              if( sawnan ) then
+                 neg2 = 0
+                 p = bsav
+                 do j = bj, max(bj-blklen+1, r), -1
+                    dminus = lld( j ) + p
+                    if( dminus<zero ) neg2 = neg2 + 1
+                    tmp = p / dminus
+                    if (stdlib_qisnan(tmp)) tmp = one
+                    p = tmp * d(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg2
+           end do
+           ! iii) twist index
+             ! t was shifted by sigma initially.
+           gamma = (t + sigma) + p
+           if( gamma<zero ) negcnt = negcnt+1
+           stdlib_qlaneg = negcnt
+     end function stdlib_qlaneg
+
+     !> DLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(qp) function stdlib_qlangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(qp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_qlassq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlangb = value
+           return
+     end function stdlib_qlangb
+
+     !> DLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real matrix A.
+
+     real(qp) function stdlib_qlange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_qlassq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlange = value
+           return
+     end function stdlib_qlange
+
+     !> DLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real tridiagonal matrix A.
+
+     pure real(qp) function stdlib_qlangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_qisnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_qisnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_qisnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_qlassq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_qlassq( n-1, dl, 1, scale, sum )
+                 call stdlib_qlassq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_qlangt = anorm
+           return
+     end function stdlib_qlangt
+
+     !> DLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(qp) function stdlib_qlanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_qlassq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlanhs = value
+           return
+     end function stdlib_qlanhs
+
+     !> DLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(qp) function stdlib_qlansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_qlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_qlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_qlassq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlansb = value
+           return
+     end function stdlib_qlansb
+
+     !> DLANSF: returns the value of the one norm, or the Frobenius norm, or
+     !> the infinity norm, or the element of largest absolute value of a
+     !> real symmetric matrix A in RFP format.
+
+     real(qp) function stdlib_qlansf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(0:*)
+           real(qp), intent(out) :: work(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(qp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_qlansf = zero
+              return
+           else if( n==1 ) then
+              stdlib_qlansf = abs( a(0) )
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='t or 't' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'T' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                 ! a is n by k
+                    do j = 0, k - 1
+                       do i = 0, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n
+                    do j = 0, n - 1
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    do j = 0, k - 1
+                       do i = 0, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n+1
+                    do j = 0, n
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              if( ifm==1 ) then
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( a( 0+j*lda ) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( a( 0+j*lda ) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                           ! i=j-1-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( a( 0 ) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_qlassq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_qlassq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k-1, a( k ), lda+1, scale, s )
+                       ! tri l at a(k,0)
+                       call stdlib_qlassq( k, a( k-1 ), lda+1, scale, s )
+                       ! tri u at a(k-1,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_qlassq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_qlassq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri l at a(0,0)
+                       call stdlib_qlassq( k-1, a( 0+lda ), lda+1, scale, s )
+                       ! tri u at a(0,1)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 2
+                          call stdlib_qlassq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_qlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_qlassq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k-1, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k)
+                       call stdlib_qlassq( k, a( 0+( k-1 )*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k-1)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_qlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_qlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_qlassq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                       call stdlib_qlassq( k-1, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_qlassq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                          ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_qlassq( k+j, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( k+1 ), lda+1, scale, s )
+                       ! tri l at a(k+1,0)
+                       call stdlib_qlassq( k, a( k ), lda+1, scale, s )
+                       ! tri u at a(k,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_qlassq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_qlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                       call stdlib_qlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 1
+                          call stdlib_qlassq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                          ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_qlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_qlassq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                          ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( 0+( k+1 )*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k+1)
+                       call stdlib_qlassq( k, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_qlassq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_qlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_qlassq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_qlassq( k, a( lda ), lda+1, scale, s )
+                       ! tri l at a(0,1)
+                       call stdlib_qlassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_qlansf = value
+           return
+     end function stdlib_qlansf
+
+     !> DLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A,  supplied in packed form.
+
+     real(qp) function stdlib_qlansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_qlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_qlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( ap( k )/=zero ) then
+                    absa = abs( ap( k ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlansp = value
+           return
+     end function stdlib_qlansp
+
+     !> DLANST:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure real(qp) function stdlib_qlanst( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum = abs( d( i ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_qlassq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_qlassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_qlanst = anorm
+           return
+     end function stdlib_qlanst
+
+     !> DLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A.
+
+     real(qp) function stdlib_qlansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_qlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_qlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_qlassq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlansy = value
+           return
+     end function stdlib_qlansy
+
+     !> DLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(qp) function stdlib_qlantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_qlassq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_qlassq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_qlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_qlassq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlantb = value
+           return
+     end function stdlib_qlantb
+
+     !> DLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(qp) function stdlib_qlantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_qlassq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_qlassq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_qlassq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_qlassq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlantp = value
+           return
+     end function stdlib_qlantp
+
+     !> DLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(qp) function stdlib_qlantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_qlassq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_qlassq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_qlassq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_qlassq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_qlantr = value
+           return
+     end function stdlib_qlantr
+
+     !> DLANV2: computes the Schur factorization of a real 2-by-2 nonsymmetric
+     !> matrix in standard form:
+     !> [ A  B ] = [ CS -SN ] [ AA  BB ] [ CS  SN ]
+     !> [ C  D ]   [ SN  CS ] [ CC  DD ] [-SN  CS ]
+     !> where either
+     !> 1) CC = 0 so that AA and DD are real eigenvalues of the matrix, or
+     !> 2) AA = DD and BB*CC < 0, so that AA + or - sqrt(BB*CC) are complex
+     !> conjugate eigenvalues.
+
+     pure subroutine stdlib_qlanv2( a, b, c, d, rt1r, rt1i, rt2r, rt2i, cs, sn )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(inout) :: a, b, c, d
+           real(qp), intent(out) :: cs, rt1i, rt1r, rt2i, rt2r, sn
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: multpl = 4.0e+0_qp
+           
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, bcmax, bcmis, cc, cs1, dd, eps, p, sab, sac, scale, sigma, sn1, &
+                     tau, temp, z, safmin, safmn2, safmx2
+           integer(ilp) :: count
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           safmin = stdlib_qlamch( 'S' )
+           eps = stdlib_qlamch( 'P' )
+           safmn2 = stdlib_qlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_qlamch( 'B' ) ) / &
+                     two,KIND=ilp)
+           safmx2 = one / safmn2
+           if( c==zero ) then
+              cs = one
+              sn = zero
+           else if( b==zero ) then
+              ! swap rows and columns
+              cs = zero
+              sn = one
+              temp = d
+              d = a
+              a = temp
+              b = -c
+              c = zero
+           else if( ( a-d )==zero .and. sign( one, b )/=sign( one, c ) )then
+              cs = one
+              sn = zero
+           else
+              temp = a - d
+              p = half*temp
+              bcmax = max( abs( b ), abs( c ) )
+              bcmis = min( abs( b ), abs( c ) )*sign( one, b )*sign( one, c )
+              scale = max( abs( p ), bcmax )
+              z = ( p / scale )*p + ( bcmax / scale )*bcmis
+              ! if z is of the order of the machine accuracy, postpone the
+              ! decision on the nature of eigenvalues
+              if( z>=multpl*eps ) then
+                 ! real eigenvalues. compute a and d.
+                 z = p + sign( sqrt( scale )*sqrt( z ), p )
+                 a = d + z
+                 d = d - ( bcmax / z )*bcmis
+                 ! compute b and the rotation matrix
+                 tau = stdlib_qlapy2( c, z )
+                 cs = z / tau
+                 sn = c / tau
+                 b = b - c
+                 c = zero
+              else
+                 ! complex eigenvalues, or real(almost,KIND=qp) equal eigenvalues.
+                 ! make diagonal elements equal.
+                 count = 0
+                 sigma = b + c
+                 10 continue
+                 count = count + 1
+                 scale = max( abs(temp), abs(sigma) )
+                 if( scale>=safmx2 ) then
+                    sigma = sigma * safmn2
+                    temp = temp * safmn2
+                    if (count <= 20)goto 10
+                 end if
+                 if( scale<=safmn2 ) then
+                    sigma = sigma * safmx2
+                    temp = temp * safmx2
+                    if (count <= 20)goto 10
+                 end if
+                 p = half*temp
+                 tau = stdlib_qlapy2( sigma, temp )
+                 cs = sqrt( half*( one+abs( sigma ) / tau ) )
+                 sn = -( p / ( tau*cs ) )*sign( one, sigma )
+                 ! compute [ aa  bb ] = [ a  b ] [ cs -sn ]
+                         ! [ cc  dd ]   [ c  d ] [ sn  cs ]
+                 aa = a*cs + b*sn
+                 bb = -a*sn + b*cs
+                 cc = c*cs + d*sn
+                 dd = -c*sn + d*cs
+                 ! compute [ a  b ] = [ cs  sn ] [ aa  bb ]
+                         ! [ c  d ]   [-sn  cs ] [ cc  dd ]
+                 a = aa*cs + cc*sn
+                 b = bb*cs + dd*sn
+                 c = -aa*sn + cc*cs
+                 d = -bb*sn + dd*cs
+                 temp = half*( a+d )
+                 a = temp
+                 d = temp
+                 if( c/=zero ) then
+                    if( b/=zero ) then
+                       if( sign( one, b )==sign( one, c ) ) then
+                          ! real eigenvalues: reduce to upper triangular form
+                          sab = sqrt( abs( b ) )
+                          sac = sqrt( abs( c ) )
+                          p = sign( sab*sac, c )
+                          tau = one / sqrt( abs( b+c ) )
+                          a = temp + p
+                          d = temp - p
+                          b = b - c
+                          c = zero
+                          cs1 = sab*tau
+                          sn1 = sac*tau
+                          temp = cs*cs1 - sn*sn1
+                          sn = cs*sn1 + sn*cs1
+                          cs = temp
+                       end if
+                    else
+                       b = -c
+                       c = zero
+                       temp = cs
+                       cs = -sn
+                       sn = temp
+                    end if
+                 end if
+              end if
+           end if
+           ! store eigenvalues in (rt1r,rt1i) and (rt2r,rt2i).
+           rt1r = a
+           rt2r = d
+           if( c==zero ) then
+              rt1i = zero
+              rt2i = zero
+           else
+              rt1i = sqrt( abs( b ) )*sqrt( abs( c ) )
+              rt2i = -rt1i
+           end if
+           return
+     end subroutine stdlib_qlanv2
+
+     !> DLAORHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine DLAORHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_qlaorhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAORHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DLAORHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_qlaorhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_qlaorhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qlaorhr_col_getrfnp
+
+     !> DLAORHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> DLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine DLAORHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, DLAORHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without DLAORHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_qlaorhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAORHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_qlamch('S')
+              ! construct the subdiagonal elements of l
+              if( abs( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_qscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_qlaorhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_qtrsm( 'R', 'U', 'N', 'N', m-n1, n1, one, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_qtrsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_qgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_qlaorhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_qlaorhr_col_getrfnp2
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_qlapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(qp), intent(out) :: ssmin
+           ! Array Arguments 
+           real(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: a11, a12, a22, c, ssmax, tau
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_qlarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = one
+           c = -tau*stdlib_qdot( n, x, incx, y, incy )
+           call stdlib_qaxpy( n, c, x, incx, y, incy )
+           call stdlib_qlarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_qlas2( a11, a12, a22, ssmin, ssmax )
+           return
+     end subroutine stdlib_qlapll
+
+     !> DLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_qlapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           real(qp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_qlapmr
+
+     !> DLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_qlapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, in, j
+           real(qp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_qlapmt
+
+     !> DLAPY2: returns sqrt(x**2+y**2), taking care not to cause unnecessary
+     !> overflow and unnecessary underflow.
+
+     pure real(qp) function stdlib_qlapy2( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: x, y
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: w, xabs, yabs, z, hugeval
+           logical(lk) :: x_is_nan, y_is_nan
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           x_is_nan = stdlib_qisnan( x )
+           y_is_nan = stdlib_qisnan( y )
+           if ( x_is_nan ) stdlib_qlapy2 = x
+           if ( y_is_nan ) stdlib_qlapy2 = y
+           hugeval = stdlib_qlamch( 'OVERFLOW' )
+           if ( .not.( x_is_nan.or.y_is_nan ) ) then
+              xabs = abs( x )
+              yabs = abs( y )
+              w = max( xabs, yabs )
+              z = min( xabs, yabs )
+              if( z==zero .or. w>hugeval ) then
+                 stdlib_qlapy2 = w
+              else
+                 stdlib_qlapy2 = w*sqrt( one+( z / w )**2 )
+              end if
+           end if
+           return
+     end function stdlib_qlapy2
+
+     !> DLAPY3: returns sqrt(x**2+y**2+z**2), taking care not to cause
+     !> unnecessary overflow and unnecessary underflow.
+
+     pure real(qp) function stdlib_qlapy3( x, y, z )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: x, y, z
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: w, xabs, yabs, zabs, hugeval
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           hugeval = stdlib_qlamch( 'OVERFLOW' )
+           xabs = abs( x )
+           yabs = abs( y )
+           zabs = abs( z )
+           w = max( xabs, yabs, zabs )
+           if( w==zero .or. w>hugeval ) then
+           ! w can be zero for max(0,nan,0)
+           ! adding all three entries together will make sure
+           ! nan will not disappear.
+              stdlib_qlapy3 =  xabs + yabs + zabs
+           else
+              stdlib_qlapy3 = w*sqrt( ( xabs / w )**2+( yabs / w )**2+( zabs / w )**2 )
+           end if
+           return
+     end function stdlib_qlapy3
+
+     !> DLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_qlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_qlaqgb
+
+     !> DLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_qlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_qlaqge
+
+     !> DLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_qlaqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: a(lda,*), vn1(*), vn2(*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(qp) :: aii, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_qlamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_iqamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_qswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_qlarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_qlarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**t to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = one
+                 call stdlib_qlarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,tau( i ), a( offpi, &
+                           i+1 ), lda, work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_qnrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_qlaqp2
+
+     !> DLAQPS: computes a step of QR factorization with column pivoting
+     !> of a real M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_qlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*), vn1(*), vn2(*)
+           real(qp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(qp) :: akk, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,nint,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_qlamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_iqamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_qswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_qswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**t.
+              if( k>1 ) then
+                 call stdlib_qgemv( 'NO TRANSPOSE', m-rk+1, k-1, -one, a( rk, 1 ),lda, f( k, 1 ), &
+                           ldf, one, a( rk, k ), 1 )
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_qlarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_qlarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = one
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**t*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_qgemv( 'TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), lda, a( rk, &
+                           k ), 1, zero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = zero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**t
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_qgemv( 'TRANSPOSE', m-rk+1, k-1, -tau( k ), a( rk, 1 ),lda, a( rk, k &
+                           ), 1, zero, auxv( 1 ), 1 )
+                 call stdlib_qgemv( 'NO TRANSPOSE', n, k-1, one, f( 1, 1 ), ldf,auxv( 1 ), 1, one,&
+                            f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**t.
+              if( k<n ) then
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-k, k, -one, f( k+1, 1 ), ldf,a( rk, 1 ), &
+                           lda, one, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=qp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**t.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-rk, n-kb, kb, -one,a( rk+1, 1 ), &
+                        lda, f( kb+1, 1 ), ldf, one,a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           40 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_qnrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_dnrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_qlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 40
+           end if
+           return
+     end subroutine stdlib_qlaqps
+
+     !> DLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_qlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(qp), parameter :: wilk1 = 0.75_qp
+           real(qp), parameter :: wilk2 = -0.4375_qp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_qlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(qp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,mod
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_qlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_qlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_qlaqr3 ====
+              call stdlib_qlaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_qlaqr5, stdlib_qlaqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=qp)
+                 return
+              end if
+              ! ==== stdlib_qlahqr/stdlib_qlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'DLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_qlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_qlaqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_qlaqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_qlanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_qlaqr4 or
+                       ! .    stdlib_qlahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_qlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_qlaqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, work,lwork, inf )
+                          else
+                             call stdlib_qlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_qlanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_qlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=qp)
+     end subroutine stdlib_qlaqr0
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, DLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+     !> scaling to avoid overflows and most underflows. It
+     !> is assumed that either
+     !> 1) sr1 = sr2 and si1 = -si2
+     !> or
+     !> 2) si1 = si2 = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_qlaqr1( n, h, ldh, sr1, si1, sr2, si2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: si1, si2, sr1, sr2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           real(qp), intent(in) :: h(ldh,*)
+           real(qp), intent(out) :: v(*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(qp) :: h21s, h31s, s
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) - si1*( &
+                           si2 / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 )
+              end if
+           else
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) ) +abs( h( 3, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+                 v( 3 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) -si1*( si2 / s ) + h( 1, 2 )&
+                           *h21s + h( 1, 3 )*h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 ) +h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-sr1-sr2 ) +h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_qlaqr1
+
+     !> DLAQR2: is identical to DLAQR3 except that it avoids
+     !> recursion by calling DLAHQR instead of DLAQR4.
+     !> Aggressive early deflation:
+     !> This subroutine accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_qlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwkopt
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_qgehrd ====
+              call stdlib_qgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_qormhr ====
+              call stdlib_qormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=qp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_qlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_qcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_qlaset( 'A', jw, jw, zero, one, v, ldv )
+           call stdlib_qlahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, jw, &
+                     v, ldv, infqr )
+           ! ==== stdlib_qtrexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not.bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_qtrexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_qtrexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_qlanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_qcopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_qlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_qlaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_qlarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_qlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_qlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_qgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_qlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_qcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_qormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_qgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_qlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_qgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_qlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_qgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_qlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=qp)
+     end subroutine stdlib_qlaqr2
+
+     !> Aggressive early deflation:
+     !> DLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_qlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwk3, lwkopt, nmin
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_qgehrd ====
+              call stdlib_qgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_qormhr ====
+              call stdlib_qormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_qlaqr4 ====
+              call stdlib_qlaqr4( .true., .true., jw, 1, jw, t, ldt, sr, si, 1, jw,v, ldv, work, -&
+                        1, infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=qp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_qlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_qcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_qlaset( 'A', jw, jw, zero, one, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'DLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_qlaqr4( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, work, lwork, infqr )
+           else
+              call stdlib_qlahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, infqr )
+           end if
+           ! ==== stdlib_qtrexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not. bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_qtrexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_qtrexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_qtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_qlanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_qcopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_qlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_qlaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_qlarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_qlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_qlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_qgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_qlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_qcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_qormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_qgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_qlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_qgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_qlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_qgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_qlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=qp)
+     end subroutine stdlib_qlaqr3
+
+     !> DLAQR4: implements one level of recursion for DLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by DLAQR0 and, for large enough
+     !> deflation window size, it may be called by DLAQR3.  This
+     !> subroutine is identical to DLAQR0 except that it calls DLAQR2
+     !> instead of DLAQR3.
+     !> DLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_qlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(qp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(qp), parameter :: wilk1 = 0.75_qp
+           real(qp), parameter :: wilk2 = -0.4375_qp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_qlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(qp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(qp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,max,min,mod
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_qlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_qlahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_qlaqr2 ====
+              call stdlib_qlaqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_qlaqr5, stdlib_qlaqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=qp)
+                 return
+              end if
+              ! ==== stdlib_qlahqr/stdlib_qlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'DLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_qlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_qlaqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_qlaqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_qlanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_qlahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_qlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_qlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( ks &
+                                    ), wi( ks ),1, 1, zdum, 1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_qlanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_qlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=qp)
+     end subroutine stdlib_qlaqr4
+
+     !> DLAQR5:, called by DLAQR0, performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_qlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts,sr, si, h, ldh, &
+               iloz, ihiz, z, ldz, v, ldv, u,ldu, nv, wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(qp), intent(inout) :: h(ldh,*), si(*), sr(*), z(ldz,*)
+           real(qp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(qp) :: alpha, beta, h11, h12, h21, h22, refsum, safmax, safmin, scl, smlnum, swap,&
+                      tst1, tst2, ulp
+           integer(ilp) :: i, i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, &
+                     krcol, m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,mod
+           ! Local Arrays 
+           real(qp) :: vt(3)
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== shuffle shifts into pairs of real shifts and pairs
+           ! .    of complex conjugate shifts assuming complex
+           ! .    conjugate shifts are already adjacent to one
+           ! .    another. ====
+           do i = 1, nshfts - 2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+              end if
+           end do
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by one.  the shuffle above
+           ! .    ensures that the dropped shift is real and that
+           ! .    the remaining shifts are paired. ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = zero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_qlaset( 'ALL', kdu, kdu, zero, one, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    one column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_qlaqr1( 2, h( k+1, k+1 ), ldh, sr( 2*m22-1 ),si( 2*m22-1 ), sr(&
+                                  2*m22 ), si( 2*m22 ),v( 1, m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_qlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_qlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = zero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = v( 1, m22 )*( h( k+1, j )+v( 2, m22 )*h( k+2, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop ) then
+                       if( h( k+1, k )/=zero ) then
+                          tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                          if( tst1==zero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                          end if
+                          if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) ) then
+                             h12 = max( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h21 = min( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                       then
+                                h( k+1, k ) = zero
+                             end if
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m22 )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m22 )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_qlaqr1( 3, h( ktop, ktop ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                 2*m ), si( 2*m ),v( 1, m ) )
+                       alpha = v( 1, m )
+                       call stdlib_qlarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually zero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*v( 2, m )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) - refsum*v( 3, m )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge one step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_qlarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=zero .or. h( k+3, k+1 )/=zero .or. h( k+3, k+2 )==zero ) &
+                                 then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = zero
+                          h( k+3, k ) = zero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new one. ====
+                          call stdlib_qlaqr1( 3, h( k+1, k+1 ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                    2*m ), si( 2*m ),vt )
+                          alpha = vt( 1 )
+                          call stdlib_qlarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = vt( 1 )*( h( k+1, k )+vt( 2 )*h( k+2, k ) )
+                          if( abs( h( k+2, k )-refsum*vt( 2 ) )+abs( refsum*vt( 3 ) )>ulp*( abs( &
+                                    h( k, k ) )+abs( h( k+1,k+1 ) )+abs( h( k+2, k+2 ) ) ) ) then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old one with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new one. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m )
+                       h( j, k+3 ) = h( j, k+3 ) - refsum*v( 3, m )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum = v( 1, m )*( h( k+1, k+1 )+v( 2, m )*h( k+2, k+1 )+v( 3, m )*h( k+3, &
+                              k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=zero ) then
+                       tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                       if( tst1==zero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                       end if
+                       if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h21 = min( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                    then
+                             h( k+1, k ) = zero
+                          end if
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = v( 1, m )*( h( k+1, j )+v( 2, m )*h( k+2, j )+v( 3, m )*h( k+3, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m )
+                          u( j, kms+3 ) = u( j, kms+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m )
+                          z( j, k+3 ) = z( j, k+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_qgemm( 'C', 'N', nu, jlen, nu, one, u( k1, k1 ),ldu, h( incol+k1, &
+                              jcol ), ldh, zero, wh,ldwh )
+                    call stdlib_qlacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_qgemm( 'N', 'N', jlen, nu, nu, one,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, zero, wv, ldwv )
+                    call stdlib_qlacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_qgemm( 'N', 'N', jlen, nu, nu, one,z( jrow, incol+k1 ), ldz, u(&
+                                  k1, k1 ),ldu, zero, wv, ldwv )
+                       call stdlib_qlacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_qlaqr5
+
+     !> DLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_qlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_qlaqsb
+
+     !> DLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_qlaqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_qlaqsp
+
+     !> DLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_qlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_qlaqsy
+
+     !> DLAQTR: solves the real quasi-triangular system
+     !> op(T)*p = scale*c,               if LREAL = .TRUE.
+     !> or the complex quasi-triangular systems
+     !> op(T + iB)*(p+iq) = scale*(c+id),  if LREAL = .FALSE.
+     !> in real arithmetic, where T is upper quasi-triangular.
+     !> If LREAL = .FALSE., then the first diagonal block of T must be
+     !> 1 by 1, B is the specially structured matrix
+     !> B = [ b(1) b(2) ... b(n) ]
+     !> [       w            ]
+     !> [           w        ]
+     !> [              .     ]
+     !> [                 w  ]
+     !> op(A) = A or A**T, A**T denotes the transpose of
+     !> matrix A.
+     !> On input, X = [ c ].  On output, X = [ p ].
+     !> [ d ]                  [ q ]
+     !> This subroutine is designed for the condition number estimation
+     !> in routine DTRSNA.
+
+     subroutine stdlib_qlaqtr( ltran, lreal, n, t, ldt, b, w, scale, x, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: lreal, ltran
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldt, n
+           real(qp), intent(out) :: scale
+           real(qp), intent(in) :: w
+           ! Array Arguments 
+           real(qp), intent(in) :: b(*), t(ldt,*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, j1, j2, jnext, k, n1, n2
+           real(qp) :: bignum, eps, rec, scaloc, si, smin, sminw, smlnum, sr, tjj, tmp, xj, xmax, &
+                     xnorm, z
+           ! Local Arrays 
+           real(qp) :: d(2,2), v(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! do not test the input parameters for errors
+           notran = .not.ltran
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           xnorm = stdlib_qlange( 'M', n, n, t, ldt, d )
+           if( .not.lreal )xnorm = max( xnorm, abs( w ), stdlib_qlange( 'M', n, 1, b, n, d ) )
+                     
+           smin = max( smlnum, eps*xnorm )
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = stdlib_qasum( j-1, t( 1, j ), 1 )
+           end do
+           if( .not.lreal ) then
+              do i = 2, n
+                 work( i ) = work( i ) + abs( b( i ) )
+              end do
+           end if
+           n2 = 2*n
+           n1 = n
+           if( .not.lreal )n1 = n2
+           k = stdlib_iqamax( n1, x, 1 )
+           xmax = abs( x( k ) )
+           scale = one
+           if( xmax>bignum ) then
+              scale = bignum / xmax
+              call stdlib_qscal( n1, scale, x, 1 )
+              xmax = bignum
+           end if
+           if( lreal ) then
+              if( notran ) then
+                 ! solve t*p = scale*c
+                 jnext = n
+                 loop_30: do j = n, 1, -1
+                    if( j>jnext )cycle loop_30
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! meet 1 by 1 diagonal block
+                       ! scale to avoid overflow when computing
+                           ! x(j) = b(j)/t(j,j)
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_30
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xj = abs( x( j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_qaxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          k = stdlib_iqamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       ! call 2 by 2 linear system solve, to take
+                       ! care of possible overflow by scaling factor.
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       call stdlib_qlaln2( .false., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_qscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       ! scale v(1,1) (= x(j1)) and/or v(2,1) (=x(j2))
+                       ! to avoid overflow in updating right-hand side.
+                       xj = max( abs( v( 1, 1 ) ), abs( v( 2, 1 ) ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update right-hand side
+                       if( j1>1 ) then
+                          call stdlib_qaxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_qaxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          k = stdlib_iqamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    end if
+                 end do loop_30
+              else
+                 ! solve t**t*p = scale*c
+                 jnext = 1
+                 loop_40: do j = 1, n
+                    if( j<jnext )cycle loop_40
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xmax = max( xmax, abs( x( j1 ) ) )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side elements by inner product.
+                       xj = max( abs( x( j1 ) ), abs( x( j2 ) ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j2 ), work( j1 ) )>( bignum-xj )*rec ) then
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_qdot( j1-1, t( 1, j2 ), 1, x,1 )
+                       call stdlib_qlaln2( .true., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d, &
+                                 2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_qscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       xmax = max( abs( x( j1 ) ), abs( x( j2 ) ), xmax )
+                    end if
+                 end do loop_40
+              end if
+           else
+              sminw = max( eps*abs( w ), smin )
+              if( notran ) then
+                 ! solve (t + ib)*(p+iq) = c+id
+                 jnext = n
+                 loop_70: do j = n, 1, -1
+                    if( j>jnext )cycle loop_70
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in division
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_70
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_qladiv( x( j1 ), x( n+j1 ), tmp, z, sr, si )
+                       x( j1 ) = sr
+                       x( n+j1 ) = si
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_qaxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_qaxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( xmax, abs( x( k ) )+abs( x( k+n ) ) )
+                          end do
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       d( 1, 2 ) = x( n+j1 )
+                       d( 2, 2 ) = x( n+j2 )
+                       call stdlib_qlaln2( .false., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, &
+                                 d, 2, zero, -w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_qscal( 2*n, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       ! scale x(j1), .... to avoid overflow in
+                       ! updating right hand side.
+                       xj = max( abs( v( 1, 1 ) )+abs( v( 1, 2 ) ),abs( v( 2, 1 ) )+abs( v( 2, 2 )&
+                                  ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update the right-hand side.
+                       if( j1>1 ) then
+                          call stdlib_qaxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_qaxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          call stdlib_qaxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          call stdlib_qaxpy( j1-1, -x( n+j2 ), t( 1, j2 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 ) +b( j2 )*x( n+j2 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 ) -b( j2 )*x( j2 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( abs( x( k ) )+abs( x( k+n ) ),xmax )
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! solve (t + ib)**t*(p+iq) = c+id
+                 jnext = 1
+                 loop_80: do j = 1, n
+                    if( j<jnext )cycle loop_80
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       x( n+j1 ) = x( n+j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       if( j1>1 ) then
+                          x( j1 ) = x( j1 ) - b( j1 )*x( n+1 )
+                          x( n+j1 ) = x( n+j1 ) + b( j1 )*x( 1 )
+                       end if
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       ! scale if necessary to avoid overflow in
+                       ! complex division
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_qladiv( x( j1 ), x( n+j1 ), tmp, -z, sr, si )
+                       x( j1 ) = sr
+                       x( j1+n ) = si
+                       xmax = max( abs( x( j1 ) )+abs( x( j1+n ) ), xmax )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) ) )
+                                 
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xj ) / xmax ) then
+                             call stdlib_qscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_qdot( j1-1, t( 1, j2 ), 1, x,1 )
+                       d( 1, 2 ) = x( n+j1 ) - stdlib_qdot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 2, 2 ) = x( n+j2 ) - stdlib_qdot( j1-1, t( 1, j2 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 1, 1 ) = d( 1, 1 ) - b( j1 )*x( n+1 )
+                       d( 2, 1 ) = d( 2, 1 ) - b( j2 )*x( n+1 )
+                       d( 1, 2 ) = d( 1, 2 ) + b( j1 )*x( 1 )
+                       d( 2, 2 ) = d( 2, 2 ) + b( j2 )*x( 1 )
+                       call stdlib_qlaln2( .true., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_qscal( n2, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       xmax = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) )&
+                                 , xmax )
+                    end if
+                 end do loop_80
+              end if
+           end if
+           return
+     end subroutine stdlib_qlaqtr
+
+     !> DLAQZ0: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by DGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from DGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_qlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alphar, &
+               alphai, beta,q, ldq, z, ldz, work, lwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           real(qp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), alphar(&
+                      * ),alphai( * ), beta( * ), work( * )
+           
+           ! local scalars
+           real(qp) :: smlnum, ulp, eshift, safmin, safmax, c1, s1, temp, swap
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_qeflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost, i
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=qp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'DLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=qp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_qhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai,&
+                         beta, q, ldq, z, ldz, work,lwork, info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_qlaqz3
+           nw = max( nwr, nmin )
+           call stdlib_qlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_qeflated, alphar,alphai, beta, work, nw, work, nw, work, -1, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_qlaqz4
+           call stdlib_qlaqz4( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alphar,alphai, beta, a, &
+                     lda, b, ldb, q, ldq, z, ldz, work,nbr, work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=qp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_qlaset( 'FULL', n, n, zero, one, q, ldq )
+           if( iwantz==3 ) call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! get machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 3*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop-1, istop-2 ) ) <= max( smlnum,ulp*( abs( a( istop-1, istop-1 ) )+&
+                        abs( a( istop-2,istop-2 ) ) ) ) ) then
+                 a( istop-1, istop-2 ) = zero
+                 istop = istop-2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+&
+                        abs( a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = zero
+                 istop = istop-1
+                 ld = 0
+                 eshift = zero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+2, istart+1 ) ) <= max( smlnum,ulp*( abs( a( istart+1, istart+1 &
+                        ) )+abs( a( istart+2,istart+2 ) ) ) ) ) then
+                 a( istart+2, istart+1 ) = zero
+                 istart = istart+2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart )&
+                         )+abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = zero
+                 istart = istart+1
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = zero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 temp = zero
+                 if( k < istop ) then
+                    temp = temp+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    temp = temp+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*temp ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_qlartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = zero
+                       call stdlib_qrot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_qrot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_qrot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_qlartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = zero
+                          call stdlib_qrot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_qrot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_qrot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, s1 )
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_qlartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = zero
+                       call stdlib_qrot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_qrot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_qrot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, s1 )
+                                    
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = zero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using stdlib_qhgeqz because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_qlaqz3( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_qeflated,alphar, alphai, beta, work, nw, work( nw**2+1 ),&
+                         nw, work( 2*nw**2+1 ), lwork-2*nw**2, rec,aed_info )
+              if ( n_qeflated > 0 ) then
+                 istop = istop-n_qeflated
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( 100*n_qeflated > nibble*( n_qeflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_qeflated-n_undeflated+1
+              ! shuffle shifts to put double shifts in front
+              ! this ensures that we don't split up a double shift
+              do i = shiftpos, shiftpos+n_undeflated-1, 2
+                 if( alphai( i )/=-alphai( i+1 ) ) then
+                    swap = alphar( i )
+                    alphar( i ) = alphar( i+1 )
+                    alphar( i+1 ) = alphar( i+2 )
+                    alphar( i+2 ) = swap
+                    swap = alphai( i )
+                    alphai( i ) = alphai( i+1 )
+                    alphai( i+1 ) = alphai( i+2 )
+                    alphai( i+2 ) = swap
+                    swap = beta( i )
+                    beta( i ) = beta( i+1 )
+                    beta( i+1 ) = beta( i+2 )
+                    beta( i+2 ) = swap
+                 end if
+              end do
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=qp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+one/( safmin*real( maxit,KIND=qp) )
+                 end if
+                 alphar( shiftpos ) = one
+                 alphar( shiftpos+1 ) = zero
+                 alphai( shiftpos ) = zero
+                 alphai( shiftpos+1 ) = zero
+                 beta( shiftpos ) = eshift
+                 beta( shiftpos+1 ) = eshift
+                 ns = 2
+              end if
+              ! time for a qz sweep
+              call stdlib_qlaqz4( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alphar( &
+              shiftpos ), alphai( shiftpos ),beta( shiftpos ), a, lda, b, ldb, q, ldq, z, ldz,&
+              work, nblock, work( nblock**2+1 ), nblock,work( 2*nblock**2+1 ), lwork-2*nblock**2,&
+                        sweep_info )
+           end do
+           ! call stdlib_qhgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_qhgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_qhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                  beta, q, ldq, z, ldz, work, lwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_qlaqz0
+
+     !> Given a 3-by-3 matrix pencil (A,B), DLAQZ1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (A - (beta2*sr2 - i*si)*B)*B^(-1)*(beta1*A - (sr2 + i*si2)*B)*B^(-1).
+     !> It is assumed that either
+     !> 1) sr1 = sr2
+     !> or
+     !> 2) si = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QZ algorithm.
+
+     pure subroutine stdlib_qlaqz1( a, lda, b, ldb, sr1, sr2, si, beta1, beta2,v )
+           ! arguments
+           integer(ilp), intent( in ) :: lda, ldb
+           real(qp), intent( in ) :: a( lda, * ), b( ldb, * ), sr1,sr2, si, beta1, beta2
+           real(qp), intent( out ) :: v( * )
+           
+           ! local scalars
+           real(qp) :: w(2), safmin, safmax, scale1, scale2
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           ! calculate first shifted vector
+           w( 1 ) = beta1*a( 1, 1 )-sr1*b( 1, 1 )
+           w( 2 ) = beta1*a( 2, 1 )-sr1*b( 2, 1 )
+           scale1 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale1 >= safmin .and. scale1 <= safmax ) then
+              w( 1 ) = w( 1 )/scale1
+              w( 2 ) = w( 2 )/scale1
+           end if
+           ! solve linear system
+           w( 2 ) = w( 2 )/b( 2, 2 )
+           w( 1 ) = ( w( 1 )-b( 1, 2 )*w( 2 ) )/b( 1, 1 )
+           scale2 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale2 >= safmin .and. scale2 <= safmax ) then
+              w( 1 ) = w( 1 )/scale2
+              w( 2 ) = w( 2 )/scale2
+           end if
+           ! apply second shift
+           v( 1 ) = beta2*( a( 1, 1 )*w( 1 )+a( 1, 2 )*w( 2 ) )-sr2*( b( 1,1 )*w( 1 )+b( 1, 2 )*w(&
+                      2 ) )
+           v( 2 ) = beta2*( a( 2, 1 )*w( 1 )+a( 2, 2 )*w( 2 ) )-sr2*( b( 2,1 )*w( 1 )+b( 2, 2 )*w(&
+                      2 ) )
+           v( 3 ) = beta2*( a( 3, 1 )*w( 1 )+a( 3, 2 )*w( 2 ) )-sr2*( b( 3,1 )*w( 1 )+b( 3, 2 )*w(&
+                      2 ) )
+           ! account for imaginary part
+           v( 1 ) = v( 1 )+si*si*b( 1, 1 )/scale1/scale2
+           ! check for overflow
+           if( abs( v( 1 ) )>safmax .or. abs( v( 2 ) ) > safmax .or.abs( v( 3 ) )>safmax .or. &
+           stdlib_qisnan( v( 1 ) ) .or.stdlib_qisnan( v( 2 ) ) .or. stdlib_qisnan( v( 3 ) ) ) &
+                     then
+              v( 1 ) = zero
+              v( 2 ) = zero
+              v( 3 ) = zero
+           end if
+     end subroutine stdlib_qlaqz1
+
+     !> DLAQZ2: chases a 2x2 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_qlaqz2( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           ! local variables
+           real(qp) :: h(2,3), c1, s1, c2, s2, temp
+           if( k+2 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              h = b( ihi-1:ihi, ihi-2:ihi )
+              ! make h upper triangular
+              call stdlib_qlartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_qrot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              call stdlib_qlartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_qrot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_qlartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              call stdlib_qrot( ihi-istartm+1, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_qrot( ihi-istartm+1, b( istartm, ihi-1 ), 1, b( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              b( ihi-1, ihi-2 ) = zero
+              b( ihi, ihi-2 ) = zero
+              call stdlib_qrot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_qrot( ihi-istartm+1, a( istartm, ihi-1 ), 1, a( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              if ( ilz ) then
+                 call stdlib_qrot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+                 call stdlib_qrot( nz, z( 1, ihi-1-zstart+1 ), 1, z( 1,ihi-2-zstart+1 ), 1, c2, &
+                           s2 )
+              end if
+              call stdlib_qlartg( a( ihi-1, ihi-2 ), a( ihi, ihi-2 ), c1, s1,temp )
+              a( ihi-1, ihi-2 ) = temp
+              a( ihi, ihi-2 ) = zero
+              call stdlib_qrot( istopm-ihi+2, a( ihi-1, ihi-1 ), lda, a( ihi,ihi-1 ), lda, c1, s1 &
+                        )
+              call stdlib_qrot( istopm-ihi+2, b( ihi-1, ihi-1 ), ldb, b( ihi,ihi-1 ), ldb, c1, s1 &
+                        )
+              if ( ilq ) then
+                 call stdlib_qrot( nq, q( 1, ihi-1-qstart+1 ), 1, q( 1, ihi-qstart+1 ), 1, c1, s1 &
+                           )
+              end if
+              call stdlib_qlartg( b( ihi, ihi ), b( ihi, ihi-1 ), c1, s1, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = zero
+              call stdlib_qrot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, s1 )
+                        
+              call stdlib_qrot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              if ( ilz ) then
+                 call stdlib_qrot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+              end if
+           else
+              ! normal operation, move bulge down
+              h = b( k+1:k+2, k:k+2 )
+              ! make h upper triangular
+              call stdlib_qlartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_qrot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              ! calculate z1 and z2
+              call stdlib_qlartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_qrot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_qlartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              ! apply transformations from the right
+              call stdlib_qrot( k+3-istartm+1, a( istartm, k+2 ), 1, a( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_qrot( k+3-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c2, s2 )
+                        
+              call stdlib_qrot( k+2-istartm+1, b( istartm, k+2 ), 1, b( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_qrot( k+2-istartm+1, b( istartm, k+1 ), 1, b( istartm,k ), 1, c2, s2 )
+                        
+              if ( ilz ) then
+                 call stdlib_qrot( nz, z( 1, k+2-zstart+1 ), 1, z( 1, k+1-zstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_qrot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c2, s2 )
+                           
+              end if
+              b( k+1, k ) = zero
+              b( k+2, k ) = zero
+              ! calculate q1 and q2
+              call stdlib_qlartg( a( k+2, k ), a( k+3, k ), c1, s1, temp )
+              a( k+2, k ) = temp
+              a( k+3, k ) = zero
+              call stdlib_qlartg( a( k+1, k ), a( k+2, k ), c2, s2, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = zero
+              ! apply transformations from the left
+              call stdlib_qrot( istopm-k, a( k+2, k+1 ), lda, a( k+3, k+1 ), lda,c1, s1 )
+              call stdlib_qrot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda,c2, s2 )
+              call stdlib_qrot( istopm-k, b( k+2, k+1 ), ldb, b( k+3, k+1 ), ldb,c1, s1 )
+              call stdlib_qrot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb,c2, s2 )
+              if ( ilq ) then
+                 call stdlib_qrot( nq, q( 1, k+2-qstart+1 ), 1, q( 1, k+3-qstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_qrot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c2, s2 )
+                           
+              end if
+           end if
+     end subroutine stdlib_qlaqz2
+
+     !> DLAQZ3: performs AED
+
+     recursive subroutine stdlib_qlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alphar, alphai, beta, qc, ldqc,zc, ldzc, work, lwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           real(qp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), alphar(&
+                      * ),alphai( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           real(qp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           real(qp), intent(out) :: work(*)
+           
+           ! local scalars
+           logical(lk) :: bulge
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, dtgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(qp) :: s, smlnum, ulp, safmin, safmax, c1, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = zero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_qtgexc( .true., .true., jw, a, lda, b, ldb, qc, ldqc, zc,ldzc, ifst, ilst, &
+                     work, -1, dtgexc_info )
+           lworkreq = int( work( 1 ),KIND=ilp)
+           call stdlib_qlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work, -1, rec+1, qz_small_info )
+           lworkreq = max( lworkreq, int( work( 1 ),KIND=ilp)+2*jw**2 )
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ3', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alphar( kwtop ) = a( kwtop, kwtop )
+              alphai( kwtop ) = zero
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = zero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_qlacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_qlacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_qlaset( 'FULL', jw, jw, zero, one, qc, ldqc )
+           call stdlib_qlaset( 'FULL', jw, jw, zero, one, zc, ldzc )
+           call stdlib_qlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work( 2*jw**2+1 ), lwork-2*jw**2,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_qlacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_qlacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == zero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                 bulge = .false.
+                 if ( kwbot-kwtop+1 >= 2 ) then
+                    bulge = a( kwbot, kwbot-1 ) /= zero
+                 end if
+                 if ( bulge ) then
+                    ! try to deflate complex conjugate eigenvalue pair
+                    temp = abs( a( kwbot, kwbot ) )+sqrt( abs( a( kwbot,kwbot-1 ) ) )*sqrt( abs( &
+                              a( kwbot-1, kwbot ) ) )
+                    if( temp == zero )then
+                       temp = abs( s )
+                    end if
+                    if ( max( abs( s*qc( 1, kwbot-kwtop ) ), abs( s*qc( 1,kwbot-kwtop+1 ) ) ) <= &
+                              max( smlnum,ulp*temp ) ) then
+                       ! deflatable
+                       kwbot = kwbot-2
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_qtgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,dtgexc_info )
+                                 
+                       k2 = k2+2
+                    end if
+                    k = k+2
+                 else
+                    ! try to deflate real eigenvalue
+                    temp = abs( a( kwbot, kwbot ) )
+                    if( temp == zero ) then
+                       temp = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*temp, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_qtgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,dtgexc_info )
+                                 
+                       k2 = k2+1
+                    end if
+                    k = k+1
+                 end if
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              bulge = .false.
+              if ( k < ihi ) then
+                 if ( a( k+1, k ) /= zero ) then
+                    bulge = .true.
+                 end if
+              end if
+              if ( bulge ) then
+                 ! 2x2 eigenvalue block
+                 call stdlib_qlag2( a( k, k ), lda, b( k, k ), ldb, safmin,beta( k ), beta( k+1 ),&
+                            alphar( k ),alphar( k+1 ), alphai( k ) )
+                 alphai( k+1 ) = -alphai( k )
+                 k = k+2
+              else
+                 ! 1x1 eigenvalue block
+                 alphar( k ) = a( k, k )
+                 alphai( k ) = zero
+                 beta( k ) = b( k, k )
+                 k = k+1
+              end if
+           end do
+           if ( kwtop /= ilo .and. s /= zero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 )*qc( 1,1:jw-nd )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_qlartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = zero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_qrot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_qrot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_qrot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, s1 )
+                           
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 if ( ( k >= kwtop+1 ) .and. a( k+1, k-1 ) /= zero ) then
+                    ! move double pole block down and remove it
+                    do k2 = k-1, kwbot-2
+                       call stdlib_qlaqz2( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b,&
+                                  ldb, jw, kwtop, qc,ldqc, jw, kwtop, zc, ldzc )
+                    end do
+                    k = k-2
+                 else
+                    ! k points to single shift
+                    do k2 = k, kwbot-2
+                       ! move shift down
+                       call stdlib_qlartg( b( k2+1, k2+1 ), b( k2+1, k2 ), c1, s1,temp )
+                       b( k2+1, k2+1 ) = temp
+                       b( k2+1, k2 ) = zero
+                       call stdlib_qrot( k2+2-istartm+1, a( istartm, k2+1 ), 1,a( istartm, k2 ), &
+                                 1, c1, s1 )
+                       call stdlib_qrot( k2-istartm+1, b( istartm, k2+1 ), 1,b( istartm, k2 ), 1, &
+                                 c1, s1 )
+                       call stdlib_qrot( jw, zc( 1, k2+1-kwtop+1 ), 1, zc( 1,k2-kwtop+1 ), 1, c1, &
+                                 s1 )
+                       call stdlib_qlartg( a( k2+1, k2 ), a( k2+2, k2 ), c1, s1,temp )
+                       a( k2+1, k2 ) = temp
+                       a( k2+2, k2 ) = zero
+                       call stdlib_qrot( istopm-k2, a( k2+1, k2+1 ), lda, a( k2+2,k2+1 ), lda, c1,&
+                                  s1 )
+                       call stdlib_qrot( istopm-k2, b( k2+1, k2+1 ), ldb, b( k2+2,k2+1 ), ldb, c1,&
+                                  s1 )
+                       call stdlib_qrot( jw, qc( 1, k2+1-kwtop+1 ), 1, qc( 1,k2+2-kwtop+1 ), 1, &
+                                 c1, s1 )
+                    end do
+                    ! remove the shift
+                    call stdlib_qlartg( b( kwbot, kwbot ), b( kwbot, kwbot-1 ), c1,s1, temp )
+                              
+                    b( kwbot, kwbot ) = temp
+                    b( kwbot, kwbot-1 ) = zero
+                    call stdlib_qrot( kwbot-istartm, b( istartm, kwbot ), 1,b( istartm, kwbot-1 ),&
+                               1, c1, s1 )
+                    call stdlib_qrot( kwbot-istartm+1, a( istartm, kwbot ), 1,a( istartm, kwbot-1 &
+                              ), 1, c1, s1 )
+                    call stdlib_qrot( jw, zc( 1, kwbot-kwtop+1 ), 1, zc( 1,kwbot-1-kwtop+1 ), 1, &
+                              c1, s1 )
+                    k = k-1
+                 end if
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_qgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, zero, work, jw )
+              call stdlib_qlacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_qgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, zero, work, jw )
+              call stdlib_qlacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_qgemm( 'N', 'N', n, jw, jw, one, q( 1, kwtop ), ldq, qc,ldqc, zero, &
+                        work, n )
+              call stdlib_qlacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_qgemm( 'N', 'N', kwtop-istartm, jw, jw, one, a( istartm,kwtop ), lda, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_qlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop &
+                        ), lda )
+              call stdlib_qgemm( 'N', 'N', kwtop-istartm, jw, jw, one, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_qlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop &
+                        ), ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_qgemm( 'N', 'N', n, jw, jw, one, z( 1, kwtop ), ldz, zc,ldzc, zero, &
+                        work, n )
+              call stdlib_qlacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_qlaqz3
+
+     !> DLAQZ4: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_qlaqz4( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_qesired, sr, &
+               si, ss, a, lda, b, ldb, q,ldq, z, ldz, qc, ldqc, zc, ldzc, work, lwork,info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_qesired, ldqc, ldzc
+           real(qp), intent( inout ) :: a( lda, * ), b( ldb, * ),q( ldq, * ), z( ldz, * ), qc( &
+                     ldqc, * ),zc( ldzc, * ), work( * ), sr( * ), si( * ),ss( * )
+           integer(ilp), intent( out ) :: info
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(qp) :: temp, v(3), c1, s1, c2, s2, swap
+           info = 0
+           if ( nblock_qesired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_qesired
+              return
+           else if ( lwork < n*nblock_qesired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAQZ4', -info )
+              return
+           end if
+           ! executable statements
+           if ( nshifts < 2 ) then
+              return
+           end if
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ! shuffle shifts into pairs of real shifts and pairs
+           ! of complex conjugate shifts assuming complex
+           ! conjugate shifts are already adjacent to one
+           ! another
+           do i = 1, nshifts-2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+                 swap = ss( i )
+                 ss( i ) = ss( i+1 )
+                 ss( i+1 ) = ss( i+2 )
+                 ss( i+2 ) = swap
+              end if
+           end do
+           ! nshfts is supposed to be even, but if it is odd,
+           ! then simply reduce it by one.  the shuffle above
+           ! ensures that the dropped shift is real and that
+           ! the remaining shifts are paired.
+           ns = nshifts-mod( nshifts, 2 )
+           npos = max( nblock_qesired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_qlaset( 'FULL', ns+1, ns+1, zero, one, qc, ldqc )
+           call stdlib_qlaset( 'FULL', ns, ns, zero, one, zc, ldzc )
+           do i = 1, ns, 2
+              ! introduce the shift
+              call stdlib_qlaqz1( a( ilo, ilo ), lda, b( ilo, ilo ), ldb, sr( i ),sr( i+1 ), si( &
+                        i ), ss( i ), ss( i+1 ), v )
+              temp = v( 2 )
+              call stdlib_qlartg( temp, v( 3 ), c1, s1, v( 2 ) )
+              call stdlib_qlartg( v( 1 ), v( 2 ), c2, s2, temp )
+              call stdlib_qrot( ns, a( ilo+1, ilo ), lda, a( ilo+2, ilo ), lda, c1,s1 )
+              call stdlib_qrot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c2,s2 )
+              call stdlib_qrot( ns, b( ilo+1, ilo ), ldb, b( ilo+2, ilo ), ldb, c1,s1 )
+              call stdlib_qrot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c2,s2 )
+              call stdlib_qrot( ns+1, qc( 1, 2 ), 1, qc( 1, 3 ), 1, c1, s1 )
+              call stdlib_qrot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c2, s2 )
+              ! chase the shift down
+              do j = 1, ns-1-i
+                 call stdlib_qlaqz2( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ilo, ilo+ns &
+                        ), lda, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ilo, ilo+ns &
+                        ), ldb, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+              call stdlib_qgemm( 'N', 'N', n, sheight, sheight, one, q( 1, ilo ),ldq, qc, ldqc, &
+                        zero, work, n )
+              call stdlib_qlacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ilo ), lda, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ilo ), ldb, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_qgemm( 'N', 'N', n, swidth, swidth, one, z( 1, ilo ), ldz,zc, ldzc, &
+                        zero, work, n )
+              call stdlib_qlacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_qlaset( 'FULL', ns+np, ns+np, zero, one, qc, ldqc )
+              call stdlib_qlaset( 'FULL', ns+np, ns+np, zero, one, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -2
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j-1, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_qlaqz2( .true., .true., k+i+j-1, istartb, istopb,ihi, a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, zero, work,sheight )
+                 call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, zero, work,sheight )
+                 call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_qgemm( 'N', 'N', n, nblock, nblock, one, q( 1, k+1 ),ldq, qc, ldqc, &
+                           zero, work, n )
+                 call stdlib_qlacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one,a( istartm, k ), lda, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one,b( istartm, k ), ldb, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_qgemm( 'N', 'N', n, nblock, nblock, one, z( 1, k ),ldz, zc, ldzc, &
+                           zero, work, n )
+                 call stdlib_qlacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_qlaset( 'FULL', ns, ns, zero, one, qc, ldqc )
+           call stdlib_qlaset( 'FULL', ns+1, ns+1, zero, one, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns, 2
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i-1, ihi-2
+                 call stdlib_qlaqz2( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_qgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_qgemm( 'N', 'N', n, ns, ns, one, q( 1, ihi-ns+1 ), ldq,qc, ldqc, zero, &
+                        work, n )
+              call stdlib_qlacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ihi-ns ), lda,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_qgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ihi-ns ), ldb,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_qlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+              call stdlib_qgemm( 'N', 'N', n, ns+1, ns+1, one, z( 1, ihi-ns ), ldz,zc, ldzc, zero,&
+                         work, n )
+              call stdlib_qlacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_qlaqz4
+
+     !> DLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_qlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(qp), intent(in) :: gaptol, lambda, pivmin
+           real(qp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(qp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(qp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           eps = stdlib_qlamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_qisnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_qisnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_qisnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = one
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + z( i )*z( i )
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + z( i )*z( i )
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_qlar1v
+
+     !> DLAR2V: applies a vector of real plane rotations from both sides to
+     !> a sequence of 2-by-2 real symmetric matrices, defined by the elements
+     !> of the vectors x, y and z. For i = 1,2,...,n
+     !> ( x(i)  z(i) ) := (  c(i)  s(i) ) ( x(i)  z(i) ) ( c(i) -s(i) )
+     !> ( z(i)  y(i) )    ( -s(i)  c(i) ) ( z(i)  y(i) ) ( s(i)  c(i) )
+
+     pure subroutine stdlib_qlar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*), s(*)
+           real(qp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(qp) :: ci, si, t1, t2, t3, t4, t5, t6, xi, yi, zi
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( ix )
+              zi = z( ix )
+              ci = c( ic )
+              si = s( ic )
+              t1 = si*zi
+              t2 = ci*zi
+              t3 = t2 - si*xi
+              t4 = t2 + si*yi
+              t5 = ci*xi + t1
+              t6 = ci*yi - t1
+              x( ix ) = ci*t5 + si*t4
+              y( ix ) = ci*t6 - si*t3
+              z( ix ) = ci*t4 - si*t5
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_qlar2v
+
+     !> DLARF: applies a real elementary reflector H to a real m by n matrix
+     !> C, from either the left or the right. H is represented in the form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+
+     pure subroutine stdlib_qlarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           real(qp), intent(in) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(in) :: v(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=zero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-zero row in v.
+              do while( lastv>0 .and. v( i )==zero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-zero column in c(1:lastv,:).
+                 lastc = stdlib_ilaqlc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-zero row in c(:,1:lastv).
+                 lastc = stdlib_ilaqlr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_qp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**t * v(1:lastv,1)
+                 call stdlib_qgemv( 'TRANSPOSE', lastv, lastc, one, c, ldc, v, incv,zero, work, 1 &
+                           )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**t
+                 call stdlib_qger( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_qgemv( 'NO TRANSPOSE', lastc, lastv, one, c, ldc,v, incv, zero, work,&
+                            1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**t
+                 call stdlib_qger( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_qlarf
+
+     !> DLARFB: applies a real block reflector H or its transpose H**T to a
+     !> real m by n matrix C, from either the left or the right.
+
+     pure subroutine stdlib_qlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_qcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2
+                       call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c( k+1, 1 ),&
+                                  ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v( k+1, 1 )&
+                                 , ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_qcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( k+1, 1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_qcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( m-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1
+                       call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v, ldv, &
+                                 work, ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v( m-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_qcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( n-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c, ldc, &
+                                 v, ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v( n-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_qcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2**t
+                       call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c( k+1, 1 ), &
+                                 ldc, v( 1, k+1 ), ldv, one,work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2**t * w**t
+                       call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v( 1, k+1 ), &
+                                 ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_qcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c( 1, k+1 ),&
+                                  ldc, v( 1, k+1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( 1, k+1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_qcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v( 1, m-k+&
+                              1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1**t
+                       call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c, ldc, v, ldv,&
+                                  one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1**t * w**t
+                       call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v, ldv, work, &
+                                 ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h'  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_qcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v( 1, n-k+&
+                              1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**t
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qlarfb
+
+     !> DLARFB_GETT: applies a real Householder block reflector H from the
+     !> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_qlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(in) :: t(ldt,*)
+           real(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_qcopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**t) * w2 = (a1**t) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_qtrmm( 'L', 'L', 'T', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**t) * b2 = w2 + (b1**t) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_qgemm( 'T', 'N', k, n-k, m, one, b, ldb,b( 1, k+1 ), ldb, one, work, &
+                           ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_qtrmm( 'L', 'U', 'N', 'N', k, n-k, one, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_qgemm( 'N', 'N', m, n-k, k, -one, b, ldb,work, ldwork, one, b( 1, k+&
+                           1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_qtrmm( 'L', 'L', 'N', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_qcopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = zero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**t) * w1 = (a1**t) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_qtrmm( 'L', 'L', 'T', 'U', k, k, one, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_qtrmm( 'L', 'U', 'N', 'N', k, k, one, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', m, k, -one, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_qtrmm( 'L', 'L', 'N', 'U', k, k, one, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_qlarfb_gett
+
+     !> DLARFG: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, and x is an (n-1)-element real
+     !> vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+     !> Otherwise  1 <= tau <= 2.
+
+     pure subroutine stdlib_qlarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(qp), intent(inout) :: alpha
+           real(qp), intent(out) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(qp) :: beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=1 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_qnrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_qlapy2( alpha, xnorm ), alpha )
+              safmin = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'E' )
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 rsafmn = one / safmin
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_qscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alpha = alpha*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_qnrm2( n-1, x, incx )
+                 beta = -sign( stdlib_qlapy2( alpha, xnorm ), alpha )
+              end if
+              tau = ( beta-alpha ) / beta
+              call stdlib_qscal( n-1, one / ( alpha-beta ), x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_qlarfg
+
+     !> DLARFGP: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is non-negative, and x is
+     !> an (n-1)-element real vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+
+     subroutine stdlib_qlarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(qp), intent(inout) :: alpha
+           real(qp), intent(out) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(qp) :: beta, bignum, savealpha, smlnum, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_qnrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  [+/-1, 0; i], sign chosen so alpha >= 0
+              if( alpha>=zero ) then
+                 ! when tau.eq.zero, the vector is special-cased to be
+                 ! all zeros in the application routines.  we do not need
+                 ! to clear it.
+                 tau = zero
+              else
+                 ! however, the application routines rely on explicit
+                 ! zero checks when tau.ne.zero, and we must clear x.
+                 tau = two
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = 0
+                 end do
+                 alpha = -alpha
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_qlapy2( alpha, xnorm ), alpha )
+              smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'E' )
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 bignum = one / smlnum
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_qscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alpha = alpha*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_qnrm2( n-1, x, incx )
+                 beta = sign( stdlib_qlapy2( alpha, xnorm ), alpha )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alpha = xnorm * (xnorm/alpha)
+                 tau = alpha / beta
+                 alpha = -alpha
+              end if
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero. this explains the next if statement.
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 if( savealpha>=zero ) then
+                    tau = zero
+                 else
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = 0
+                    end do
+                    beta = -savealpha
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_qscal( n-1, one / alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_qlarfgp
+
+     !> DLARFT: forms the triangular factor T of a real block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+
+     pure subroutine stdlib_qlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(qp), intent(out) :: t(ldt,*)
+           real(qp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( i, prevlastv )
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( i , j )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**t * v(i:j,i)
+                       call stdlib_qgemv( 'TRANSPOSE', j-i, i-1, -tau( i ),v( i+1, 1 ), ldv, v( i+&
+                                 1, i ), 1, one,t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**t
+                       call stdlib_qgemv( 'NO TRANSPOSE', i-1, j-i, -tau( i ),v( 1, i+1 ), ldv, v(&
+                                  i, i+1 ), ldv, one,t( 1, i ), 1 )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_qtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                 end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( n-k+i , j )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**t * v(j:n-k+i,i)
+                          call stdlib_qgemv( 'TRANSPOSE', n-k+i-j, k-i, -tau( i ),v( j, i+1 ), &
+                                    ldv, v( j, i ), 1, one,t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**t
+                          call stdlib_qgemv( 'NO TRANSPOSE', k-i, n-k+i-j,-tau( i ), v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,one, t( i+1, i ), 1 )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_qtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qlarft
+
+     !> DLARFX: applies a real elementary reflector H to a real m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_qlarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           real(qp), intent(in) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(in) :: v(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           real(qp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, v6, &
+                     v7, v8, v9
+           ! Executable Statements 
+           if( tau==zero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_qlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_qlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+           410 continue
+           return
+     end subroutine stdlib_qlarfx
+
+     !> DLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n symmetric matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_qlarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           real(qp), intent(in) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(in) :: v(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: alpha
+           ! Executable Statements 
+           if( tau==zero )return
+           ! form  w:= c * v
+           call stdlib_qsymv( uplo, n, one, c, ldc, v, incv, zero, work, 1 )
+           alpha = -half*tau*stdlib_qdot( n, work, 1, v, incv )
+           call stdlib_qaxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_qsyr2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_qlarfy
+
+     !> DLARGV: generates a vector of real plane rotations, determined by
+     !> elements of the real vectors x and y. For i = 1,2,...,n
+     !> (  c(i)  s(i) ) ( x(i) ) = ( a(i) )
+     !> ( -s(i)  c(i) ) ( y(i) ) = (   0  )
+
+     pure subroutine stdlib_qlargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*)
+           real(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(qp) :: f, g, t, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_10: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              if( g==zero ) then
+                 c( ic ) = one
+              else if( f==zero ) then
+                 c( ic ) = zero
+                 y( iy ) = one
+                 x( ix ) = g
+              else if( abs( f )>abs( g ) ) then
+                 t = g / f
+                 tt = sqrt( one+t*t )
+                 c( ic ) = one / tt
+                 y( iy ) = t*c( ic )
+                 x( ix ) = f*tt
+              else
+                 t = f / g
+                 tt = sqrt( one+t*t )
+                 y( iy ) = one / tt
+                 c( ic ) = t*y( iy )
+                 x( ix ) = g*tt
+              end if
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_10
+           return
+     end subroutine stdlib_qlargv
+
+     !> DLARNV: returns a vector of n random real numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_qlarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(qp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(qp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, il2, iv
+           ! Local Arrays 
+           real(qp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cos,log,min,sqrt
+           ! Executable Statements 
+           do 40 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              if( idist==3 ) then
+                 il2 = 2*il
+              else
+                 il2 = il
+              end if
+              ! call stdlib_qlaruv to generate il2 numbers from a uniform (0,1)
+              ! distribution (il2 <= lv)
+              call stdlib_qlaruv( iseed, il2, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = u( i )
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = two*u( i ) - one
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*cos( twopi*u( 2*i ) )
+                 end do
+              end if
+              40 continue
+           return
+     end subroutine stdlib_qlarnv
+
+     !> Compute the splitting points with threshold SPLTOL.
+     !> DLARRA: sets any "small" off-diagonal elements to zero.
+
+     pure subroutine stdlib_qlarra( n, d, e, e2, spltol, tnrm,nsplit, isplit, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, nsplit
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: spltol, tnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isplit(*)
+           real(qp), intent(in) :: d(*)
+           real(qp), intent(inout) :: e(*), e2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: eabs, tmp1
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           if(spltol<zero) then
+              ! criterion based on absolute off-diagonal value
+              tmp1 = abs(spltol)* tnrm
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= tmp1) then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           else
+              ! criterion that guarantees relative accuracy
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= spltol * sqrt(abs(d(i)))*sqrt(abs(d(i+1))) )then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           endif
+           isplit( nsplit ) = n
+           return
+     end subroutine stdlib_qlarra
+
+     !> Given the relatively robust representation(RRR) L D L^T, DLARRB:
+     !> does "limited" bisection to refine the eigenvalues of L D L^T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses and their gaps are input in WERR
+     !> and WGAP, respectively. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_qlarrb( n, d, lld, ifirst, ilast, rtol1,rtol2, offset, w, wgap, werr, &
+               work, iwork,pivmin, spdiam, twist, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset, twist
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: pivmin, rtol1, rtol2, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: d(*), lld(*)
+           real(qp), intent(inout) :: w(*), werr(*), wgap(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ii, ip, iter, k, negcnt, next, nint, olnint, prev, r
+           real(qp) :: back, cvrgd, gap, left, lgap, mid, mnwdth, rgap, right, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           mnwdth = two * pivmin
+           r = twist
+           if((r<1).or.(r>n)) r = n
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           rgap = wgap( i1-offset )
+           loop_75: do i = i1, ilast
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              right = w( ii ) + werr( ii )
+              lgap = rgap
+              rgap = wgap( ii )
+              gap = min( lgap, rgap )
+              ! make sure that [left,right] contains the desired eigenvalue
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - left
+              ! do while( negcnt(left)>i-1 )
+              back = werr( ii )
+              20 continue
+              negcnt = stdlib_qlaneg( n, d, lld, left, pivmin, r )
+              if( negcnt>i-1 ) then
+                 left = left - back
+                 back = two*back
+                 go to 20
+              end if
+              ! do while( negcnt(right)<i )
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - right
+              back = werr( ii )
+              50 continue
+              negcnt = stdlib_qlaneg( n, d, lld, right, pivmin, r )
+               if( negcnt<i ) then
+                  right = right + back
+                  back = two*back
+                  go to 50
+               end if
+              width = half*abs( left - right )
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( width<=cvrgd .or. width<=mnwdth ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<ilast)) i1 = i + 1
+                 if((prev>=i1).and.(i<=ilast)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = negcnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do ip = 1, olnint
+              k = 2*i
+              ii = i - offset
+              rgap = wgap( ii )
+              lgap = rgap
+              if(ii>1) lgap = wgap( ii-1 )
+              gap = min( lgap, rgap )
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( ( width<=cvrgd ) .or. ( width<=mnwdth ).or.( iter==maxitr ) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              negcnt = stdlib_qlaneg( n, d, lld, mid, pivmin, r )
+              if( negcnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = ifirst, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           do i = ifirst+1, ilast
+              k = 2*i
+              ii = i - offset
+              wgap( ii-1 ) = max( zero,w(ii) - werr (ii) - w( ii-1 ) - werr( ii-1 ))
+           end do
+           return
+     end subroutine stdlib_qlarrb
+
+     !> Find the number of eigenvalues of the symmetric tridiagonal matrix T
+     !> that are in the interval (VL,VU] if JOBT = 'T', and of L D L^T
+     !> if JOBT = 'L'.
+
+     pure subroutine stdlib_qlarrc( jobt, n, vl, vu, d, e, pivmin,eigcnt, lcnt, rcnt, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobt
+           integer(ilp), intent(out) :: eigcnt, info, lcnt, rcnt
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: pivmin, vl, vu
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: matt
+           real(qp) :: lpivot, rpivot, sl, su, tmp, tmp2
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           lcnt = 0
+           rcnt = 0
+           eigcnt = 0
+           matt = stdlib_lsame( jobt, 'T' )
+           if (matt) then
+              ! sturm sequence count on t
+              lpivot = d( 1 ) - vl
+              rpivot = d( 1 ) - vu
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+              do i = 1, n-1
+                 tmp = e(i)**2
+                 lpivot = ( d( i+1 )-vl ) - tmp/lpivot
+                 rpivot = ( d( i+1 )-vu ) - tmp/rpivot
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+              end do
+           else
+              ! sturm sequence count on l d l^t
+              sl = -vl
+              su = -vu
+              do i = 1, n - 1
+                 lpivot = d( i ) + sl
+                 rpivot = d( i ) + su
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+                 tmp = e(i) * d(i) * e(i)
+                 tmp2 = tmp / lpivot
+                 if( tmp2==zero ) then
+                    sl =  tmp - vl
+                 else
+                    sl = sl*tmp2 - vl
+                 end if
+                 tmp2 = tmp / rpivot
+                 if( tmp2==zero ) then
+                    su =  tmp - vu
+                 else
+                    su = su*tmp2 - vu
+                 end if
+              end do
+              lpivot = d( n ) + sl
+              rpivot = d( n ) + su
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+           endif
+           eigcnt = rcnt - lcnt
+           return
+     end subroutine stdlib_qlarrc
+
+     !> DLARRD: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from DSTEMR.
+     !> The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_qlarrd( range, order, n, vl, vu, il, iu, gers,reltol, d, e, e2, &
+               pivmin, nsplit, isplit,m, w, werr, wl, wu, iblock, indexw,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n, nsplit
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: pivmin, reltol, vl, vu
+           real(qp), intent(out) :: wl, wu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), indexw(*), iwork(*)
+           integer(ilp), intent(in) :: isplit(*)
+           real(qp), intent(in) :: d(*), e(*), e2(*), gers(*)
+           real(qp), intent(out) :: w(*), werr(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: fudge = two
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: valrng = 2
+           integer(ilp), parameter :: indrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: i, ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iout, &
+                     irange, itmax, itmp1, itmp2, iw, iwoff, j, jblk, jdisc, je, jee, nb, nwl, nwu
+           real(qp) :: atoli, eps, gl, gu, rtoli, tmp1, tmp2, tnorm, uflow, wkill, wlu, &
+                     wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           else
+              irange = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( .not.(stdlib_lsame(order,'B').or.stdlib_lsame(order,'E')) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==valrng ) then
+              if( vl>=vu )info = -5
+           else if( irange==indrng .and.( il<1 .or. il>max( 1, n ) ) ) then
+              info = -6
+           else if( irange==indrng .and.( iu<min( n, il ) .or. iu>n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 ) return
+           ! simplification:
+           if( irange==indrng .and. il==1 .and. iu==n ) irange = 1
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           uflow = stdlib_qlamch( 'U' )
+           ! special case when n=1
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+              endif
+              return
+           end if
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           nb = stdlib_ilaenv( 1, 'DSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 ) nb = 0
+           ! find global spectral radius
+           gl = d(1)
+           gu = d(1)
+           do i = 1,n
+              gl =  min( gl, gers( 2*i - 1))
+              gu = max( gu, gers(2*i) )
+           end do
+           ! compute global gerschgorin bounds and spectral diameter
+           tnorm = max( abs( gl ), abs( gu ) )
+           gl = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           gu = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           ! [jan/28/2009] remove the line below since spdiam variable not use
+           ! spdiam = gu - gl
+           ! input arguments for stdlib_qlaebz:
+           ! the relative tolerance.  an interval (a,b] lies within
+           ! "relative tolerance" if  b-a < reltol*max(|a|,|b|),
+           rtoli = reltol
+           ! set the absolute tolerance for interval convergence to zero to force
+           ! interval convergence based on relative size of the interval.
+           ! this is dangerous because intervals might not converge when reltol is
+           ! small. but at least a very small number should be selected so that for
+           ! strongly graded matrices, the code can get relatively accurate
+           ! eigenvalues.
+           atoli = fudge*two*uflow + fudge*two*pivmin
+           if( irange==indrng ) then
+              ! range='i': compute an interval containing eigenvalues
+              ! il through iu. the initial interval [gl,gu] from the global
+              ! gerschgorin bounds gl and gu is refined by stdlib_qlaebz.
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_qlaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin,d, e, e2, iwork( 5 )&
+                        , work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iinfo /= 0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! on exit, output intervals may not be ordered by ascending negcount
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              ! on exit, the interval [wl, wlu] contains a value with negcount nwl,
+              ! and [wul, wu] contains a value with negcount nwu.
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           elseif( irange==valrng ) then
+              wl = vl
+              wu = vu
+           elseif( irange==allrng ) then
+              wl = gl
+              wu = gu
+           endif
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jblk = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jblk )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! 1x1 block
+                 if( wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==allrng .or.( wl<d( ibegin )-pivmin.and. wu>= d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                 end if
+              ! disabled 2x2 case because of a failure on the following matrix
+              ! range = 'i', il = iu = 4
+                ! original tridiagonal, d = [
+                 ! -0.150102010615740e+00_qp
+                 ! -0.849897989384260e+00_qp
+                 ! -0.128208148052635e-15_qp
+                  ! 0.128257718286320e-15_qp
+                ! ];
+                ! e = [
+                 ! -0.357171383266986e+00_qp
+                 ! -0.180411241501588e-15_qp
+                 ! -0.175152352710251e-15_qp
+                ! ];
+               ! else if( in==2 ) then
+      ! *           2x2 block
+                  ! disc = sqrt( (half*(d(ibegin)-d(iend)))**2 + e(ibegin)**2 )
+                  ! tmp1 = half*(d(ibegin)+d(iend))
+                  ! l1 = tmp1 - disc
+                  ! if( wl>= l1-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l1-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l1-pivmin .and. wu>=
+           ! $          l1-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l1
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 1
+                  ! endif
+                  ! l2 = tmp1 + disc
+                  ! if( wl>= l2-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l2-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l2-pivmin .and. wu>=
+           ! $          l2-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l2
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 2
+                  ! endif
+              else
+                 ! general case - block of size in >= 2
+                 ! compute local gerschgorin interval and use it as the initial
+                 ! interval for stdlib_qlaebz
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend
+                    gl =  min( gl, gers( 2*j - 1))
+                    gu = max( gu, gers(2*j) )
+                 end do
+                 ! [jan/28/2009]
+                 ! change spdiam by tnorm in lines 2 and 3 thereafter
+                 ! line 1: remove computation of spdiam (not useful anymore)
+                 ! spdiam = gu - gl
+                 ! gl = gl - fudge*spdiam*eps*in - fudge*pivmin
+                 ! gu = gu + fudge*spdiam*eps*in + fudge*pivmin
+                 gl = gl - fudge*tnorm*eps*in - fudge*pivmin
+                 gu = gu + fudge*tnorm*eps*in + fudge*pivmin
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       ! the local block contains none of the wanted eigenvalues
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    ! refine search interval if possible, only range (wl,wu] matters
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! find negcount of initial interval boundaries gl and gu
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_qlaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 ),&
+                            iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_qlaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( m+&
+                            1 ), iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 ! copy eigenvalues into w and iblock
+                 ! use -jblk for block number for unconverged eigenvalues.
+                 ! loop over the number of output intervals from stdlib_qlaebz
+                 do j = 1, iout
+                    ! eigenvalue approximation is middle point of interval
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! semi length of error interval
+                    tmp2 = half*abs( work( j+n )-work( j+in+n ) )
+                    if( j>iout-iinfo ) then
+                       ! flag non-convergence.
+                       ncnvrg = .true.
+                       ib = -jblk
+                    else
+                       ib = jblk
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       werr( je ) = tmp2
+                       indexw( je ) = je - iwoff
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==indrng ) then
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 ) then
+                 im = 0
+                 do je = 1, m
+                    ! remove some of the smallest eigenvalues from the left so that
+                    ! at the end idiscl =0. move all eigenvalues up to the left.
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscu>0 ) then
+                 ! remove some of the largest eigenvalues from the right so that
+                 ! at the end idiscu =0. move all eigenvalues up to the left.
+                 im=m+1
+                 do je = m, 1, -1
+                    if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im - 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 jee = 0
+                 do je = im, m
+                    jee = jee + 1
+                    w( jee ) = w( je )
+                    werr( jee ) = werr( je )
+                    indexw( jee ) = indexw( je )
+                    iblock( jee ) = iblock( je )
+                 end do
+                 m = m-im+1
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic. (if n(w) is
+                 ! monotone non-decreasing, this should never happen.)
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by marking the corresponding iblock = 0
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>=wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 ! now erase all eigenvalues with iblock set to zero
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           if(( irange==allrng .and. m/=n ).or.( irange==indrng .and. m/=iu-il+1 ) ) then
+              toofew = .true.
+           end if
+           ! if order='b', do nothing the eigenvalues are already sorted by
+              ! block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( stdlib_lsame(order,'E') .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    tmp2 = werr( ie )
+                    itmp1 = iblock( ie )
+                    itmp2 = indexw( ie )
+                    w( ie ) = w( je )
+                    werr( ie ) = werr( je )
+                    iblock( ie ) = iblock( je )
+                    indexw( ie ) = indexw( je )
+                    w( je ) = tmp1
+                    werr( je ) = tmp2
+                    iblock( je ) = itmp1
+                    indexw( je ) = itmp2
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_qlarrd
+
+     !> To find the desired eigenvalues of a given real symmetric
+     !> tridiagonal matrix T, DLARRE: sets any "small" off-diagonal
+     !> elements to zero, and for each unreduced block T_i, it finds
+     !> (a) a suitable shift at one end of the block's spectrum,
+     !> (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and
+     !> (c) eigenvalues of each L_i D_i L_i^T.
+     !> The representations and eigenvalues found are then used by
+     !> DSTEMR to compute the eigenvectors of T.
+     !> The accuracy varies depending on whether bisection is used to
+     !> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to
+     !> conpute all and then discard any unwanted one.
+     !> As an added benefit, DLARRE also outputs the n
+     !> Gerschgorin intervals for the matrices L_i D_i L_i^T.
+
+     pure subroutine stdlib_qlarre( range, n, vl, vu, il, iu, d, e, e2,rtol1, rtol2, spltol, &
+               nsplit, isplit, m,w, werr, wgap, iblock, indexw, gers, pivmin,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(qp), intent(out) :: pivmin
+           real(qp), intent(in) :: rtol1, rtol2, spltol
+           real(qp), intent(inout) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*), indexw(*)
+           real(qp), intent(inout) :: d(*), e(*), e2(*)
+           real(qp), intent(out) :: gers(*), w(*), werr(*), wgap(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: hndrd = 100.0_qp
+           real(qp), parameter :: pert = 8.0_qp
+           real(qp), parameter :: fourth = one/four
+           real(qp), parameter :: fac = half
+           real(qp), parameter :: maxgrowth = 64.0_qp
+           real(qp), parameter :: fudge = 2.0_qp
+           integer(ilp), parameter :: maxtry = 6
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: indrng = 2
+           integer(ilp), parameter :: valrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: forceb, norep, usedqd
+           integer(ilp) :: cnt, cnt1, cnt2, i, ibegin, idum, iend, iinfo, in, indl, indu, irange, &
+                     j, jblk, mb, mm, wbegin, wend
+           real(qp) :: avgap, bsrtol, clwdth, dmax, dpivot, eabs, emax, eold, eps, gl, gu, isleft,&
+                      isrght, rtl, rtol, s1, s2, safmin, sgndef, sigma, spdiam, tau, tmp, tmp1
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           end if
+           m = 0
+           ! get machine constants
+           safmin = stdlib_qlamch( 'S' )
+           eps = stdlib_qlamch( 'P' )
+           ! set parameters
+           rtl = sqrt(eps)
+           bsrtol = sqrt(eps)
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 wgap(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+                 gers(1) = d( 1 )
+                 gers(2) = d( 1 )
+              endif
+              ! store the shift for the initial rrr, which is zero in this case
+              e(1) = zero
+              return
+           end if
+           ! general case: tridiagonal matrix of order > 1
+           ! init werr, wgap. compute gerschgorin intervals and spectral diameter.
+           ! compute maximum off-diagonal entry and pivmin.
+           gl = d(1)
+           gu = d(1)
+           eold = zero
+           emax = zero
+           e(n) = zero
+           do i = 1,n
+              werr(i) = zero
+              wgap(i) = zero
+              eabs = abs( e(i) )
+              if( eabs >= emax ) then
+                 emax = eabs
+              end if
+              tmp1 = eabs + eold
+              gers( 2*i-1) = d(i) - tmp1
+              gl =  min( gl, gers( 2*i - 1))
+              gers( 2*i ) = d(i) + tmp1
+              gu = max( gu, gers(2*i) )
+              eold  = eabs
+           end do
+           ! the minimum pivot allowed in the sturm sequence for t
+           pivmin = safmin * max( one, emax**2 )
+           ! compute spectral diameter. the gerschgorin bounds give an
+           ! estimate that is wrong by at most a factor of sqrt(2)
+           spdiam = gu - gl
+           ! compute splitting points
+           call stdlib_qlarra( n, d, e, e2, spltol, spdiam,nsplit, isplit, iinfo )
+           ! can force use of bisection instead of faster dqds.
+           ! option left in the code for future multisection work.
+           forceb = .false.
+           ! initialize usedqd, dqds should be used for allrng unless someone
+           ! explicitly wants bisection.
+           usedqd = (( irange==allrng ) .and. (.not.forceb))
+           if( (irange==allrng) .and. (.not. forceb) ) then
+              ! set interval [vl,vu] that contains all eigenvalues
+              vl = gl
+              vu = gu
+           else
+              ! we call stdlib_qlarrd to find crude approximations to the eigenvalues
+              ! in the desired range. in case irange = indrng, we also obtain the
+              ! interval (vl,vu] that contains all the wanted eigenvalues.
+              ! an interval [left,right] has converged if
+              ! right-left<rtol*max(abs(left),abs(right))
+              ! stdlib_qlarrd needs a work of size 4*n, iwork of size 3*n
+              call stdlib_qlarrd( range, 'B', n, vl, vu, il, iu, gers,bsrtol, d, e, e2, pivmin, &
+                        nsplit, isplit,mm, w, werr, vl, vu, iblock, indexw,work, iwork, iinfo )
+              if( iinfo/=0 ) then
+                 info = -1
+                 return
+              endif
+              ! make sure that the entries m+1 to n in w, werr, iblock, indexw are 0
+              do i = mm+1,n
+                 w( i ) = zero
+                 werr( i ) = zero
+                 iblock( i ) = 0
+                 indexw( i ) = 0
+              end do
+           end if
+      ! **
+           ! loop over unreduced blocks
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, nsplit
+              iend = isplit( jblk )
+              in = iend - ibegin + 1
+              ! 1 x 1 block
+              if( in==1 ) then
+                 if( (irange==allrng).or.( (irange==valrng).and.( d( ibegin )>vl ).and.( d( &
+                           ibegin )<=vu ) ).or. ( (irange==indrng).and.(iblock(wbegin)==jblk))) then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    wgap(m) = zero
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                    wbegin = wbegin + 1
+                 endif
+                 ! e( iend ) holds the shift for the initial rrr
+                 e( iend ) = zero
+                 ibegin = iend + 1
+                 cycle loop_170
+              end if
+              ! blocks of size larger than 1x1
+              ! e( iend ) will hold the shift for the initial rrr, for now set it =0
+              e( iend ) = zero
+              ! find local outer bounds gl,gu for the block
+              gl = d(ibegin)
+              gu = d(ibegin)
+              do i = ibegin , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              if(.not. ((irange==allrng).and.(.not.forceb)) ) then
+                 ! count the number of eigenvalues in the current block.
+                 mb = 0
+                 do i = wbegin,mm
+                    if( iblock(i)==jblk ) then
+                       mb = mb+1
+                    else
+                       goto 21
+                    endif
+                 end do
+                 21 continue
+                 if( mb==0) then
+                    ! no eigenvalue in the current block lies in the desired range
+                    ! e( iend ) holds the shift for the initial rrr
+                    e( iend ) = zero
+                    ibegin = iend + 1
+                    cycle loop_170
+                 else
+                    ! decide whether dqds or bisection is more efficient
+                    usedqd = ( (mb > fac*in) .and. (.not.forceb) )
+                    wend = wbegin + mb - 1
+                    ! calculate gaps for the current block
+                    ! in later stages, when representations for individual
+                    ! eigenvalues are different, we use sigma = e( iend ).
+                    sigma = zero
+                    do i = wbegin, wend - 1
+                       wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                    end do
+                    wgap( wend ) = max( zero,vu - sigma - (w( wend )+werr( wend )))
+                    ! find local index of the first and last desired evalue.
+                    indl = indexw(wbegin)
+                    indu = indexw( wend )
+                 endif
+              endif
+              if(( (irange==allrng) .and. (.not. forceb) ).or.usedqd) then
+                 ! case of dqds
+                 ! find approximations to the extremal eigenvalues of the block
+                 call stdlib_qlarrk( in, 1, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1, &
+                           iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isleft = max(gl, tmp - tmp1- hndrd * eps* abs(tmp - tmp1))
+                 call stdlib_qlarrk( in, in, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1,&
+                            iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isrght = min(gu, tmp + tmp1+ hndrd * eps * abs(tmp + tmp1))
+                 ! improve the estimate of the spectral diameter
+                 spdiam = isrght - isleft
+              else
+                 ! case of bisection
+                 ! find approximations to the wanted extremal eigenvalues
+                 isleft = max(gl, w(wbegin) - werr(wbegin)- hndrd * eps*abs(w(wbegin)- werr(&
+                           wbegin) ))
+                 isrght = min(gu,w(wend) + werr(wend)+ hndrd * eps * abs(w(wend)+ werr(wend)))
+                           
+              endif
+              ! decide whether the base representation for the current block
+              ! l_jblk d_jblk l_jblk^t = t_jblk - sigma_jblk i
+              ! should be on the left or the right end of the current block.
+              ! the strategy is to shift to the end which is "more populated"
+              ! furthermore, decide whether to use dqds for the computation of
+              ! dqds is chosen if all eigenvalues are desired or the number of
+              ! eigenvalues to be computed is large compared to the blocksize.
+              if( ( irange==allrng ) .and. (.not.forceb) ) then
+                 ! if all the eigenvalues have to be computed, we use dqd
+                 usedqd = .true.
+                 ! indl is the local index of the first eigenvalue to compute
+                 indl = 1
+                 indu = in
+                 ! mb =  number of eigenvalues to compute
+                 mb = in
+                 wend = wbegin + mb - 1
+                 ! define 1/4 and 3/4 points of the spectrum
+                 s1 = isleft + fourth * spdiam
+                 s2 = isrght - fourth * spdiam
+              else
+                 ! stdlib_qlarrd has computed iblock and indexw for each eigenvalue
+                 ! approximation.
+                 ! choose sigma
+                 if( usedqd ) then
+                    s1 = isleft + fourth * spdiam
+                    s2 = isrght - fourth * spdiam
+                 else
+                    tmp = min(isrght,vu) -  max(isleft,vl)
+                    s1 =  max(isleft,vl) + fourth * tmp
+                    s2 =  min(isrght,vu) - fourth * tmp
+                 endif
+              endif
+              ! compute the negcount at the 1/4 and 3/4 points
+              if(mb>1) then
+                 call stdlib_qlarrc( 'T', in, s1, s2, d(ibegin),e(ibegin), pivmin, cnt, cnt1, &
+                           cnt2, iinfo)
+              endif
+              if(mb==1) then
+                 sigma = gl
+                 sgndef = one
+              elseif( cnt1 - indl >= indu - cnt2 ) then
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = max(isleft,gl)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get pos def matrix
+                    ! for dqds
+                    sigma = isleft
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = max(isleft,vl)
+                 endif
+                 sgndef = one
+              else
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = min(isrght,gu)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get neg def matrix
+                    ! for dqds
+                    sigma = isrght
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = min(isrght,vu)
+                 endif
+                 sgndef = -one
+              endif
+              ! an initial sigma has been chosen that will be used for computing
+              ! t - sigma i = l d l^t
+              ! define the increment tau of the shift in case the initial shift
+              ! needs to be refined to obtain a factorization with not too much
+              ! element growth.
+              if( usedqd ) then
+                 ! the initial sigma was to the outer end of the spectrum
+                 ! the matrix is definite and we need not retreat.
+                 tau = spdiam*eps*n + two*pivmin
+                 tau = max( tau,two*eps*abs(sigma) )
+              else
+                 if(mb>1) then
+                    clwdth = w(wend) + werr(wend) - w(wbegin) - werr(wbegin)
+                    avgap = abs(clwdth / real(wend-wbegin,KIND=qp))
+                    if( sgndef==one ) then
+                       tau = half*max(wgap(wbegin),avgap)
+                       tau = max(tau,werr(wbegin))
+                    else
+                       tau = half*max(wgap(wend-1),avgap)
+                       tau = max(tau,werr(wend))
+                    endif
+                 else
+                    tau = werr(wbegin)
+                 endif
+              endif
+              loop_80: do idum = 1, maxtry
+                 ! compute l d l^t factorization of tridiagonal matrix t - sigma i.
+                 ! store d in work(1:in), l in work(in+1:2*in), and reciprocals of
+                 ! pivots in work(2*in+1:3*in)
+                 dpivot = d( ibegin ) - sigma
+                 work( 1 ) = dpivot
+                 dmax = abs( work(1) )
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*in+i ) = one / work( i )
+                    tmp = e( j )*work( 2*in+i )
+                    work( in+i ) = tmp
+                    dpivot = ( d( j+1 )-sigma ) - tmp*e( j )
+                    work( i+1 ) = dpivot
+                    dmax = max( dmax, abs(dpivot) )
+                    j = j + 1
+                 end do
+                 ! check for element growth
+                 if( dmax > maxgrowth*spdiam ) then
+                    norep = .true.
+                 else
+                    norep = .false.
+                 endif
+                 if( usedqd .and. .not.norep ) then
+                    ! ensure the definiteness of the representation
+                    ! all entries of d (of l d l^t) must have the same sign
+                    do i = 1, in
+                       tmp = sgndef*work( i )
+                       if( tmp<zero ) norep = .true.
+                    end do
+                 endif
+                 if(norep) then
+                    ! note that in the case of irange=allrng, we use the gerschgorin
+                    ! shift which makes the matrix definite. so we should end up
+                    ! here really only in the case of irange = valrng or indrng.
+                    if( idum==maxtry-1 ) then
+                       if( sgndef==one ) then
+                          ! the fudged gerschgorin shift should succeed
+                          sigma =gl - fudge*spdiam*eps*n - fudge*two*pivmin
+                       else
+                          sigma =gu + fudge*spdiam*eps*n + fudge*two*pivmin
+                       end if
+                    else
+                       sigma = sigma - sgndef * tau
+                       tau = two * tau
+                    end if
+                 else
+                    ! an initial rrr is found
+                    go to 83
+                 end if
+              end do loop_80
+              ! if the program reaches this point, no base representation could be
+              ! found in maxtry iterations.
+              info = 2
+              return
+              83 continue
+              ! at this point, we have found an initial base representation
+              ! t - sigma i = l d l^t with not too much element growth.
+              ! store the shift.
+              e( iend ) = sigma
+              ! store d and l.
+              call stdlib_qcopy( in, work, 1, d( ibegin ), 1 )
+              call stdlib_qcopy( in-1, work( in+1 ), 1, e( ibegin ), 1 )
+              if(mb>1 ) then
+                 ! perturb each entry of the base representation by a small
+                 ! (but random) relative amount to overcome difficulties with
+                 ! glued matrices.
+                 do i = 1, 4
+                    iseed( i ) = 1
+                 end do
+                 call stdlib_qlarnv(2, iseed, 2*in-1, work(1))
+                 do i = 1,in-1
+                    d(ibegin+i-1) = d(ibegin+i-1)*(one+eps*pert*work(i))
+                    e(ibegin+i-1) = e(ibegin+i-1)*(one+eps*pert*work(in+i))
+                 end do
+                 d(iend) = d(iend)*(one+eps*four*work(in))
+              endif
+              ! don't update the gerschgorin intervals because keeping track
+              ! of the updates would be too much work in stdlib_qlarrv.
+              ! we update w instead and use it to locate the proper gerschgorin
+              ! intervals.
+              ! compute the required eigenvalues of l d l' by bisection or dqds
+              if ( .not.usedqd ) then
+                 ! if stdlib_qlarrd has been used, shift the eigenvalue approximations
+                 ! according to their representation. this is necessary for
+                 ! a uniform stdlib_qlarrv since dqds computes eigenvalues of the
+                 ! shifted representation. in stdlib_qlarrv, w will always hold the
+                 ! unshifted eigenvalue approximation.
+                 do j=wbegin,wend
+                    w(j) = w(j) - sigma
+                    werr(j) = werr(j) + abs(w(j)) * eps
+                 end do
+                 ! call stdlib_qlarrb to reduce eigenvalue error of the approximations
+                 ! from stdlib_qlarrd
+                 do i = ibegin, iend-1
+                    work( i ) = d( i ) * e( i )**2
+                 end do
+                 ! use bisection to find ev from indl to indu
+                 call stdlib_qlarrb(in, d(ibegin), work(ibegin),indl, indu, rtol1, rtol2, indl-1,&
+                 w(wbegin), wgap(wbegin), werr(wbegin),work( 2*n+1 ), iwork, pivmin, spdiam,in, &
+                           iinfo )
+                 if( iinfo /= 0 ) then
+                    info = -4
+                    return
+                 end if
+                 ! stdlib_qlarrb computes all gaps correctly except for the last one
+                 ! record distance to vu/gu
+                 wgap( wend ) = max( zero,( vu-sigma ) - ( w( wend ) + werr( wend ) ) )
+                 do i = indl, indu
+                    m = m + 1
+                    iblock(m) = jblk
+                    indexw(m) = i
+                 end do
+              else
+                 ! call dqds to get all eigs (and then possibly delete unwanted
+                 ! eigenvalues).
+                 ! note that dqds finds the eigenvalues of the l d l^t representation
+                 ! of t to high relative accuracy. high relative accuracy
+                 ! might be lost when the shift of the rrr is subtracted to obtain
+                 ! the eigenvalues of t. however, t is not guaranteed to define its
+                 ! eigenvalues to high relative accuracy anyway.
+                 ! set rtol to the order of the tolerance used in stdlib_qlasq2
+                 ! this is an estimated error, the worst case bound is 4*n*eps
+                 ! which is usually too large and requires unnecessary work to be
+                 ! done by bisection when computing the eigenvectors
+                 rtol = log(real(in,KIND=qp)) * four * eps
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*i-1 ) = abs( d( j ) )
+                    work( 2*i ) = e( j )*e( j )*work( 2*i-1 )
+                    j = j + 1
+                 end do
+                 work( 2*in-1 ) = abs( d( iend ) )
+                 work( 2*in ) = zero
+                 call stdlib_qlasq2( in, work, iinfo )
+                 if( iinfo /= 0 ) then
+                    ! if iinfo = -5 then an index is part of a tight cluster
+                    ! and should be changed. the index is in iwork(1) and the
+                    ! gap is in work(n+1)
+                    info = -5
+                    return
+                 else
+                    ! test that all eigenvalues are positive as expected
+                    do i = 1, in
+                       if( work( i )<zero ) then
+                          info = -6
+                          return
+                       endif
+                    end do
+                 end if
+                 if( sgndef>zero ) then
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = work( in-i+1 )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 else
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = -work( i )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 end if
+                 do i = m - mb + 1, m
+                    ! the value of rtol below should be the tolerance in stdlib_qlasq2
+                    werr( i ) = rtol * abs( w(i) )
+                 end do
+                 do i = m - mb + 1, m - 1
+                    ! compute the right gap between the intervals
+                    wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                 end do
+                 wgap( m ) = max( zero,( vu-sigma ) - ( w( m ) + werr( m ) ) )
+              end if
+              ! proceed with next block
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_qlarre
+
+     !> Given the initial representation L D L^T and its cluster of close
+     !> eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ...
+     !> W( CLEND ), DLARRF: finds a new relatively robust representation
+     !> L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the
+     !> eigenvalues of L(+) D(+) L(+)^T is relatively isolated.
+
+     pure subroutine stdlib_qlarrf( n, d, l, ld, clstrt, clend,w, wgap, werr,spdiam, clgapl, &
+               clgapr, pivmin, sigma,dplus, lplus, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: clstrt, clend, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: clgapl, clgapr, pivmin, spdiam
+           real(qp), intent(out) :: sigma
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), l(*), ld(*), w(*), werr(*)
+           real(qp), intent(out) :: dplus(*), lplus(*), work(*)
+           real(qp), intent(inout) :: wgap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: quart = 0.25_qp
+           real(qp), parameter :: maxgrowth1 = 8._qp
+           real(qp), parameter :: maxgrowth2 = 8._qp
+           integer(ilp), parameter :: ktrymax = 1
+           integer(ilp), parameter :: sleft = 1
+           integer(ilp), parameter :: sright = 2
+           
+           ! Local Scalars 
+           logical(lk) :: dorrr1, forcer, nofail, sawnan1, sawnan2, tryrrr1
+           integer(ilp) :: i, indx, ktry,    shift
+           real(qp) :: avgap, bestshift, clwdth, eps, fact, fail, fail2, growthbound, ldelta, &
+           ldmax, lsigma, max1, max2, mingap, oldp, prod, rdelta, rdmax, rrr1, rrr2, rsigma, s, &
+                     smlgrowth, tmp, znm2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           fact = real(2**ktrymax,KIND=qp)
+           eps = stdlib_qlamch( 'PRECISION' )
+           shift = 0
+           forcer = .false.
+           ! note that we cannot guarantee that for any of the shifts tried,
+           ! the factorization has a small or even moderate element growth.
+           ! there could be ritz values at both ends of the cluster and despite
+           ! backing off, there are examples where all factorizations tried
+           ! (in ieee mode, allowing zero pivots
+           ! element growth.
+           ! for this reason, we should use pivmin in this subroutine so that at
+           ! least the l d l^t factorization exists. it can be checked afterwards
+           ! whether the element growth caused bad residuals/orthogonality.
+           ! decide whether the code should accept the best among all
+           ! representations despite large element growth or signal info=1
+           ! setting nofail to .false. for quick fix for bug 113
+           nofail = .false.
+           ! compute the average gap length of the cluster
+           clwdth = abs(w(clend)-w(clstrt)) + werr(clend) + werr(clstrt)
+           avgap = clwdth / real(clend-clstrt,KIND=qp)
+           mingap = min(clgapl, clgapr)
+           ! initial values for shifts to both ends of cluster
+           lsigma = min(w( clstrt ),w( clend )) - werr( clstrt )
+           rsigma = max(w( clstrt ),w( clend )) + werr( clend )
+           ! use a small fudge to make sure that we really shift to the outside
+           lsigma = lsigma - abs(lsigma)* four * eps
+           rsigma = rsigma + abs(rsigma)* four * eps
+           ! compute upper bounds for how much to back off the initial shifts
+           ldmax = quart * mingap + two * pivmin
+           rdmax = quart * mingap + two * pivmin
+           ldelta = max(avgap,wgap( clstrt ))/fact
+           rdelta = max(avgap,wgap( clend-1 ))/fact
+           ! initialize the record of the best representation found
+           s = stdlib_qlamch( 'S' )
+           smlgrowth = one / s
+           fail = real(n-1,KIND=qp)*mingap/(spdiam*eps)
+           fail2 = real(n-1,KIND=qp)*mingap/(spdiam*sqrt(eps))
+           bestshift = lsigma
+           ! while (ktry <= ktrymax)
+           ktry = 0
+           growthbound = maxgrowth1*spdiam
+           5 continue
+           sawnan1 = .false.
+           sawnan2 = .false.
+           ! ensure that we do not back off too much of the initial shifts
+           ldelta = min(ldmax,ldelta)
+           rdelta = min(rdmax,rdelta)
+           ! compute the element growth when shifting to both ends of the cluster
+           ! accept the shift if there is no element growth at one of the two ends
+           ! left end
+           s = -lsigma
+           dplus( 1 ) = d( 1 ) + s
+           if(abs(dplus(1))<pivmin) then
+              dplus(1) = -pivmin
+              ! need to set sawnan1 because refined rrr test should not be used
+              ! in this case
+              sawnan1 = .true.
+           endif
+           max1 = abs( dplus( 1 ) )
+           do i = 1, n - 1
+              lplus( i ) = ld( i ) / dplus( i )
+              s = s*lplus( i )*l( i ) - lsigma
+              dplus( i+1 ) = d( i+1 ) + s
+              if(abs(dplus(i+1))<pivmin) then
+                 dplus(i+1) = -pivmin
+                 ! need to set sawnan1 because refined rrr test should not be used
+                 ! in this case
+                 sawnan1 = .true.
+              endif
+              max1 = max( max1,abs(dplus(i+1)) )
+           end do
+           sawnan1 = sawnan1 .or.  stdlib_qisnan( max1 )
+           if( forcer .or.(max1<=growthbound .and. .not.sawnan1 ) ) then
+              sigma = lsigma
+              shift = sleft
+              goto 100
+           endif
+           ! right end
+           s = -rsigma
+           work( 1 ) = d( 1 ) + s
+           if(abs(work(1))<pivmin) then
+              work(1) = -pivmin
+              ! need to set sawnan2 because refined rrr test should not be used
+              ! in this case
+              sawnan2 = .true.
+           endif
+           max2 = abs( work( 1 ) )
+           do i = 1, n - 1
+              work( n+i ) = ld( i ) / work( i )
+              s = s*work( n+i )*l( i ) - rsigma
+              work( i+1 ) = d( i+1 ) + s
+              if(abs(work(i+1))<pivmin) then
+                 work(i+1) = -pivmin
+                 ! need to set sawnan2 because refined rrr test should not be used
+                 ! in this case
+                 sawnan2 = .true.
+              endif
+              max2 = max( max2,abs(work(i+1)) )
+           end do
+           sawnan2 = sawnan2 .or.  stdlib_qisnan( max2 )
+           if( forcer .or.(max2<=growthbound .and. .not.sawnan2 ) ) then
+              sigma = rsigma
+              shift = sright
+              goto 100
+           endif
+           ! if we are at this point, both shifts led to too much element growth
+           ! record the better of the two shifts (provided it didn't lead to nan)
+           if(sawnan1.and.sawnan2) then
+              ! both max1 and max2 are nan
+              goto 50
+           else
+              if( .not.sawnan1 ) then
+                 indx = 1
+                 if(max1<=smlgrowth) then
+                    smlgrowth = max1
+                    bestshift = lsigma
+                 endif
+              endif
+              if( .not.sawnan2 ) then
+                 if(sawnan1 .or. max2<=max1) indx = 2
+                 if(max2<=smlgrowth) then
+                    smlgrowth = max2
+                    bestshift = rsigma
+                 endif
+              endif
+           endif
+           ! if we are here, both the left and the right shift led to
+           ! element growth. if the element growth is moderate, then
+           ! we may still accept the representation, if it passes a
+           ! refined test for rrr. this test supposes that no nan occurred.
+           ! moreover, we use the refined rrr test only for isolated clusters.
+           if((clwdth<mingap/real(128,KIND=qp)) .and.(min(max1,max2)<fail2).and.(.not.sawnan1)&
+                     .and.(.not.sawnan2)) then
+              dorrr1 = .true.
+           else
+              dorrr1 = .false.
+           endif
+           tryrrr1 = .true.
+           if( tryrrr1 .and. dorrr1 ) then
+           if(indx==1) then
+              tmp = abs( dplus( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod =((dplus(i+1)*work(n+i+1))/(dplus(i)*work(n+i)))*oldp
+                 else
+                    prod = prod*abs(work(n+i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( dplus( i ) * prod ))
+              end do
+              rrr1 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr1<=maxgrowth2) then
+                 sigma = lsigma
+                 shift = sleft
+                 goto 100
+              endif
+           else if(indx==2) then
+              tmp = abs( work( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod = ((work(i+1)*lplus(i+1))/(work(i)*lplus(i)))*oldp
+                 else
+                    prod = prod*abs(lplus(i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( work( i ) * prod ))
+              end do
+              rrr2 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr2<=maxgrowth2) then
+                 sigma = rsigma
+                 shift = sright
+                 goto 100
+              endif
+           end if
+           endif
+           50 continue
+           if (ktry<ktrymax) then
+              ! if we are here, both shifts failed also the rrr test.
+              ! back off to the outside
+              lsigma = max( lsigma - ldelta,lsigma - ldmax)
+              rsigma = min( rsigma + rdelta,rsigma + rdmax )
+              ldelta = two * ldelta
+              rdelta = two * rdelta
+              ktry = ktry + 1
+              goto 5
+           else
+              ! none of the representations investigated satisfied our
+              ! criteria. take the best one we found.
+              if((smlgrowth<fail).or.nofail) then
+                 lsigma = bestshift
+                 rsigma = bestshift
+                 forcer = .true.
+                 goto 5
+              else
+                 info = 1
+                 return
+              endif
+           end if
+           100 continue
+           if (shift==sleft) then
+           elseif (shift==sright) then
+              ! store new l and d back into dplus, lplus
+              call stdlib_qcopy( n, work, 1, dplus, 1 )
+              call stdlib_qcopy( n-1, work(n+1), 1, lplus, 1 )
+           endif
+           return
+     end subroutine stdlib_qlarrf
+
+     !> Given the initial eigenvalue approximations of T, DLARRJ:
+     !> does  bisection to refine the eigenvalues of T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses in WERR. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_qlarrj( n, d, e2, ifirst, ilast,rtol, offset, w, werr, work, iwork,&
+               pivmin, spdiam, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: pivmin, rtol, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: d(*), e2(*)
+           real(qp), intent(inout) :: w(*), werr(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: cnt, i, i1, i2, ii, iter, j, k, next, nint, olnint, p, prev, &
+                     savi1
+           real(qp) :: dplus, fac, left, mid, right, s, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           i2 = ilast
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           loop_75: do i = i1, i2
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              mid = w(ii)
+              right = w( ii ) + werr( ii )
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              ! the following test prevents the test of converged intervals
+              if( width<rtol*tmp ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<i2)) i1 = i + 1
+                 if((prev>=i1).and.(i<=i2)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 ! make sure that [left,right] contains the desired eigenvalue
+                 ! do while( cnt(left)>i-1 )
+                 fac = one
+                 20 continue
+                 cnt = 0
+                 s = left
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt>i-1 ) then
+                    left = left - werr( ii )*fac
+                    fac = two*fac
+                    go to 20
+                 end if
+                 ! do while( cnt(right)<i )
+                 fac = one
+                 50 continue
+                 cnt = 0
+                 s = right
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt<i ) then
+                    right = right + werr( ii )*fac
+                    fac = two*fac
+                    go to 50
+                 end if
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = cnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           savi1 = i1
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do p = 1, olnint
+              k = 2*i
+              ii = i - offset
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              if( ( width<rtol*tmp ) .or.(iter==maxitr) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              cnt = 0
+              s = mid
+              dplus = d( 1 ) - s
+              if( dplus<zero ) cnt = cnt + 1
+              do j = 2, n
+                 dplus = d( j ) - s - e2( j-1 )/dplus
+                 if( dplus<zero ) cnt = cnt + 1
+              end do
+              if( cnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = savi1, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           return
+     end subroutine stdlib_qlarrj
+
+     !> DLARRK: computes one eigenvalue of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from DSTEMR.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_qlarrk( n, iw, gl, gu,d, e2, pivmin, reltol, w, werr, info)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: iw, n
+           real(qp), intent(in) :: pivmin, reltol, gl, gu
+           real(qp), intent(out) :: w, werr
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), e2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: fudge = two
+           
+           ! Local Scalars 
+           integer(ilp) :: i, it, itmax, negcnt
+           real(qp) :: atoli, eps, left, mid, right, rtoli, tmp1, tmp2, tnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           tnorm = max( abs( gl ), abs( gu ) )
+           rtoli = reltol
+           atoli = fudge*two*pivmin
+           itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           info = -1
+           left = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           right = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           it = 0
+           10 continue
+           ! check if interval converged or maximum number of iterations reached
+           tmp1 = abs( right - left )
+           tmp2 = max( abs(right), abs(left) )
+           if( tmp1<max( atoli, pivmin, rtoli*tmp2 ) ) then
+              info = 0
+              goto 30
+           endif
+           if(it>itmax)goto 30
+           ! count number of negative pivots for mid-point
+           it = it + 1
+           mid = half * (left + right)
+           negcnt = 0
+           tmp1 = d( 1 ) - mid
+           if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+           if( tmp1<=zero )negcnt = negcnt + 1
+           do i = 2, n
+              tmp1 = d( i ) - e2( i-1 ) / tmp1 - mid
+              if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+              if( tmp1<=zero )negcnt = negcnt + 1
+           end do
+           if(negcnt>=iw) then
+              right = mid
+           else
+              left = mid
+           endif
+           goto 10
+           30 continue
+           ! converged or maximum number of iterations reached
+           w = half * (left + right)
+           werr = half * abs( right - left )
+           return
+     end subroutine stdlib_qlarrk
+
+     !> Perform tests to decide whether the symmetric tridiagonal matrix T
+     !> warrants expensive computations which guarantee high relative accuracy
+     !> in the eigenvalues.
+
+     pure subroutine stdlib_qlarrr( n, d, e, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           real(qp), intent(inout) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: relcond = 0.999_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: yesrel
+           real(qp) :: eps, safmin, smlnum, rmin, tmp, tmp2, offdig, offdig2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! as a default, do not go for relative-accuracy preserving computations.
+           info = 1
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           rmin = sqrt( smlnum )
+           ! tests for relative accuracy
+           ! test for scaled diagonal dominance
+           ! scale the diagonal entries to one and check whether the sum of the
+           ! off-diagonals is less than one
+           ! the sdd relative error bounds have a 1/(1- 2*x) factor in them,
+           ! x = max(offdig + offdig2), so when x is close to 1/2, no relative
+           ! accuracy is promised.  in the notation of the code fragment below,
+           ! 1/(1 - (offdig + offdig2)) is the condition number.
+           ! we don't think it is worth going into "sdd mode" unless the relative
+           ! condition number is reasonable, not 1/macheps.
+           ! the threshold should be compatible with other thresholds used in the
+           ! code. we set  offdig + offdig2 <= .999_qp =: relcond, it corresponds
+           ! to losing at most 3 decimal digits: 1 / (1 - (offdig + offdig2)) <= 1000
+           ! instead of the current offdig + offdig2 < 1
+           yesrel = .true.
+           offdig = zero
+           tmp = sqrt(abs(d(1)))
+           if (tmp<rmin) yesrel = .false.
+           if(.not.yesrel) goto 11
+           do i = 2, n
+              tmp2 = sqrt(abs(d(i)))
+              if (tmp2<rmin) yesrel = .false.
+              if(.not.yesrel) goto 11
+              offdig2 = abs(e(i-1))/(tmp*tmp2)
+              if(offdig+offdig2>=relcond) yesrel = .false.
+              if(.not.yesrel) goto 11
+              tmp = tmp2
+              offdig = offdig2
+           end do
+           11 continue
+           if( yesrel ) then
+              info = 0
+              return
+           else
+           endif
+           ! *** more to be implemented ***
+           ! test if the lower bidiagonal matrix l from t = l d l^t
+           ! (zero shift facto) is well conditioned
+           ! test if the upper bidiagonal matrix u from t = u d u^t
+           ! (zero shift facto) is well conditioned.
+           ! in this case, the matrix needs to be flipped and, at the end
+           ! of the eigenvector computation, the flip needs to be applied
+           ! to the computed eigenvectors (and the support)
+           return
+     end subroutine stdlib_qlarrr
+
+     !> DLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by DLARRE.
+
+     pure subroutine stdlib_qlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: minrgp, pivmin, vl, vu
+           real(qp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(qp), intent(in) :: gers(*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           real(qp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indwrk= 3*n+1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_qlaset( 'FULL', n, zusedw, zero, zero,z(1,zusedl), ldz )
+           eps = stdlib_qlamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = one
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_qcopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       call stdlib_qcopy( in, z( ibegin, j ), 1, d( ibegin ), 1 )
+                       call stdlib_qcopy( in-1, z( ibegin, j+1 ), 1, l( ibegin ),1 )
+                       sigma = z( iend, j+1 )
+                       ! set the corresponding entries in z to zero
+                       call stdlib_qlaset( 'FULL', in, 2, zero, zero,z( ibegin, j), ldz )
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_qlarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_qlarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_qlarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_qlarrf needs lwork = 2*n
+                          call stdlib_qlarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,z(ibegin, newftt),z(ibegin, newftt+1),work( indwrk ), &
+                                    iinfo )
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_qlarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = ssigma
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=qp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_qlarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_qlar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_qlar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_qscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_qlarrv
+
+     !> !
+     !>
+     !> DLARTG: generates a plane rotation so that
+     !> [  C  S  ]  .  [ F ]  =  [ R ]
+     !> [ -S  C  ]     [ G ]     [ 0 ]
+     !> where C**2 + S**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> R = sign(F) * sqrt(F**2 + G**2)
+     !> C = F / R
+     !> S = G / R
+     !> Hence C >= 0. The algorithm used to compute these quantities
+     !> incorporates scaling to avoid overflow or underflow in computing the
+     !> square root of the sum of squares.
+     !> This version is discontinuous in R at F = 0 but it returns the same
+     !> C and S as ZLARTG for complex inputs (F,0) and (G,0).
+     !> This is a more accurate version of the BLAS1 routine DROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0 and (G .ne. 0), then C=0 and S=sign(1,G) without doing any
+     !> floating point operations (saves work in DBDSQR when
+     !> there are zeros on the diagonal).
+     !> If F exceeds G in magnitude, C will be positive.
+     !> Below, wp=>dp stands for quad precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_qlartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(qp), intent(out) :: c, r, s
+        real(qp), intent(in) :: f, g
+        ! Local Scalars 
+        real(qp) :: d, f1, fs, g1, gs, p, u, uu
+        ! Intrinsic Functions 
+        intrinsic :: abs,sign,sqrt
+        ! Executable Statements 
+        f1 = abs( f )
+        g1 = abs( g )
+        if( g == zero ) then
+           c = one
+           s = zero
+           r = f
+        else if( f == zero ) then
+           c = zero
+           s = sign( one, g )
+           r = g1
+     else if( f1 > rtmin .and. f1 < rtmax .and.            g1 > rtmin .and. g1 < rtmax ) &
+               then
+           d = sqrt( f*f + g*g )
+           p = one / d
+           c = f1*p
+           s = g*sign( p, f )
+           r = sign( d, f )
+        else
+           u = min( safmax, max( safmin, f1, g1 ) )
+           uu = one / u
+           fs = f*uu
+           gs = g*uu
+           d = sqrt( fs*fs + gs*gs )
+           p = one / d
+           c = abs( fs )*p
+           s = gs*sign( p, f )
+           r = sign( d, f )*u
+        end if
+        return
+     end subroutine stdlib_qlartg
+
+     !> DLARTGP: generates a plane rotation so that
+     !> [  CS  SN  ]  .  [ F ]  =  [ R ]   where CS**2 + SN**2 = 1.
+     !> [ -SN  CS  ]     [ G ]     [ 0 ]
+     !> This is a slower, more accurate version of the Level 1 BLAS routine DROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then CS=(+/-)1 and SN=0.
+     !> If F=0 and (G .ne. 0), then CS=0 and SN=(+/-)1.
+     !> The sign is chosen so that R >= 0.
+
+     pure subroutine stdlib_qlartgp( f, g, cs, sn, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(out) :: cs, r, sn
+           real(qp), intent(in) :: f, g
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i
+           real(qp) :: eps, f1, g1, safmin, safmn2, safmx2, scale
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,sign,sqrt
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Executable Statements 
+           ! if( first ) then
+              safmin = stdlib_qlamch( 'S' )
+              eps = stdlib_qlamch( 'E' )
+              safmn2 = stdlib_qlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_qlamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+              ! first = .false.
+           ! end if
+           if( g==zero ) then
+              cs = sign( one, f )
+              sn = zero
+              r = abs( f )
+           else if( f==zero ) then
+              cs = zero
+              sn = sign( one, g )
+              r = abs( g )
+           else
+              f1 = f
+              g1 = g
+              scale = max( abs( f1 ), abs( g1 ) )
+              if( scale>=safmx2 ) then
+                 count = 0
+                 10 continue
+                 count = count + 1
+                 f1 = f1*safmn2
+                 g1 = g1*safmn2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale>=safmx2 .and. count < 20 )go to 10
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmx2
+                 end do
+              else if( scale<=safmn2 ) then
+                 count = 0
+                 30 continue
+                 count = count + 1
+                 f1 = f1*safmx2
+                 g1 = g1*safmx2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale<=safmn2 )go to 30
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmn2
+                 end do
+              else
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+              end if
+              if( r<zero ) then
+                 cs = -cs
+                 sn = -sn
+                 r = -r
+              end if
+           end if
+           return
+     end subroutine stdlib_qlartgp
+
+     !> DLARTGS: generates a plane rotation designed to introduce a bulge in
+     !> Golub-Reinsch-style implicit QR iteration for the bidiagonal SVD
+     !> problem. X and Y are the top-row entries, and SIGMA is the shift.
+     !> The computed CS and SN define a plane rotation satisfying
+     !> [  CS  SN  ]  .  [ X^2 - SIGMA ]  =  [ R ],
+     !> [ -SN  CS  ]     [    X * Y    ]     [ 0 ]
+     !> with R nonnegative.  If X^2 - SIGMA and X * Y are 0, then the
+     !> rotation is by PI/2.
+
+     pure subroutine stdlib_qlartgs( x, y, sigma, cs, sn )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(out) :: cs, sn
+           real(qp), intent(in) :: sigma, x, y
+        ! ===================================================================
+           
+           ! Local Scalars 
+           real(qp) :: r, s, thresh, w, z
+           thresh = stdlib_qlamch('E')
+           ! compute the first column of b**t*b - sigma^2*i, up to a scale
+           ! factor.
+           if( (sigma == zero .and. abs(x) < thresh) .or.(abs(x) == sigma .and. y == zero) ) &
+                     then
+              z = zero
+              w = zero
+           else if( sigma == zero ) then
+              if( x >= zero ) then
+                 z = x
+                 w = y
+              else
+                 z = -x
+                 w = -y
+              end if
+           else if( abs(x) < thresh ) then
+              z = -sigma*sigma
+              w = zero
+           else
+              if( x >= zero ) then
+                 s = one
+              else
+                 s = negone
+              end if
+              z = s * (abs(x)-sigma) * (s+sigma/x)
+              w = s * y
+           end if
+           ! generate the rotation.
+           ! call stdlib_qlartgp( z, w, cs, sn, r ) might seem more natural;
+           ! reordering the arguments ensures that if z = 0 then the rotation
+           ! is by pi/2.
+           call stdlib_qlartgp( w, z, sn, cs, r )
+           return
+           ! end stdlib_qlartgs
+     end subroutine stdlib_qlartgs
+
+     !> DLARTV: applies a vector of real plane rotations to elements of the
+     !> real vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (  c(i)  s(i) ) ( x(i) )
+     !> ( y(i) )    ( -s(i)  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_qlartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*), s(*)
+           real(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(qp) :: xi, yi
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - s( ic )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_qlartv
+
+     !> DLARUV: returns a vector of n random real numbers from a uniform (0,1)
+     !> distribution (n <= 128).
+     !> This is an auxiliary routine called by DLARNV and ZLARNV.
+
+     pure subroutine stdlib_qlaruv( iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(qp), intent(out) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           integer(ilp), parameter :: ipw2 = 4096
+           real(qp), parameter :: r = one/ipw2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, i3, i4, it1, it2, it3, it4, j
+           ! Local Arrays 
+           integer(ilp) :: mm(lv,4)
+           ! Intrinsic Functions 
+           intrinsic :: real,min,mod
+           ! Data Statements 
+           mm(1,1:4)=[494,322,2508,2549]
+           mm(2,1:4)=[2637,789,3754,1145]
+           mm(3,1:4)=[255,1440,1766,2253]
+           mm(4,1:4)=[2008,752,3572,305]
+           mm(5,1:4)=[1253,2859,2893,3301]
+           mm(6,1:4)=[3344,123,307,1065]
+           mm(7,1:4)=[4084,1848,1297,3133]
+           mm(8,1:4)=[1739,643,3966,2913]
+           mm(9,1:4)=[3143,2405,758,3285]
+           mm(10,1:4)=[3468,2638,2598,1241]
+           mm(11,1:4)=[688,2344,3406,1197]
+           mm(12,1:4)=[1657,46,2922,3729]
+           mm(13,1:4)=[1238,3814,1038,2501]
+           mm(14,1:4)=[3166,913,2934,1673]
+           mm(15,1:4)=[1292,3649,2091,541]
+           mm(16,1:4)=[3422,339,2451,2753]
+           mm(17,1:4)=[1270,3808,1580,949]
+           mm(18,1:4)=[2016,822,1958,2361]
+           mm(19,1:4)=[154,2832,2055,1165]
+           mm(20,1:4)=[2862,3078,1507,4081]
+           mm(21,1:4)=[697,3633,1078,2725]
+           mm(22,1:4)=[1706,2970,3273,3305]
+           mm(23,1:4)=[491,637,17,3069]
+           mm(24,1:4)=[931,2249,854,3617]
+           mm(25,1:4)=[1444,2081,2916,3733]
+           mm(26,1:4)=[444,4019,3971,409]
+           mm(27,1:4)=[3577,1478,2889,2157]
+           mm(28,1:4)=[3944,242,3831,1361]
+           mm(29,1:4)=[2184,481,2621,3973]
+           mm(30,1:4)=[1661,2075,1541,1865]
+           mm(31,1:4)=[3482,4058,893,2525]
+           mm(32,1:4)=[657,622,736,1409]
+           mm(33,1:4)=[3023,3376,3992,3445]
+           mm(34,1:4)=[3618,812,787,3577]
+           mm(35,1:4)=[1267,234,2125,77]
+           mm(36,1:4)=[1828,641,2364,3761]
+           mm(37,1:4)=[164,4005,2460,2149]
+           mm(38,1:4)=[3798,1122,257,1449]
+           mm(39,1:4)=[3087,3135,1574,3005]
+           mm(40,1:4)=[2400,2640,3912,225]
+           mm(41,1:4)=[2870,2302,1216,85]
+           mm(42,1:4)=[3876,40,3248,3673]
+           mm(43,1:4)=[1905,1832,3401,3117]
+           mm(44,1:4)=[1593,2247,2124,3089]
+           mm(45,1:4)=[1797,2034,2762,1349]
+           mm(46,1:4)=[1234,2637,149,2057]
+           mm(47,1:4)=[3460,1287,2245,413]
+           mm(48,1:4)=[328,1691,166,65]
+           mm(49,1:4)=[2861,496,466,1845]
+           mm(50,1:4)=[1950,1597,4018,697]
+           mm(51,1:4)=[617,2394,1399,3085]
+           mm(52,1:4)=[2070,2584,190,3441]
+           mm(53,1:4)=[3331,1843,2879,1573]
+           mm(54,1:4)=[769,336,153,3689]
+           mm(55,1:4)=[1558,1472,2320,2941]
+           mm(56,1:4)=[2412,2407,18,929]
+           mm(57,1:4)=[2800,433,712,533]
+           mm(58,1:4)=[189,2096,2159,2841]
+           mm(59,1:4)=[287,1761,2318,4077]
+           mm(60,1:4)=[2045,2810,2091,721]
+           mm(61,1:4)=[1227,566,3443,2821]
+           mm(62,1:4)=[2838,442,1510,2249]
+           mm(63,1:4)=[209,41,449,2397]
+           mm(64,1:4)=[2770,1238,1956,2817]
+           mm(65,1:4)=[3654,1086,2201,245]
+           mm(66,1:4)=[3993,603,3137,1913]
+           mm(67,1:4)=[192,840,3399,1997]
+           mm(68,1:4)=[2253,3168,1321,3121]
+           mm(69,1:4)=[3491,1499,2271,997]
+           mm(70,1:4)=[2889,1084,3667,1833]
+           mm(71,1:4)=[2857,3438,2703,2877]
+           mm(72,1:4)=[2094,2408,629,1633]
+           mm(73,1:4)=[1818,1589,2365,981]
+           mm(74,1:4)=[688,2391,2431,2009]
+           mm(75,1:4)=[1407,288,1113,941]
+           mm(76,1:4)=[634,26,3922,2449]
+           mm(77,1:4)=[3231,512,2554,197]
+           mm(78,1:4)=[815,1456,184,2441]
+           mm(79,1:4)=[3524,171,2099,285]
+           mm(80,1:4)=[1914,1677,3228,1473]
+           mm(81,1:4)=[516,2657,4012,2741]
+           mm(82,1:4)=[164,2270,1921,3129]
+           mm(83,1:4)=[303,2587,3452,909]
+           mm(84,1:4)=[2144,2961,3901,2801]
+           mm(85,1:4)=[3480,1970,572,421]
+           mm(86,1:4)=[119,1817,3309,4073]
+           mm(87,1:4)=[3357,676,3171,2813]
+           mm(88,1:4)=[837,1410,817,2337]
+           mm(89,1:4)=[2826,3723,3039,1429]
+           mm(90,1:4)=[2332,2803,1696,1177]
+           mm(91,1:4)=[2089,3185,1256,1901]
+           mm(92,1:4)=[3780,184,3715,81]
+           mm(93,1:4)=[1700,663,2077,1669]
+           mm(94,1:4)=[3712,499,3019,2633]
+           mm(95,1:4)=[150,3784,1497,2269]
+           mm(96,1:4)=[2000,1631,1101,129]
+           mm(97,1:4)=[3375,1925,717,1141]
+           mm(98,1:4)=[1621,3912,51,249]
+           mm(99,1:4)=[3090,1398,981,3917]
+           mm(100,1:4)=[3765,1349,1978,2481]
+           mm(101,1:4)=[1149,1441,1813,3941]
+           mm(102,1:4)=[3146,2224,3881,2217]
+           mm(103,1:4)=[33,2411,76,2749]
+           mm(104,1:4)=[3082,1907,3846,3041]
+           mm(105,1:4)=[2741,3192,3694,1877]
+           mm(106,1:4)=[359,2786,1682,345]
+           mm(107,1:4)=[3316,382,124,2861]
+           mm(108,1:4)=[1749,37,1660,1809]
+           mm(109,1:4)=[185,759,3997,3141]
+           mm(110,1:4)=[2784,2948,479,2825]
+           mm(111,1:4)=[2202,1862,1141,157]
+           mm(112,1:4)=[2199,3802,886,2881]
+           mm(113,1:4)=[1364,2423,3514,3637]
+           mm(114,1:4)=[1244,2051,1301,1465]
+           mm(115,1:4)=[2020,2295,3604,2829]
+           mm(116,1:4)=[3160,1332,1888,2161]
+           mm(117,1:4)=[2785,1832,1836,3365]
+           mm(118,1:4)=[2772,2405,1990,361]
+           mm(119,1:4)=[1217,3638,2058,2685]
+           mm(120,1:4)=[1822,3661,692,3745]
+           mm(121,1:4)=[1245,327,1194,2325]
+           mm(122,1:4)=[2252,3660,20,3609]
+           mm(123,1:4)=[3904,716,3285,3821]
+           mm(124,1:4)=[2774,1842,2046,3537]
+           mm(125,1:4)=[997,3987,2107,517]
+           mm(126,1:4)=[2573,1368,3508,3017]
+           mm(127,1:4)=[1148,1848,3525,2141]
+           mm(128,1:4)=[545,2366,3801,1537]
+           ! Executable Statements 
+           i1 = iseed( 1 )
+           i2 = iseed( 2 )
+           i3 = iseed( 3 )
+           i4 = iseed( 4 )
+           loop_10: do i = 1, min( n, lv )
+           20 continue
+              ! multiply the seed by i-th power of the multiplier modulo 2**48
+              it4 = i4*mm( i, 4 )
+              it3 = it4 / ipw2
+              it4 = it4 - ipw2*it3
+              it3 = it3 + i3*mm( i, 4 ) + i4*mm( i, 3 )
+              it2 = it3 / ipw2
+              it3 = it3 - ipw2*it2
+              it2 = it2 + i2*mm( i, 4 ) + i3*mm( i, 3 ) + i4*mm( i, 2 )
+              it1 = it2 / ipw2
+              it2 = it2 - ipw2*it1
+              it1 = it1 + i1*mm( i, 4 ) + i2*mm( i, 3 ) + i3*mm( i, 2 ) +i4*mm( i, 1 )
+              it1 = mod( it1, ipw2 )
+              ! convert 48-bit integer to a realnumber in the interval (0,1,KIND=qp)
+              x( i ) = r*( real( it1,KIND=qp)+r*( real( it2,KIND=qp)+r*( real( it3,KIND=qp)+&
+                        r*real( it4,KIND=qp) ) ) )
+              if (x( i )==1.0_qp) then
+                 ! if a real number has n bits of precision, and the first
+                 ! n bits of the 48-bit integer above happen to be all 1 (which
+                 ! will occur about once every 2**n calls), then x( i ) will
+                 ! be rounded to exactly one.
+                 ! since x( i ) is not supposed to return exactly 0.0_qp or 1.0_qp,
+                 ! the statistically correct thing to do in this situation is
+                 ! simply to iterate again.
+                 ! n.b. the case x( i ) = 0.0_qp should not be possible.
+                 i1 = i1 + 2
+                 i2 = i2 + 2
+                 i3 = i3 + 2
+                 i4 = i4 + 2
+                 goto 20
+              end if
+           end do loop_10
+           ! return final value of seed
+           iseed( 1 ) = it1
+           iseed( 2 ) = it2
+           iseed( 3 ) = it3
+           iseed( 4 ) = it4
+           return
+     end subroutine stdlib_qlaruv
+
+     !> DLARZ: applies a real elementary reflector H to a real M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> H is a product of k elementary reflectors as returned by DTZRZF.
+
+     pure subroutine stdlib_qlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           real(qp), intent(in) :: tau
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(in) :: v(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=zero ) then
+                 ! w( 1:n ) = c( 1, 1:n )
+                 call stdlib_qcopy( n, c, ldc, work, 1 )
+                 ! w( 1:n ) = w( 1:n ) + c( m-l+1:m, 1:n )**t * v( 1:l )
+                 call stdlib_qgemv( 'TRANSPOSE', l, n, one, c( m-l+1, 1 ), ldc, v,incv, one, work,&
+                            1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_qaxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**t
+                 call stdlib_qger( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=zero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_qcopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_qgemv( 'NO TRANSPOSE', m, l, one, c( 1, n-l+1 ), ldc,v, incv, one, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_qaxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**t
+                 call stdlib_qger( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_qlarz
+
+     !> DLARZB: applies a real block reflector H or its transpose H**T to
+     !> a real distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_qlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           real(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**t * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**t
+              do j = 1, k
+                 call stdlib_qcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**t * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, l, one,c( m-l+1, 1 ), &
+                        ldc, v, ldv, one, work, ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_qtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**t
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**t * w( 1:n, 1:k )**t
+              if( l>0 )call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -one, v, ldv,work, &
+                        ldwork, one, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**t
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_qcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, one,c( 1, n-l+1 ),&
+                         ldc, v, ldv, one, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * t  or  w( 1:m, 1:k ) * t**t
+              call stdlib_qtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * v( 1:k, 1:l )
+              if( l>0 )call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -one,work, &
+                        ldwork, v, ldv, one, c( 1, n-l+1 ), ldc )
+           end if
+           return
+     end subroutine stdlib_qlarzb
+
+     !> DLARZT: forms the triangular factor T of a real block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_qlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(qp), intent(out) :: t(ldt,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==zero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = zero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**t
+                    call stdlib_qgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, zero,t( i+1, i ), 1 )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_qtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_qlarzt
+
+     !> DLAS2:  computes the singular values of the 2-by-2 matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, SSMIN is the smaller singular value and SSMAX is the
+     !> larger singular value.
+
+     pure subroutine stdlib_qlas2( f, g, h, ssmin, ssmax )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: f, g, h
+           real(qp), intent(out) :: ssmax, ssmin
+        ! ====================================================================
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: as, at, au, c, fa, fhmn, fhmx, ga, ha
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           fa = abs( f )
+           ga = abs( g )
+           ha = abs( h )
+           fhmn = min( fa, ha )
+           fhmx = max( fa, ha )
+           if( fhmn==zero ) then
+              ssmin = zero
+              if( fhmx==zero ) then
+                 ssmax = ga
+              else
+                 ssmax = max( fhmx, ga )*sqrt( one+( min( fhmx, ga ) / max( fhmx, ga ) )**2 )
+                           
+              end if
+           else
+              if( ga<fhmx ) then
+                 as = one + fhmn / fhmx
+                 at = ( fhmx-fhmn ) / fhmx
+                 au = ( ga / fhmx )**2
+                 c = two / ( sqrt( as*as+au )+sqrt( at*at+au ) )
+                 ssmin = fhmn*c
+                 ssmax = fhmx / c
+              else
+                 au = fhmx / ga
+                 if( au==zero ) then
+                    ! avoid possible harmful underflow if exponent range
+                    ! asymmetric (true ssmin may not underflow even if
+                    ! au underflows)
+                    ssmin = ( fhmn*fhmx ) / ga
+                    ssmax = ga
+                 else
+                    as = one + fhmn / fhmx
+                    at = ( fhmx-fhmn ) / fhmx
+                    c = one / ( sqrt( one+( as*au )**2 )+sqrt( one+( at*au )**2 ) )
+                    ssmin = ( fhmn*c )*au
+                    ssmin = ssmin + ssmin
+                    ssmax = ga / ( c+c )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qlas2
+
+     !> DLASCL: multiplies the M by N real matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_qlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(qp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(qp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_qisnan(cfrom) ) then
+              info = -4
+           else if( stdlib_qisnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower half of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper half of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_qlascl
+
+     !> Using a divide and conquer approach, DLASD0: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M
+     !> matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+     !> The algorithm computes orthogonal matrices U and VT such that
+     !> B = U * S * VT. The singular values S are overwritten on D.
+     !> A related subroutine, DLASDA, computes only the singular values,
+     !> and optionally, the singular vectors in compact form.
+
+     pure subroutine stdlib_qlasd0( n, sqre, d, e, u, ldu, vt, ldvt, smlsiz, iwork,work, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n, smlsiz, sqre
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqc, im1, inode, itemp, iwk, j, lf, ll, lvl, m, ncc,&
+                      nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqrei
+           real(qp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           end if
+           m = n + sqre
+           if( ldu<n ) then
+              info = -6
+           else if( ldvt<m ) then
+              info = -8
+           else if( smlsiz<3 ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD0', -info )
+              return
+           end if
+           ! if the input matrix is too small, call stdlib_qlasdq to find the svd.
+           if( n<=smlsiz ) then
+              call stdlib_qlasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work, &
+                        info )
+              return
+           end if
+           ! set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           call stdlib_qlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_qlasdq.
+           ndb1 = ( nd+1 ) / 2
+           ncc = 0
+           loop_30: do i = ndb1, nd
+           ! ic : center row of each node
+           ! nl : number of rows of left  subproblem
+           ! nr : number of rows of right subproblem
+           ! nlf: starting row of the left   subproblem
+           ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nrp1 = nr + 1
+              nlf = ic - nl
+              nrf = ic + 1
+              sqrei = 1
+              call stdlib_qlasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ), e( nlf ),vt( nlf, nlf )&
+                        , ldvt, u( nlf, nlf ), ldu,u( nlf, nlf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + nlf - 2
+              do j = 1, nl
+                 iwork( itemp+j ) = j
+              end do
+              if( i==nd ) then
+                 sqrei = sqre
+              else
+                 sqrei = 1
+              end if
+              nrp1 = nr + sqrei
+              call stdlib_qlasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ), e( nrf ),vt( nrf, nrf )&
+                        , ldvt, u( nrf, nrf ), ldu,u( nrf, nrf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + ic
+              do j = 1, nr
+                 iwork( itemp+j-1 ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           loop_50: do lvl = nlvl, 1, -1
+              ! find the first node lf and last node ll on the
+              ! current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 if( ( sqre==0 ) .and. ( i==ll ) ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 idxqc = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 call stdlib_qlasd1( nl, nr, sqrei, d( nlf ), alpha, beta,u( nlf, nlf ), ldu, vt( &
+                           nlf, nlf ), ldvt,iwork( idxqc ), iwork( iwk ), work, info )
+              ! report the possible convergence failure.
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do
+           end do loop_50
+           return
+     end subroutine stdlib_qlasd0
+
+     !> DLASD1: computes the SVD of an upper bidiagonal N-by-M matrix B,
+     !> where N = NL + NR + 1 and M = N + SQRE. DLASD1 is called from DLASD0.
+     !> A related subroutine DLASD7 handles the case in which the singular
+     !> values (and the singular vectors in factored form) are desired.
+     !> DLASD1 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The left singular vectors of the original matrix are stored in U, and
+     !> the transpose of the right singular vectors are stored in VT, and the
+     !> singular values are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or when there are zeros in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLASD2.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the square roots of the
+     !> roots of the secular equation via the routine DLASD4 (as called
+     !> by DLASD3). This routine also calculates the singular vectors of
+     !> the current problem.
+     !> The final stage consists of computing the updated singular vectors
+     !> directly using the updated singular values.  The singular vectors
+     !> for the current problem are multiplied with the singular vectors
+     !> from the overall problem.
+
+     pure subroutine stdlib_qlasd1( nl, nr, sqre, d, alpha, beta, u, ldu, vt, ldvt,idxq, iwork, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, nl, nr, sqre
+           real(qp), intent(inout) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: idxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, i, idx, idxc, idxp, iq, isigma, iu2, ivt2, iz, k, ldq, ldu2, &
+                     ldvt2, m, n, n1, n2
+           real(qp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD1', -info )
+              return
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_qlasd2 and stdlib_qlasd3.
+           ldu2 = n
+           ldvt2 = m
+           iz = 1
+           isigma = iz + m
+           iu2 = isigma + n
+           ivt2 = iu2 + ldu2*n
+           iq = ivt2 + ldvt2*m
+           idx = 1
+           idxc = idx + n
+           coltyp = idxc + n
+           idxp = coltyp + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! deflate singular values.
+           call stdlib_qlasd2( nl, nr, sqre, k, d, work( iz ), alpha, beta, u, ldu,vt, ldvt, work(&
+            isigma ), work( iu2 ), ldu2,work( ivt2 ), ldvt2, iwork( idxp ), iwork( idx ),iwork( &
+                      idxc ), idxq, iwork( coltyp ), info )
+           ! solve secular equation and update singular vectors.
+           ldq = k
+           call stdlib_qlasd3( nl, nr, sqre, k, d, work( iq ), ldq, work( isigma ),u, ldu, work( &
+           iu2 ), ldu2, vt, ldvt, work( ivt2 ),ldvt2, iwork( idxc ), iwork( coltyp ), work( iz ),&
+                     info )
+           ! report the convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! unscale.
+           call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_qlamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_qlasd1
+
+     !> DLASD2: merges the two sets of singular values together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> singular values are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+     !> DLASD2 is called from DLASD1.
+
+     pure subroutine stdlib_qlasd2( nl, nr, sqre, k, d, z, alpha, beta, u, ldu, vt,ldvt, dsigma, &
+               u2, ldu2, vt2, ldvt2, idxp, idx,idxc, idxq, coltyp, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           real(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), idx(*), idxc(*), idxp(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(qp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(qp), intent(out) :: dsigma(*), u2(ldu2,*), vt2(ldvt2,*), z(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(qp) :: c, eps, hlftol, s, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           if( ldu<n ) then
+              info = -10
+           else if( ldvt<m ) then
+              info = -12
+           else if( ldu2<n ) then
+              info = -15
+           else if( ldvt2<m ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD2', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           ! generate the first part of the vector z; and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vt( nlp1, nlp1 )
+           z( 1 ) = z1
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vt( i, nlp1 )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vt( i, nlp2 )
+           end do
+           ! initialize some reference arrays.
+           do i = 2, nlp1
+              coltyp( i ) = 1
+           end do
+           do i = nlp2, n
+              coltyp( i ) = 2
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and the first column of u2
+           ! are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              u2( i, 1 ) = z( idxq( i ) )
+              idxc( i ) = coltyp( idxq( i ) )
+           end do
+           call stdlib_qlamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = u2( idxi, 1 )
+              coltyp( i ) = idxc( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_qlamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 coltyp( j ) = 4
+                 if( j==n )go to 120
+              else
+                 jprev = j
+                 go to 90
+              end if
+           end do
+           90 continue
+           j = jprev
+           100 continue
+           j = j + 1
+           if( j>n )go to 110
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+              coltyp( j ) = 4
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_qlapy2( c, s )
+                 c = c / tau
+                 s = -s / tau
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 ! apply back the givens rotation to the left and right
+                 ! singular vector matrices.
+                 idxjp = idxq( idx( jprev )+1 )
+                 idxj = idxq( idx( j )+1 )
+                 if( idxjp<=nlp1 ) then
+                    idxjp = idxjp - 1
+                 end if
+                 if( idxj<=nlp1 ) then
+                    idxj = idxj - 1
+                 end if
+                 call stdlib_qrot( n, u( 1, idxjp ), 1, u( 1, idxj ), 1, c, s )
+                 call stdlib_qrot( m, vt( idxjp, 1 ), ldvt, vt( idxj, 1 ), ldvt, c,s )
+                 if( coltyp( j )/=coltyp( jprev ) ) then
+                    coltyp( j ) = 3
+                 end if
+                 coltyp( jprev ) = 4
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 u2( k, 1 ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 100
+           110 continue
+           ! record the last singular value.
+           k = k + 1
+           u2( k, 1 ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           120 continue
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four groups of uniform structure (although one or more of these
+           ! groups may be empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 2, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 2
+           psm( 2 ) = 2 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           ! fill out the idxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's, starting from the
+           ! second column. this applies similarly to the rows of vt.
+           do j = 2, n
+              jp = idxp( j )
+              ct = coltyp( jp )
+              idxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the singular values and corresponding singular vectors into
+           ! dsigma, u2, and vt2 respectively.  the singular values/vectors
+           ! which were not deflated go into the first k slots of dsigma, u2,
+           ! and vt2 respectively, while those which were deflated go into the
+           ! last n - k slots, except that the first column/row will be treated
+           ! separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              idxj = idxq( idx( idxp( idxc( j ) ) )+1 )
+              if( idxj<=nlp1 ) then
+                 idxj = idxj - 1
+              end if
+              call stdlib_qcopy( n, u( 1, idxj ), 1, u2( 1, j ), 1 )
+              call stdlib_qcopy( m, vt( idxj, 1 ), ldvt, vt2( j, 1 ), ldvt2 )
+           end do
+           ! determine dsigma(1), dsigma(2) and z(1)
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_qlapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = z( m ) / z( 1 )
+              end if
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! move the rest of the updating row to z.
+           call stdlib_qcopy( k-1, u2( 2, 1 ), 1, z( 2 ), 1 )
+           ! determine the first column of u2, the first row of vt2 and the
+           ! last row of vt.
+           call stdlib_qlaset( 'A', n, 1, zero, zero, u2, ldu2 )
+           u2( nlp1, 1 ) = one
+           if( m>n ) then
+              do i = 1, nlp1
+                 vt( m, i ) = -s*vt( nlp1, i )
+                 vt2( 1, i ) = c*vt( nlp1, i )
+              end do
+              do i = nlp2, m
+                 vt2( 1, i ) = s*vt( m, i )
+                 vt( m, i ) = c*vt( m, i )
+              end do
+           else
+              call stdlib_qcopy( m, vt( nlp1, 1 ), ldvt, vt2( 1, 1 ), ldvt2 )
+           end if
+           if( m>n ) then
+              call stdlib_qcopy( m, vt( m, 1 ), ldvt, vt2( m, 1 ), ldvt2 )
+           end if
+           ! the deflated singular values and their corresponding vectors go
+           ! into the back of d, u, and v respectively.
+           if( n>k ) then
+              call stdlib_qcopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_qlacpy( 'A', n, n-k, u2( 1, k+1 ), ldu2, u( 1, k+1 ),ldu )
+              call stdlib_qlacpy( 'A', n-k, m, vt2( k+1, 1 ), ldvt2, vt( k+1, 1 ),ldvt )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_qlasd3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           return
+     end subroutine stdlib_qlasd2
+
+     !> DLASD3: finds all the square roots of the roots of the secular
+     !> equation, as defined by the values in D and Z.  It makes the
+     !> appropriate calls to DLASD4 and then updates the singular
+     !> vectors by matrix multiplication.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> DLASD3 is called from DLASD1.
+
+     pure subroutine stdlib_qlasd3( nl, nr, sqre, k, d, q, ldq, dsigma, u, ldu, u2,ldu2, vt, ldvt,&
+                vt2, ldvt2, idxc, ctot, z,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), idxc(*)
+           real(qp), intent(out) :: d(*), q(ldq,*), u(ldu,*), vt(ldvt,*)
+           real(qp), intent(inout) :: dsigma(*), vt2(ldvt2,*), z(*)
+           real(qp), intent(in) :: u2(ldu2,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ctemp, i, j, jc, ktemp, m, n, nlp1, nlp2, nrp1
+           real(qp) :: rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( ( k<1 ) .or. ( k>n ) ) then
+              info = -4
+           else if( ldq<k ) then
+              info = -7
+           else if( ldu<n ) then
+              info = -10
+           else if( ldu2<n ) then
+              info = -12
+           else if( ldvt<m ) then
+              info = -14
+           else if( ldvt2<m ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              call stdlib_qcopy( m, vt2( 1, 1 ), ldvt2, vt( 1, 1 ), ldvt )
+              if( z( 1 )>zero ) then
+                 call stdlib_qcopy( n, u2( 1, 1 ), 1, u( 1, 1 ), 1 )
+              else
+                 do i = 1, n
+                    u( i, 1 ) = -u2( i, 1 )
+                 end do
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dsigma(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_qlamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! keep a copy of z.
+           call stdlib_qcopy( k, z, 1, q, 1 )
+           ! normalize z.
+           rho = stdlib_qnrm2( k, z, 1 )
+           call stdlib_qlascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! find the new singular values.
+           do j = 1, k
+              call stdlib_qlasd4( k, j, dsigma, z, u( 1, j ), rho, d( j ),vt( 1, j ), info )
+                        
+              ! if the zero finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = u( i, k )*vt( i, k )
+              do j = 1, i - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j ) ) /( dsigma( i &
+                           )+dsigma( j ) ) )
+              end do
+              do j = i, k - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j+1 ) ) /( dsigma( &
+                           i )+dsigma( j+1 ) ) )
+              end do
+              z( i ) = sign( sqrt( abs( z( i ) ) ), q( i, 1 ) )
+           end do
+           ! compute left singular vectors of the modified diagonal matrix,
+           ! and store related information for the right singular vectors.
+           do i = 1, k
+              vt( 1, i ) = z( 1 ) / u( 1, i ) / vt( 1, i )
+              u( 1, i ) = negone
+              do j = 2, k
+                 vt( j, i ) = z( j ) / u( j, i ) / vt( j, i )
+                 u( j, i ) = dsigma( j )*vt( j, i )
+              end do
+              temp = stdlib_qnrm2( k, u( 1, i ), 1 )
+              q( 1, i ) = u( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( j, i ) = u( jc, i ) / temp
+              end do
+           end do
+           ! update the left singular vector matrix.
+           if( k==2 ) then
+              call stdlib_qgemm( 'N', 'N', n, k, k, one, u2, ldu2, q, ldq, zero, u,ldu )
+              go to 100
+           end if
+           if( ctot( 1 )>0 ) then
+              call stdlib_qgemm( 'N', 'N', nl, k, ctot( 1 ), one, u2( 1, 2 ), ldu2,q( 2, 1 ), ldq,&
+                         zero, u( 1, 1 ), ldu )
+              if( ctot( 3 )>0 ) then
+                 ktemp = 2 + ctot( 1 ) + ctot( 2 )
+                 call stdlib_qgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( &
+                           ktemp, 1 ), ldq, one, u( 1, 1 ), ldu )
+              end if
+           else if( ctot( 3 )>0 ) then
+              ktemp = 2 + ctot( 1 ) + ctot( 2 )
+              call stdlib_qgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( ktemp, &
+                        1 ), ldq, zero, u( 1, 1 ), ldu )
+           else
+              call stdlib_qlacpy( 'F', nl, k, u2, ldu2, u, ldu )
+           end if
+           call stdlib_qcopy( k, q( 1, 1 ), ldq, u( nlp1, 1 ), ldu )
+           ktemp = 2 + ctot( 1 )
+           ctemp = ctot( 2 ) + ctot( 3 )
+           call stdlib_qgemm( 'N', 'N', nr, k, ctemp, one, u2( nlp2, ktemp ), ldu2,q( ktemp, 1 ), &
+                     ldq, zero, u( nlp2, 1 ), ldu )
+           ! generate the right singular vectors.
+           100 continue
+           do i = 1, k
+              temp = stdlib_qnrm2( k, vt( 1, i ), 1 )
+              q( i, 1 ) = vt( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( i, j ) = vt( jc, i ) / temp
+              end do
+           end do
+           ! update the right singular vector matrix.
+           if( k==2 ) then
+              call stdlib_qgemm( 'N', 'N', k, m, k, one, q, ldq, vt2, ldvt2, zero,vt, ldvt )
+                        
+              return
+           end if
+           ktemp = 1 + ctot( 1 )
+           call stdlib_qgemm( 'N', 'N', k, nlp1, ktemp, one, q( 1, 1 ), ldq,vt2( 1, 1 ), ldvt2, &
+                     zero, vt( 1, 1 ), ldvt )
+           ktemp = 2 + ctot( 1 ) + ctot( 2 )
+           if( ktemp<=ldvt2 )call stdlib_qgemm( 'N', 'N', k, nlp1, ctot( 3 ), one, q( 1, ktemp ),&
+                     ldq, vt2( ktemp, 1 ), ldvt2, one, vt( 1, 1 ),ldvt )
+           ktemp = ctot( 1 ) + 1
+           nrp1 = nr + sqre
+           if( ktemp>1 ) then
+              do i = 1, k
+                 q( i, ktemp ) = q( i, 1 )
+              end do
+              do i = nlp2, m
+                 vt2( ktemp, i ) = vt2( 1, i )
+              end do
+           end if
+           ctemp = 1 + ctot( 2 ) + ctot( 3 )
+           call stdlib_qgemm( 'N', 'N', k, nrp1, ctemp, one, q( 1, ktemp ), ldq,vt2( ktemp, nlp2 )&
+                     , ldvt2, zero, vt( 1, nlp2 ), ldvt )
+           return
+     end subroutine stdlib_qlasd3
+
+     !> This subroutine computes the square root of the I-th updated
+     !> eigenvalue of a positive symmetric rank-one modification to
+     !> a positive diagonal matrix whose entries are given as the squares
+     !> of the corresponding entries in the array d, and that
+     !> 0 <= D(i) < D(j)  for  i < j
+     !> and that RHO > 0. This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D ) * diag( D ) +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_qlasd4( n, i, d, z, delta, rho, sigma, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: rho
+           real(qp), intent(out) :: sigma
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), z(*)
+           real(qp), intent(out) :: delta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 400
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3, geomavg
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(qp) :: a, b, c, delsq, delsq2, sq2, dphi, dpsi, dtiim, dtiip, dtipsq, dtisq, &
+           dtnsq, dtnsq1, dw, eps, erretm, eta, phi, prew, psi, rhoinv, sglb, sgub, tau, tau2, &
+                     temp, temp1, temp2, w
+           ! Local Arrays 
+           real(qp) :: dd(3), zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+              ! presumably, i=1 upon entry
+              sigma = sqrt( d( 1 )*d( 1 )+rho*z( 1 )*z( 1 ) )
+              delta( 1 ) = one
+              work( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_qlasd5( i, d, z, delta, rho, sigma, work )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_qlamch( 'EPSILON' )
+           rhoinv = one / rho
+           tau2= zero
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              temp = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              temp1 = temp / ( d( n )+sqrt( d( n )*d( n )+temp ) )
+              do j = 1, n
+                 work( j ) = d( j ) + d( n ) + temp1
+                 delta( j ) = ( d( j )-d( n ) ) - temp1
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / ( delta( j )*work( j ) )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / ( delta( ii )*work( ii ) ) +z( n )*z( n ) / ( delta( n )&
+                        *work( n ) )
+              if( w<=zero ) then
+                 temp1 = sqrt( d( n )*d( n )+rho )
+                 temp = z( n-1 )*z( n-1 ) / ( ( d( n-1 )+temp1 )*( d( n )-d( n-1 )+rho / ( d( n )+&
+                           temp1 ) ) ) +z( n )*z( n ) / rho
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                    a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*delsq
+                    if( a<zero ) then
+                       tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                    tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 end if
+                 ! it can be proved that
+                     ! d(n)^2+rho/2 <= sigma_n^2 < d(n)^2+tau2 <= d(n)^2+rho
+              else
+                 delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                 a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*delsq
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( a<zero ) then
+                    tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 ! it can be proved that
+                 ! d(n)^2 < d(n)^2+tau2 < sigma(n)^2 < d(n)^2+rho/2
+              end if
+              ! the following tau is to approximate sigma_n - d( n )
+               ! tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+              sigma = d( n ) + tau
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( n ) ) - tau
+                 work( j ) = d( j ) + d( n ) + tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( delta( j )*work( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / ( delta( n )*work( n ) )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              dtnsq1 = work( n-1 )*delta( n-1 )
+              dtnsq = work( n )*delta( n )
+              c = w - dtnsq1*dpsi - dtnsq*dphi
+              a = ( dtnsq+dtnsq1 )*w - dtnsq*dtnsq1*( dpsi+dphi )
+              b = dtnsq*dtnsq1*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                 eta = rho - sigma*sigma
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = eta - dtnsq
+              if( temp>rho )eta = rho + dtnsq
+              eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+                 work( j ) = work( j ) + eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              tau2 = work( n )*delta( n )
+              temp = z( n ) / tau2
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    go to 240
+                 end if
+                 ! calculate the new step
+                 dtnsq1 = work( n-1 )*delta( n-1 )
+                 dtnsq = work( n )*delta( n )
+                 c = w - dtnsq1*dpsi - dtnsq*dphi
+                 a = ( dtnsq+dtnsq1 )*w - dtnsq1*dtnsq*( dpsi+dphi )
+                 b = dtnsq1*dtnsq*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = eta - dtnsq
+                 if( temp<=zero )eta = eta / two
+                 eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                    work( j ) = work( j ) + eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 tau2 = work( n )*delta( n )
+                 temp = z( n ) / tau2
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $             + abs( tau2 )*( dpsi+dphi )
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              go to 240
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              delsq = ( d( ip1 )-d( i ) )*( d( ip1 )+d( i ) )
+              delsq2 = delsq / two
+              sq2=sqrt( ( d( i )*d( i )+d( ip1 )*d( ip1 ) ) / two )
+              temp = delsq2 / ( d( i )+sq2 )
+              do j = 1, n
+                 work( j ) = d( j ) + d( i ) + temp
+                 delta( j ) = ( d( j )-d( i ) ) - temp
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / ( work( i )*delta( i ) ) +z( ip1 )*z( ip1 ) / ( work( ip1 )&
+                        *delta( ip1 ) )
+              geomavg = .false.
+              if( w>zero ) then
+                 ! d(i)^2 < the ith sigma^2 < (d(i)^2+d(i+1)^2)/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 ii = i
+                 sglb = zero
+                 sgub = delsq2  / ( d( i )+sq2 )
+                 a = c*delsq + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*delsq
+                 if( a>zero ) then
+                    tau2 = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau2 = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( i )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( i ).
+                 tau = tau2 / ( d( i )+sqrt( d( i )*d( i )+tau2 ) )
+                 temp = sqrt(eps)
+                 if( (d(i)<=temp*d(ip1)).and.(abs(z(i))<=temp).and.(d(i)>zero) ) then
+                    tau = min( ten*d(i), sgub )
+                    geomavg = .true.
+                 end if
+              else
+                 ! (d(i)^2+d(i+1)^2)/2 <= the ith sigma^2 < d(i+1)^2/2
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 ii = ip1
+                 sglb = -delsq2  / ( d( ii )+sq2 )
+                 sgub = zero
+                 a = c*delsq - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*delsq
+                 if( a<zero ) then
+                    tau2 = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau2 = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( ip1 )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( ip1 ).
+                 tau = tau2 / ( d( ip1 )+sqrt( abs( d( ip1 )*d( ip1 )+tau2 ) ) )
+              end if
+              sigma = d( ii ) + tau
+              do j = 1, n
+                 work( j ) = d( j ) + d( ii ) + tau
+                 delta( j ) = ( d( j )-d( ii ) ) - tau
+              end do
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / ( work( ii )*delta( ii ) )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              if( w<=zero ) then
+                 sglb = max( sglb, tau )
+              else
+                 sgub = min( sgub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 dtipsq = work( ip1 )*delta( ip1 )
+                 dtisq = work( i )*delta( i )
+                 if( orgati ) then
+                    c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                 else
+                    c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                 end if
+                 a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                 b = dtipsq*dtisq*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 dtiim = work( iim1 )*delta( iim1 )
+                 dtiip = work( iip1 )*delta( iip1 )
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / dtiim
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiip*( dpsi+dphi ) ) -( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    if( dpsi<temp1 ) then
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                    end if
+                 else
+                    temp1 = z( iip1 ) / dtiip
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiim*( dpsi+dphi ) ) -( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    if( dphi<temp1 ) then
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                    else
+                       zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                    end if
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 dd( 1 ) = dtiim
+                 dd( 2 ) = delta( ii )*work( ii )
+                 dd( 3 ) = dtiip
+                 call stdlib_qlaed6( niter, orgati, c, dd, zz, w, eta, info )
+                 if( info/=0 ) then
+                    ! if info is not 0, i.e., stdlib_qlaed6 failed, switch back
+                    ! to 2 pole interpolation.
+                    swtch3 = .false.
+                    info = 0
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( orgati ) then
+                       c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                    else
+                       c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( orgati ) then
+                             a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                          else
+                             a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi)
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 end if
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+              temp = tau + eta
+              if( temp>sgub .or. temp<sglb ) then
+                 if( w<zero ) then
+                    eta = ( sgub-tau ) / two
+                 else
+                    eta = ( sglb-tau ) / two
+                 end if
+                 if( geomavg ) then
+                    if( w < zero ) then
+                       if( tau > zero ) then
+                          eta = sqrt(sgub*tau)-tau
+                       end if
+                    else
+                       if( sglb > zero ) then
+                          eta = sqrt(sglb*tau)-tau
+                       end if
+                    end if
+                 end if
+              end if
+              prew = w
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 work( j ) = work( j ) + eta
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              tau2 = work( ii )*delta( ii )
+              temp = z( ii ) / tau2
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              ! main loop to update the values of the array   delta and work
+              iter = niter + 1
+              loop_230: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+           ! $          .or. (sgub-sglb)<=eight*abs(sgub+sglb) ) then
+                    go to 240
+                 end if
+                 if( w<=zero ) then
+                    sglb = max( sglb, tau )
+                 else
+                    sgub = min( sgub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                       else
+                          c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                       end if
+                    else
+                       temp = z( ii ) / ( work( ii )*delta( ii ) )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - dtisq*dpsi - dtipsq*dphi
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                             else
+                                a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                             end if
+                          else
+                             a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    dtiim = work( iim1 )*delta( iim1 )
+                    dtiip = work( iip1 )*delta( iip1 )
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - dtiim*dpsi - dtiip*dphi
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / dtiim
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiip*( dpsi+dphi ) - temp2
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          if( dpsi<temp1 ) then
+                             zz( 3 ) = dtiip*dtiip*dphi
+                          else
+                             zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                          end if
+                       else
+                          temp1 = z( iip1 ) / dtiip
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiim*( dpsi+dphi ) - temp2
+                          if( dphi<temp1 ) then
+                             zz( 1 ) = dtiim*dtiim*dpsi
+                          else
+                             zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                          end if
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    dd( 1 ) = dtiim
+                    dd( 2 ) = delta( ii )*work( ii )
+                    dd( 3 ) = dtiip
+                    call stdlib_qlaed6( niter, orgati, c, dd, zz, w, eta, info )
+                    if( info/=0 ) then
+                       ! if info is not 0, i.e., stdlib_qlaed6 failed, switch
+                       ! back to two pole interpolation
+                       swtch3 = .false.
+                       info = 0
+                       dtipsq = work( ip1 )*delta( ip1 )
+                       dtisq = work( i )*delta( i )
+                       if( .not.swtch ) then
+                          if( orgati ) then
+                             c = w - dtipsq*dw + delsq*( z( i )/dtisq )**2
+                          else
+                             c = w - dtisq*dw - delsq*( z( ip1 )/dtipsq )**2
+                          end if
+                       else
+                          temp = z( ii ) / ( work( ii )*delta( ii ) )
+                          if( orgati ) then
+                             dpsi = dpsi + temp*temp
+                          else
+                             dphi = dphi + temp*temp
+                          end if
+                          c = w - dtisq*dpsi - dtipsq*dphi
+                       end if
+                       a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                       b = dtipsq*dtisq*w
+                       if( c==zero ) then
+                          if( a==zero ) then
+                             if( .not.swtch ) then
+                                if( orgati ) then
+                                   a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                                else
+                                   a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                                end if
+                             else
+                                a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                             end if
+                          end if
+                          eta = b / a
+                       else if( a<=zero ) then
+                          eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                       else
+                          eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                       end if
+                    end if
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+                 temp=tau+eta
+                 if( temp>sgub .or. temp<sglb ) then
+                    if( w<zero ) then
+                       eta = ( sgub-tau ) / two
+                    else
+                       eta = ( sglb-tau ) / two
+                    end if
+                    if( geomavg ) then
+                       if( w < zero ) then
+                          if( tau > zero ) then
+                             eta = sqrt(sgub*tau)-tau
+                          end if
+                       else
+                          if( sglb > zero ) then
+                             eta = sqrt(sglb*tau)-tau
+                          end if
+                       end if
+                    end if
+                 end if
+                 prew = w
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    work( j ) = work( j ) + eta
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 tau2 = work( ii )*delta( ii )
+                 temp = z( ii ) / tau2
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $             + abs( tau2 )*dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_230
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+           end if
+           240 continue
+           return
+     end subroutine stdlib_qlasd4
+
+     !> This subroutine computes the square root of the I-th eigenvalue
+     !> of a positive symmetric rank-one modification of a 2-by-2 diagonal
+     !> matrix
+     !> diag( D ) * diag( D ) +  RHO * Z * transpose(Z) .
+     !> The diagonal entries in the array D are assumed to satisfy
+     !> 0 <= D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_qlasd5( i, d, z, delta, rho, dsigma, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(qp), intent(out) :: dsigma
+           real(qp), intent(in) :: rho
+           ! Array Arguments 
+           real(qp), intent(in) :: d(2), z(2)
+           real(qp), intent(out) :: delta(2), work(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: b, c, del, delsq, tau, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           delsq = del*( d( 2 )+d( 1 ) )
+           if( i==1 ) then
+              w = one + four*rho*( z( 2 )*z( 2 ) / ( d( 1 )+three*d( 2 ) )-z( 1 )*z( 1 ) / ( &
+                        three*d( 1 )+d( 2 ) ) ) / del
+              if( w>zero ) then
+                 b = delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*delsq
+                 ! b > zero, always
+                 ! the following tau is dsigma * dsigma - d( 1 ) * d( 1 )
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 ! the following tau is dsigma - d( 1 )
+                 tau = tau / ( d( 1 )+sqrt( d( 1 )*d( 1 )+tau ) )
+                 dsigma = d( 1 ) + tau
+                 delta( 1 ) = -tau
+                 delta( 2 ) = del - tau
+                 work( 1 ) = two*d( 1 ) + tau
+                 work( 2 ) = ( d( 1 )+tau ) + d( 2 )
+                 ! delta( 1 ) = -z( 1 ) / tau
+                 ! delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*delsq
+                 ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 ! the following tau is dsigma - d( 2 )
+                 tau = tau / ( d( 2 )+sqrt( abs( d( 2 )*d( 2 )+tau ) ) )
+                 dsigma = d( 2 ) + tau
+                 delta( 1 ) = -( del+tau )
+                 delta( 2 ) = -tau
+                 work( 1 ) = d( 1 ) + tau + d( 2 )
+                 work( 2 ) = two*d( 2 ) + tau
+                 ! delta( 1 ) = -z( 1 ) / ( del+tau )
+                 ! delta( 2 ) = -z( 2 ) / tau
+              end if
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           else
+              ! now i=2
+              b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*delsq
+              ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              ! the following tau is dsigma - d( 2 )
+              tau = tau / ( d( 2 )+sqrt( d( 2 )*d( 2 )+tau ) )
+              dsigma = d( 2 ) + tau
+              delta( 1 ) = -( del+tau )
+              delta( 2 ) = -tau
+              work( 1 ) = d( 1 ) + tau + d( 2 )
+              work( 2 ) = two*d( 2 ) + tau
+              ! delta( 1 ) = -z( 1 ) / ( del+tau )
+              ! delta( 2 ) = -z( 2 ) / tau
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_qlasd5
+
+     !> DLASD6: computes the SVD of an updated upper bidiagonal matrix B
+     !> obtained by merging two smaller ones by appending a row. This
+     !> routine is used only for the problem which requires all singular
+     !> values and optionally singular vector matrices in factored form.
+     !> B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+     !> A related subroutine, DLASD1, handles the case in which all singular
+     !> values and singular vectors of the bidiagonal matrix are desired.
+     !> DLASD6 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The singular values of B can be computed using D1, D2, the first
+     !> components of all the right singular vectors of the lower block, and
+     !> the last components of all the right singular vectors of the upper
+     !> block. These components are stored and updated in VF and VL,
+     !> respectively, in DLASD6. Hence U and VT are not explicitly
+     !> referenced.
+     !> The singular values are stored in D. The algorithm consists of two
+     !> stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or if there is a zero
+     !> in the Z vector. For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one. This stage is
+     !> performed by the routine DLASD7.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the roots of the
+     !> secular equation via the routine DLASD4 (as called by DLASD8).
+     !> This routine also updates VF and VL and computes the distances
+     !> between the updated singular values and the old singular
+     !> values.
+     !> DLASD6 is called from DLASDA.
+
+     pure subroutine stdlib_qlasd6( icompq, nl, nr, sqre, d, vf, vl, alpha, beta,idxq, perm, &
+     givptr, givcol, ldgcol, givnum,ldgnum, poles, difl, difr, z, k, c, s, work,iwork, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(qp), intent(inout) :: alpha, beta
+           real(qp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), iwork(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(qp), intent(inout) :: d(*), vf(*), vl(*)
+           real(qp), intent(out) :: difl(*), difr(*), givnum(ldgnum,*), poles(ldgnum,*), work(*), &
+                     z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idx, idxc, idxp, isigma, ivfw, ivlw, iw, m, n, n1, n2
+           real(qp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -14
+           else if( ldgnum<n ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD6', -info )
+              return
+           end if
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_qlasd7 and stdlib_qlasd8.
+           isigma = 1
+           iw = isigma + n
+           ivfw = iw + m
+           ivlw = ivfw + m
+           idx = 1
+           idxc = idx + n
+           idxp = idxc + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! sort and deflate singular values.
+           call stdlib_qlasd7( icompq, nl, nr, sqre, k, d, z, work( iw ), vf,work( ivfw ), vl, &
+           work( ivlw ), alpha, beta,work( isigma ), iwork( idx ), iwork( idxp ), idxq,perm, &
+                     givptr, givcol, ldgcol, givnum, ldgnum, c, s,info )
+           ! solve secular equation, compute difl, difr, and update vf, vl.
+           call stdlib_qlasd8( icompq, k, d, z, vf, vl, difl, difr, ldgnum,work( isigma ), work( &
+                     iw ), info )
+           ! report the possible convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! save the poles if icompq = 1.
+           if( icompq==1 ) then
+              call stdlib_qcopy( k, d, 1, poles( 1, 1 ), 1 )
+              call stdlib_qcopy( k, work( isigma ), 1, poles( 1, 2 ), 1 )
+           end if
+           ! unscale.
+           call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_qlamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_qlasd6
+
+     !> DLASD7: merges the two sets of singular values together into a single
+     !> sorted set. Then it tries to deflate the size of the problem. There
+     !> are two ways in which deflation can occur:  when two or more singular
+     !> values are close together or if there is a tiny entry in the Z
+     !> vector. For each such occurrence the order of the related
+     !> secular equation problem is reduced by one.
+     !> DLASD7 is called from DLASD6.
+
+     pure subroutine stdlib_qlasd7( icompq, nl, nr, sqre, k, d, z, zw, vf, vfw, vl,vlw, alpha, &
+     beta, dsigma, idx, idxp, idxq,perm, givptr, givcol, ldgcol, givnum, ldgnum,c, s, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(qp), intent(in) :: alpha, beta
+           real(qp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), idx(*), idxp(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(qp), intent(inout) :: d(*), vf(*), vl(*)
+           real(qp), intent(out) :: dsigma(*), givnum(ldgnum,*), vfw(*), vlw(*), z(*), zw(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(qp) :: eps, hlftol, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -22
+           else if( ldgnum<n ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD7', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( icompq==1 ) then
+              givptr = 0
+           end if
+           ! generate the first part of the vector z and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vl( nlp1 )
+           vl( nlp1 ) = zero
+           tau = vf( nlp1 )
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vl( i )
+              vl( i ) = zero
+              vf( i+1 ) = vf( i )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           vf( 1 ) = tau
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vf( i )
+              vf( i ) = zero
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and zw are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              zw( i ) = z( idxq( i ) )
+              vfw( i ) = vf( idxq( i ) )
+              vlw( i ) = vl( idxq( i ) )
+           end do
+           call stdlib_qlamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = zw( idxi )
+              vf( i ) = vfw( idxi )
+              vl( i ) = vlw( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_qlamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jprev = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = jprev
+           80 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_qlapy2( c, s )
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 c = c / tau
+                 s = -s / tau
+                 ! record the appropriate givens rotation
+                 if( icompq==1 ) then
+                    givptr = givptr + 1
+                    idxjp = idxq( idx( jprev )+1 )
+                    idxj = idxq( idx( j )+1 )
+                    if( idxjp<=nlp1 ) then
+                       idxjp = idxjp - 1
+                    end if
+                    if( idxj<=nlp1 ) then
+                       idxj = idxj - 1
+                    end if
+                    givcol( givptr, 2 ) = idxjp
+                    givcol( givptr, 1 ) = idxj
+                    givnum( givptr, 2 ) = c
+                    givnum( givptr, 1 ) = s
+                 end if
+                 call stdlib_qrot( 1, vf( jprev ), 1, vf( j ), 1, c, s )
+                 call stdlib_qrot( 1, vl( jprev ), 1, vl( j ), 1, c, s )
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 zw( k ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 80
+           90 continue
+           ! record the last singular value.
+           k = k + 1
+           zw( k ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           100 continue
+           ! sort the singular values into dsigma. the singular values which
+           ! were not deflated go into the first k slots of dsigma, except
+           ! that dsigma(1) is treated separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              vfw( j ) = vf( jp )
+              vlw( j ) = vl( jp )
+           end do
+           if( icompq==1 ) then
+              do j = 2, n
+                 jp = idxp( j )
+                 perm( j ) = idxq( idx( jp )+1 )
+                 if( perm( j )<=nlp1 ) then
+                    perm( j ) = perm( j ) - 1
+                 end if
+              end do
+           end if
+           ! the deflated singular values go back into the last n - k slots of
+           ! d.
+           call stdlib_qcopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+           ! determine dsigma(1), dsigma(2), z(1), vf(1), vl(1), vf(m), and
+           ! vl(m).
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_qlapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = -z( m ) / z( 1 )
+              end if
+              call stdlib_qrot( 1, vf( m ), 1, vf( 1 ), 1, c, s )
+              call stdlib_qrot( 1, vl( m ), 1, vl( 1 ), 1, c, s )
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! restore z, vf, and vl.
+           call stdlib_qcopy( k-1, zw( 2 ), 1, z( 2 ), 1 )
+           call stdlib_qcopy( n-1, vfw( 2 ), 1, vf( 2 ), 1 )
+           call stdlib_qcopy( n-1, vlw( 2 ), 1, vl( 2 ), 1 )
+           return
+     end subroutine stdlib_qlasd7
+
+     !> DLASD8: finds the square roots of the roots of the secular equation,
+     !> as defined by the values in DSIGMA and Z. It makes the appropriate
+     !> calls to DLASD4, and stores, for each  element in D, the distance
+     !> to its two nearest poles (elements in DSIGMA). It also updates
+     !> the arrays VF and VL, the first and last components of all the
+     !> right singular vectors of the original bidiagonal matrix.
+     !> DLASD8 is called from DLASD6.
+
+     pure subroutine stdlib_qlasd8( icompq, k, d, z, vf, vl, difl, difr, lddifr,dsigma, work, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, k, lddifr
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), difl(*), difr(lddifr,*), work(*)
+           real(qp), intent(inout) :: dsigma(*), vf(*), vl(*), z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iwk1, iwk2, iwk2i, iwk3, iwk3i, j
+           real(qp) :: diflj, difrj, dj, dsigj, dsigjp, rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( k<1 ) then
+              info = -2
+           else if( lddifr<k ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASD8', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              difl( 1 ) = d( 1 )
+              if( icompq==1 ) then
+                 difl( 2 ) = one
+                 difr( 1, 2 ) = one
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_qlamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! book keeping.
+           iwk1 = 1
+           iwk2 = iwk1 + k
+           iwk3 = iwk2 + k
+           iwk2i = iwk2 - 1
+           iwk3i = iwk3 - 1
+           ! normalize z.
+           rho = stdlib_qnrm2( k, z, 1 )
+           call stdlib_qlascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! initialize work(iwk3).
+           call stdlib_qlaset( 'A', k, 1, one, one, work( iwk3 ), k )
+           ! compute the updated singular values, the arrays difl, difr,
+           ! and the updated z.
+           do j = 1, k
+              call stdlib_qlasd4( k, j, dsigma, z, work( iwk1 ), rho, d( j ),work( iwk2 ), info )
+                        
+              ! if the root finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+              work( iwk3i+j ) = work( iwk3i+j )*work( j )*work( iwk2i+j )
+              difl( j ) = -work( j )
+              difr( j, 1 ) = -work( j+1 )
+              do i = 1, j - 1
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+              do i = j + 1, k
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = sign( sqrt( abs( work( iwk3i+i ) ) ), z( i ) )
+           end do
+           ! update vf and vl.
+           do j = 1, k
+              diflj = difl( j )
+              dj = d( j )
+              dsigj = -dsigma( j )
+              if( j<k ) then
+                 difrj = -difr( j, 1 )
+                 dsigjp = -dsigma( j+1 )
+              end if
+              work( j ) = -z( j ) / diflj / ( dsigma( j )+dj )
+              do i = 1, j - 1
+                 work( i ) = z( i ) / ( stdlib_qlamc3( dsigma( i ), dsigj )-diflj )/ ( dsigma( i )&
+                           +dj )
+              end do
+              do i = j + 1, k
+                 work( i ) = z( i ) / ( stdlib_qlamc3( dsigma( i ), dsigjp )+difrj )/ ( dsigma( i &
+                           )+dj )
+              end do
+              temp = stdlib_qnrm2( k, work, 1 )
+              work( iwk2i+j ) = stdlib_qdot( k, work, 1, vf, 1 ) / temp
+              work( iwk3i+j ) = stdlib_qdot( k, work, 1, vl, 1 ) / temp
+              if( icompq==1 ) then
+                 difr( j, 2 ) = temp
+              end if
+           end do
+           call stdlib_qcopy( k, work( iwk2 ), 1, vf, 1 )
+           call stdlib_qcopy( k, work( iwk3 ), 1, vl, 1 )
+           return
+     end subroutine stdlib_qlasd8
+
+     !> Using a divide and conquer approach, DLASDA: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+     !> B with diagonal D and offdiagonal E, where M = N + SQRE. The
+     !> algorithm computes the singular values in the SVD B = U * S * VT.
+     !> The orthogonal matrices U and VT are optionally computed in
+     !> compact form.
+     !> A related subroutine, DLASD0, computes the singular values and
+     !> the singular vectors in explicit form.
+
+     pure subroutine stdlib_qlasda( icompq, smlsiz, n, sqre, d, e, u, ldu, vt, k,difl, difr, z, &
+               poles, givptr, givcol, ldgcol,perm, givnum, c, s, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldgcol, ldu, n, smlsiz, sqre
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), givptr(*), iwork(*), k(*), perm(ldgcol,&
+                     *)
+           real(qp), intent(out) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), &
+                     s(*), u(ldu,*), vt(ldu,*), work(*), z(ldu,*)
+           real(qp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqi, im1, inode, itemp, iwk, j, lf, ll, lvl, lvl2, &
+           m, ncc, nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, nru, nwork1, &
+                     nwork2, smlszp, sqrei, vf, vfi, vl, vli
+           real(qp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldu<( n+sqre ) ) then
+              info = -8
+           else if( ldgcol<n ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASDA', -info )
+              return
+           end if
+           m = n + sqre
+           ! if the input matrix is too small, call stdlib_qlasdq to find the svd.
+           if( n<=smlsiz ) then
+              if( icompq==0 ) then
+                 call stdlib_qlasdq( 'U', sqre, n, 0, 0, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              else
+                 call stdlib_qlasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              end if
+              return
+           end if
+           ! book-keeping and  set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           ncc = 0
+           nru = 0
+           smlszp = smlsiz + 1
+           vf = 1
+           vl = vf + m
+           nwork1 = vl + m
+           nwork2 = nwork1 + smlszp*smlszp
+           call stdlib_qlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_qlasdq.
+           ndb1 = ( nd+1 ) / 2
+           loop_30: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              idxqi = idxq + nlf - 2
+              vfi = vf + nlf - 1
+              vli = vl + nlf - 1
+              sqrei = 1
+              if( icompq==0 ) then
+                 call stdlib_qlaset( 'A', nlp1, nlp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_qlasdq( 'U', sqrei, nl, nlp1, nru, ncc, d( nlf ),e( nlf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nl, work( nwork2 ), nl,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + nl*smlszp
+                 call stdlib_qcopy( nlp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_qcopy( nlp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_qlaset( 'A', nl, nl, zero, one, u( nlf, 1 ), ldu )
+                 call stdlib_qlaset( 'A', nlp1, nlp1, zero, one, vt( nlf, 1 ), ldu )
+                 call stdlib_qlasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ),e( nlf ), vt( nlf, 1 &
+                           ), ldu, u( nlf, 1 ), ldu,u( nlf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_qcopy( nlp1, vt( nlf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_qcopy( nlp1, vt( nlf, nlp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nl
+                 iwork( idxqi+j ) = j
+              end do
+              if( ( i==nd ) .and. ( sqre==0 ) ) then
+                 sqrei = 0
+              else
+                 sqrei = 1
+              end if
+              idxqi = idxqi + nlp1
+              vfi = vfi + nlp1
+              vli = vli + nlp1
+              nrp1 = nr + sqrei
+              if( icompq==0 ) then
+                 call stdlib_qlaset( 'A', nrp1, nrp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_qlasdq( 'U', sqrei, nr, nrp1, nru, ncc, d( nrf ),e( nrf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nr, work( nwork2 ), nr,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + ( nrp1-1 )*smlszp
+                 call stdlib_qcopy( nrp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_qcopy( nrp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_qlaset( 'A', nr, nr, zero, one, u( nrf, 1 ), ldu )
+                 call stdlib_qlaset( 'A', nrp1, nrp1, zero, one, vt( nrf, 1 ), ldu )
+                 call stdlib_qlasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ),e( nrf ), vt( nrf, 1 &
+                           ), ldu, u( nrf, 1 ), ldu,u( nrf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_qcopy( nrp1, vt( nrf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_qcopy( nrp1, vt( nrf, nrp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nr
+                 iwork( idxqi+j ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           j = 2**nlvl
+           loop_50: do lvl = nlvl, 1, -1
+              lvl2 = lvl*2 - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              loop_40: do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 vfi = vf + nlf - 1
+                 vli = vl + nlf - 1
+                 idxqi = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 if( icompq==0 ) then
+                    call stdlib_qlasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm, givptr( 1 ), givcol,ldgcol, givnum, ldu, &
+                    poles, difl, difr, z,k( 1 ), c( 1 ), s( 1 ), work( nwork1 ),iwork( iwk ), &
+                              info )
+                 else
+                    j = j - 1
+                    call stdlib_qlasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ),&
+                     ldgcol,givnum( nlf, lvl2 ), ldu,poles( nlf, lvl2 ), difl( nlf, lvl ),difr( &
+                     nlf, lvl2 ), z( nlf, lvl ), k( j ),c( j ), s( j ), work( nwork1 ),iwork( iwk &
+                               ), info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do loop_40
+           end do loop_50
+           return
+     end subroutine stdlib_qlasda
+
+     !> DLASDQ: computes the singular value decomposition (SVD) of a real
+     !> (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+     !> E, accumulating the transformations if desired. Letting B denote
+     !> the input bidiagonal matrix, the algorithm computes orthogonal
+     !> matrices Q and P such that B = Q * S * P**T (P**T denotes the transpose
+     !> of P). The singular values S are overwritten on D.
+     !> The input matrix U  is changed to U  * Q  if desired.
+     !> The input matrix VT is changed to P**T * VT if desired.
+     !> The input matrix C  is changed to Q**T * C  if desired.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3, for a detailed description of the algorithm.
+
+     pure subroutine stdlib_qlasdq( uplo, sqre, n, ncvt, nru, ncc, d, e, vt, ldvt,u, ldu, c, ldc, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru, sqre
+           ! Array Arguments 
+           real(qp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: rotate
+           integer(ilp) :: i, isub, iuplo, j, np1, sqre1
+           real(qp) :: cs, r, smin, sn
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( iuplo==0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncvt<0 ) then
+              info = -4
+           else if( nru<0 ) then
+              info = -5
+           else if( ncc<0 ) then
+              info = -6
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -10
+           else if( ldu<max( 1, nru ) ) then
+              info = -12
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASDQ', -info )
+              return
+           end if
+           if( n==0 )return
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           np1 = n + 1
+           sqre1 = sqre
+           ! if matrix non-square upper bidiagonal, rotate to be lower
+           ! bidiagonal.  the rotations are on the right.
+           if( ( iuplo==1 ) .and. ( sqre1==1 ) ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              call stdlib_qlartg( d( n ), e( n ), cs, sn, r )
+              d( n ) = r
+              e( n ) = zero
+              if( rotate ) then
+                 work( n ) = cs
+                 work( n+n ) = sn
+              end if
+              iuplo = 2
+              sqre1 = 0
+              ! update singular vectors if desired.
+              if( ncvt>0 )call stdlib_qlasr( 'L', 'V', 'F', np1, ncvt, work( 1 ),work( np1 ), vt, &
+                        ldvt )
+           end if
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left.
+           if( iuplo==2 ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              ! if matrix (n+1)-by-n lower bidiagonal, one additional
+              ! rotation is needed.
+              if( sqre1==1 ) then
+                 call stdlib_qlartg( d( n ), e( n ), cs, sn, r )
+                 d( n ) = r
+                 if( rotate ) then
+                    work( n ) = cs
+                    work( n+n ) = sn
+                 end if
+              end if
+              ! update singular vectors if desired.
+              if( nru>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_qlasr( 'R', 'V', 'F', nru, n, work( 1 ),work( np1 ), u, ldu )
+                              
+                 else
+                    call stdlib_qlasr( 'R', 'V', 'F', nru, np1, work( 1 ),work( np1 ), u, ldu )
+                              
+                 end if
+              end if
+              if( ncc>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_qlasr( 'L', 'V', 'F', n, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 else
+                    call stdlib_qlasr( 'L', 'V', 'F', np1, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 end if
+              end if
+           end if
+           ! call stdlib_qbdsqr to compute the svd of the reduced real
+           ! n-by-n upper bidiagonal matrix.
+           call stdlib_qbdsqr( 'U', n, ncvt, nru, ncc, d, e, vt, ldvt, u, ldu, c,ldc, work, info )
+                     
+           ! sort the singular values into ascending order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n
+              ! scan for smallest d(i).
+              isub = i
+              smin = d( i )
+              do j = i + 1, n
+                 if( d( j )<smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=i ) then
+                 ! swap singular values and vectors.
+                 d( isub ) = d( i )
+                 d( i ) = smin
+                 if( ncvt>0 )call stdlib_qswap( ncvt, vt( isub, 1 ), ldvt, vt( i, 1 ), ldvt )
+                           
+                 if( nru>0 )call stdlib_qswap( nru, u( 1, isub ), 1, u( 1, i ), 1 )
+                 if( ncc>0 )call stdlib_qswap( ncc, c( isub, 1 ), ldc, c( i, 1 ), ldc )
+              end if
+           end do
+           return
+     end subroutine stdlib_qlasdq
+
+     !> DLASDT: creates a tree of subproblems for bidiagonal divide and
+     !> conquer.
+
+     pure subroutine stdlib_qlasdt( n, lvl, nd, inode, ndiml, ndimr, msub )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: lvl, nd
+           integer(ilp), intent(in) :: msub, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: inode(*), ndiml(*), ndimr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, ir, llst, maxn, ncrnt, nlvl
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: real,int,log,max
+           ! Executable Statements 
+           ! find the number of levels on the tree.
+           maxn = max( 1, n )
+           temp = log( real( maxn,KIND=qp) / real( msub+1,KIND=qp) ) / log( two )
+           lvl = int( temp,KIND=ilp) + 1
+           i = n / 2
+           inode( 1 ) = i + 1
+           ndiml( 1 ) = i
+           ndimr( 1 ) = n - i - 1
+           il = 0
+           ir = 1
+           llst = 1
+           do nlvl = 1, lvl - 1
+              ! constructing the tree at (nlvl+1)-st level. the number of
+              ! nodes created on this level is llst * 2.
+              do i = 0, llst - 1
+                 il = il + 2
+                 ir = ir + 2
+                 ncrnt = llst + i
+                 ndiml( il ) = ndiml( ncrnt ) / 2
+                 ndimr( il ) = ndiml( ncrnt ) - ndiml( il ) - 1
+                 inode( il ) = inode( ncrnt ) - ndimr( il ) - 1
+                 ndiml( ir ) = ndimr( ncrnt ) / 2
+                 ndimr( ir ) = ndimr( ncrnt ) - ndiml( ir ) - 1
+                 inode( ir ) = inode( ncrnt ) + ndiml( ir ) + 1
+              end do
+              llst = llst*2
+           end do
+           nd = llst*2 - 1
+           return
+     end subroutine stdlib_qlasdt
+
+     !> DLASET: initializes an m-by-n matrix A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_qlaset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(qp), intent(out) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the strictly upper triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the strictly lower triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else
+              ! set the leading m-by-n submatrix to alpha.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           end if
+           ! set the first min(m,n) diagonal elements to beta.
+           do i = 1, min( m, n )
+              a( i, i ) = beta
+           end do
+           return
+     end subroutine stdlib_qlaset
+
+     !> DLASQ1: computes the singular values of a real N-by-N bidiagonal
+     !> matrix with diagonal D and off-diagonal E. The singular values
+     !> are computed to high relative accuracy, in the absence of
+     !> denormalization, underflow and overflow. The algorithm was first
+     !> presented in
+     !> "Accurate singular values and differential qd algorithms" by K. V.
+     !> Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+     !> 1994,
+     !> and the present implementation is described in "An implementation of
+     !> the dqds Algorithm (Positive Case)", LAPACK Working Note.
+
+     pure subroutine stdlib_qlasq1( n, d, e, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo
+           real(qp) :: eps, scale, safmin, sigmn, sigmx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLASQ1', -info )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              d( 1 ) = abs( d( 1 ) )
+              return
+           else if( n==2 ) then
+              call stdlib_qlas2( d( 1 ), e( 1 ), d( 2 ), sigmn, sigmx )
+              d( 1 ) = sigmx
+              d( 2 ) = sigmn
+              return
+           end if
+           ! estimate the largest singular value.
+           sigmx = zero
+           do i = 1, n - 1
+              d( i ) = abs( d( i ) )
+              sigmx = max( sigmx, abs( e( i ) ) )
+           end do
+           d( n ) = abs( d( n ) )
+           ! early return if sigmx is zero (matrix is already diagonal).
+           if( sigmx==zero ) then
+              call stdlib_qlasrt( 'D', n, d, iinfo )
+              return
+           end if
+           do i = 1, n
+              sigmx = max( sigmx, d( i ) )
+           end do
+           ! copy d and e into work (in the z format) and scale (squaring the
+           ! input data makes scaling by a power of the radix pointless).
+           eps = stdlib_qlamch( 'PRECISION' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           scale = sqrt( eps / safmin )
+           call stdlib_qcopy( n, d, 1, work( 1 ), 2 )
+           call stdlib_qcopy( n-1, e, 1, work( 2 ), 2 )
+           call stdlib_qlascl( 'G', 0, 0, sigmx, scale, 2*n-1, 1, work, 2*n-1,iinfo )
+           ! compute the q's and e's.
+           do i = 1, 2*n - 1
+              work( i ) = work( i )**2
+           end do
+           work( 2*n ) = zero
+           call stdlib_qlasq2( n, work, info )
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = sqrt( work( i ) )
+              end do
+              call stdlib_qlascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+           else if( info==2 ) then
+           ! maximum number of iterations exceeded.  move data from work
+           ! into d and e so the calling subroutine can try to finish
+              do i = 1, n
+                 d( i ) = sqrt( work( 2*i-1 ) )
+                 e( i ) = sqrt( work( 2*i ) )
+              end do
+              call stdlib_qlascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+              call stdlib_qlascl( 'G', 0, 0, scale, sigmx, n, 1, e, n, iinfo )
+           end if
+           return
+     end subroutine stdlib_qlasq1
+
+     !> DLASQ2: computes all the eigenvalues of the symmetric positive
+     !> definite tridiagonal matrix associated with the qd array Z to high
+     !> relative accuracy are computed to high relative accuracy, in the
+     !> absence of denormalization, underflow and overflow.
+     !> To see the relation of Z to the tridiagonal matrix, let L be a
+     !> unit lower bidiagonal matrix with subdiagonals Z(2,4,6,,..) and
+     !> let U be an upper bidiagonal matrix with 1's above and diagonal
+     !> Z(1,3,5,,..). The tridiagonal is L*U or, if you prefer, the
+     !> symmetric tridiagonal to which it is similar.
+     !> Note : DLASQ2 defines a logical variable, IEEE, which is true
+     !> on machines which follow ieee-754 floating-point standard in their
+     !> handling of infinities and NaNs, and false otherwise. This variable
+     !> is passed to DLASQ3.
+
+     pure subroutine stdlib_qlasq2( n, z, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: cbias = 1.50_qp
+           real(qp), parameter :: hundrd = 100.0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ieee
+           integer(ilp) :: i0, i1, i4, iinfo, ipn4, iter, iwhila, iwhilb, k, kmin, n0, n1, nbig, &
+                     ndiv, nfail, pp, splt, ttype
+           real(qp) :: d, dee, deemin, desig, dmin, dmin1, dmin2, dn, dn1, dn2, e, emax, emin, &
+           eps, g, oldemn, qmax, qmin, s, safmin, sigma, t, tau, temp, tol, tol2, trace, zmax, &
+                     tempe, tempq
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments.
+           ! (in case stdlib_qlasq2 is not called by stdlib_qlasq1)
+           info = 0
+           eps = stdlib_qlamch( 'PRECISION' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DLASQ2', 1 )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              ! 1-by-1 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+              end if
+              return
+           else if( n==2 ) then
+              ! 2-by-2 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( 2 )<zero ) then
+                 info = -202
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( 3 )<zero ) then
+                info = -203
+                call stdlib_xerbla( 'DLASQ2', 2 )
+                return
+              else if( z( 3 )>z( 1 ) ) then
+                 d = z( 3 )
+                 z( 3 ) = z( 1 )
+                 z( 1 ) = d
+              end if
+              z( 5 ) = z( 1 ) + z( 2 ) + z( 3 )
+              if( z( 2 )>z( 3 )*tol2 ) then
+                 t = half*( ( z( 1 )-z( 3 ) )+z( 2 ) )
+                 s = z( 3 )*( z( 2 ) / t )
+                 if( s<=t ) then
+                    s = z( 3 )*( z( 2 ) / ( t*( one+sqrt( one+s / t ) ) ) )
+                 else
+                    s = z( 3 )*( z( 2 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+                 end if
+                 t = z( 1 ) + ( s+z( 2 ) )
+                 z( 3 ) = z( 3 )*( z( 1 ) / t )
+                 z( 1 ) = t
+              end if
+              z( 2 ) = z( 3 )
+              z( 6 ) = z( 2 ) + z( 1 )
+              return
+           end if
+           ! check for negative data and compute sums of q's and e's.
+           z( 2*n ) = zero
+           emin = z( 2 )
+           qmax = zero
+           zmax = zero
+           d = zero
+           e = zero
+           do k = 1, 2*( n-1 ), 2
+              if( z( k )<zero ) then
+                 info = -( 200+k )
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              else if( z( k+1 )<zero ) then
+                 info = -( 200+k+1 )
+                 call stdlib_xerbla( 'DLASQ2', 2 )
+                 return
+              end if
+              d = d + z( k )
+              e = e + z( k+1 )
+              qmax = max( qmax, z( k ) )
+              emin = min( emin, z( k+1 ) )
+              zmax = max( qmax, zmax, z( k+1 ) )
+           end do
+           if( z( 2*n-1 )<zero ) then
+              info = -( 200+2*n-1 )
+              call stdlib_xerbla( 'DLASQ2', 2 )
+              return
+           end if
+           d = d + z( 2*n-1 )
+           qmax = max( qmax, z( 2*n-1 ) )
+           zmax = max( qmax, zmax )
+           ! check for diagonality.
+           if( e==zero ) then
+              do k = 2, n
+                 z( k ) = z( 2*k-1 )
+              end do
+              call stdlib_qlasrt( 'D', n, z, iinfo )
+              z( 2*n-1 ) = d
+              return
+           end if
+           trace = d + e
+           ! check for zero data.
+           if( trace==zero ) then
+              z( 2*n-1 ) = zero
+              return
+           end if
+           ! check whether the machine is ieee conformable.
+           ieee = ( stdlib_ilaenv( 10, 'DLASQ2', 'N', 1, 2, 3, 4 )==1 )
+           ! rearrange data for locality: z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...).
+           do k = 2*n, 2, -2
+              z( 2*k ) = zero
+              z( 2*k-1 ) = z( k )
+              z( 2*k-2 ) = zero
+              z( 2*k-3 ) = z( k-1 )
+           end do
+           i0 = 1
+           n0 = n
+           ! reverse the qd-array, if warranted.
+           if( cbias*z( 4*i0-3 )<z( 4*n0-3 ) ) then
+              ipn4 = 4*( i0+n0 )
+              do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                 temp = z( i4-3 )
+                 z( i4-3 ) = z( ipn4-i4-3 )
+                 z( ipn4-i4-3 ) = temp
+                 temp = z( i4-1 )
+                 z( i4-1 ) = z( ipn4-i4-5 )
+                 z( ipn4-i4-5 ) = temp
+              end do
+           end if
+           ! initial split checking via dqd and li's test.
+           pp = 0
+           loop_80: do k = 1, 2
+              d = z( 4*n0+pp-3 )
+              do i4 = 4*( n0-1 ) + pp, 4*i0 + pp, -4
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    d = z( i4-3 )
+                 else
+                    d = z( i4-3 )*( d / ( d+z( i4-1 ) ) )
+                 end if
+              end do
+              ! dqd maps z to zz plus li's test.
+              emin = z( 4*i0+pp+1 )
+              d = z( 4*i0+pp-3 )
+              do i4 = 4*i0 + pp, 4*( n0-1 ) + pp, 4
+                 z( i4-2*pp-2 ) = d + z( i4-1 )
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    z( i4-2*pp-2 ) = d
+                    z( i4-2*pp ) = zero
+                    d = z( i4+1 )
+                 else if( safmin*z( i4+1 )<z( i4-2*pp-2 ) .and.safmin*z( i4-2*pp-2 )<z( i4+1 ) ) &
+                           then
+                    temp = z( i4+1 ) / z( i4-2*pp-2 )
+                    z( i4-2*pp ) = z( i4-1 )*temp
+                    d = d*temp
+                 else
+                    z( i4-2*pp ) = z( i4+1 )*( z( i4-1 ) / z( i4-2*pp-2 ) )
+                    d = z( i4+1 )*( d / z( i4-2*pp-2 ) )
+                 end if
+                 emin = min( emin, z( i4-2*pp ) )
+              end do
+              z( 4*n0-pp-2 ) = d
+              ! now find qmax.
+              qmax = z( 4*i0-pp-2 )
+              do i4 = 4*i0 - pp + 2, 4*n0 - pp - 2, 4
+                 qmax = max( qmax, z( i4 ) )
+              end do
+              ! prepare for the next iteration on k.
+              pp = 1 - pp
+           end do loop_80
+           ! initialise variables to pass to stdlib_qlasq3.
+           ttype = 0
+           dmin1 = zero
+           dmin2 = zero
+           dn    = zero
+           dn1   = zero
+           dn2   = zero
+           g     = zero
+           tau   = zero
+           iter = 2
+           nfail = 0
+           ndiv = 2*( n0-i0 )
+           loop_160: do iwhila = 1, n + 1
+              if( n0<1 )go to 170
+              ! while array unfinished do
+              ! e(n0) holds the value of sigma when submatrix in i0:n0
+              ! splits from the rest of the array, but is negated.
+              desig = zero
+              if( n0==n ) then
+                 sigma = zero
+              else
+                 sigma = -z( 4*n0-1 )
+              end if
+              if( sigma<zero ) then
+                 info = 1
+                 return
+              end if
+              ! find last unreduced submatrix's top index i0, find qmax and
+              ! emin. find gershgorin-type bound if q's much greater than e's.
+              emax = zero
+              if( n0>i0 ) then
+                 emin = abs( z( 4*n0-5 ) )
+              else
+                 emin = zero
+              end if
+              qmin = z( 4*n0-3 )
+              qmax = qmin
+              do i4 = 4*n0, 8, -4
+                 if( z( i4-5 )<=zero )go to 100
+                 if( qmin>=four*emax ) then
+                    qmin = min( qmin, z( i4-3 ) )
+                    emax = max( emax, z( i4-5 ) )
+                 end if
+                 qmax = max( qmax, z( i4-7 )+z( i4-5 ) )
+                 emin = min( emin, z( i4-5 ) )
+              end do
+              i4 = 4
+              100 continue
+              i0 = i4 / 4
+              pp = 0
+              if( n0-i0>1 ) then
+                 dee = z( 4*i0-3 )
+                 deemin = dee
+                 kmin = i0
+                 do i4 = 4*i0+1, 4*n0-3, 4
+                    dee = z( i4 )*( dee /( dee+z( i4-2 ) ) )
+                    if( dee<=deemin ) then
+                       deemin = dee
+                       kmin = ( i4+3 )/4
+                    end if
+                 end do
+                 if( (kmin-i0)*2<n0-kmin .and.deemin<=half*z(4*n0-3) ) then
+                    ipn4 = 4*( i0+n0 )
+                    pp = 2
+                    do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                       temp = z( i4-3 )
+                       z( i4-3 ) = z( ipn4-i4-3 )
+                       z( ipn4-i4-3 ) = temp
+                       temp = z( i4-2 )
+                       z( i4-2 ) = z( ipn4-i4-2 )
+                       z( ipn4-i4-2 ) = temp
+                       temp = z( i4-1 )
+                       z( i4-1 ) = z( ipn4-i4-5 )
+                       z( ipn4-i4-5 ) = temp
+                       temp = z( i4 )
+                       z( i4 ) = z( ipn4-i4-4 )
+                       z( ipn4-i4-4 ) = temp
+                    end do
+                 end if
+              end if
+              ! put -(initial shift) into dmin.
+              dmin = -max( zero, qmin-two*sqrt( qmin )*sqrt( emax ) )
+              ! now i0:n0 is unreduced.
+              ! pp = 0 for ping, pp = 1 for pong.
+              ! pp = 2 indicates that flipping was applied to the z array and
+                     ! and that the tests for deflation upon entry in stdlib_qlasq3
+                     ! should not be performed.
+              nbig = 100*( n0-i0+1 )
+              loop_140: do iwhilb = 1, nbig
+                 if( i0>n0 )go to 150
+                 ! while submatrix unfinished take a good dqds step.
+                 call stdlib_qlasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+                           ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+                 pp = 1 - pp
+                 ! when emin is very small check for splits.
+                 if( pp==0 .and. n0-i0>=3 ) then
+                    if( z( 4*n0 )<=tol2*qmax .or.z( 4*n0-1 )<=tol2*sigma ) then
+                       splt = i0 - 1
+                       qmax = z( 4*i0-3 )
+                       emin = z( 4*i0-1 )
+                       oldemn = z( 4*i0 )
+                       do i4 = 4*i0, 4*( n0-3 ), 4
+                          if( z( i4 )<=tol2*z( i4-3 ) .or.z( i4-1 )<=tol2*sigma ) then
+                             z( i4-1 ) = -sigma
+                             splt = i4 / 4
+                             qmax = zero
+                             emin = z( i4+3 )
+                             oldemn = z( i4+4 )
+                          else
+                             qmax = max( qmax, z( i4+1 ) )
+                             emin = min( emin, z( i4-1 ) )
+                             oldemn = min( oldemn, z( i4 ) )
+                          end if
+                       end do
+                       z( 4*n0-1 ) = emin
+                       z( 4*n0 ) = oldemn
+                       i0 = splt + 1
+                    end if
+                 end if
+              end do loop_140
+              info = 2
+              ! maximum number of iterations exceeded, restore the shift
+              ! sigma and place the new d's and e's in a qd array.
+              ! this might need to be done for several blocks
+              i1 = i0
+              n1 = n0
+              145 continue
+              tempq = z( 4*i0-3 )
+              z( 4*i0-3 ) = z( 4*i0-3 ) + sigma
+              do k = i0+1, n0
+                 tempe = z( 4*k-5 )
+                 z( 4*k-5 ) = z( 4*k-5 ) * (tempq / z( 4*k-7 ))
+                 tempq = z( 4*k-3 )
+                 z( 4*k-3 ) = z( 4*k-3 ) + sigma + tempe - z( 4*k-5 )
+              end do
+              ! prepare to do this on the previous block if there is one
+              if( i1>1 ) then
+                 n1 = i1-1
+                 do while( ( i1>=2 ) .and. ( z(4*i1-5)>=zero ) )
+                    i1 = i1 - 1
+                 end do
+                 sigma = -z(4*n1-1)
+                 go to 145
+              end if
+              do k = 1, n
+                 z( 2*k-1 ) = z( 4*k-3 )
+              ! only the block 1..n0 is unfinished.  the rest of the e's
+              ! must be essentially zero, although sometimes other data
+              ! has been stored in them.
+                 if( k<n0 ) then
+                    z( 2*k ) = z( 4*k-1 )
+                 else
+                    z( 2*k ) = 0
+                 end if
+              end do
+              return
+              ! end iwhilb
+              150 continue
+           end do loop_160
+           info = 3
+           return
+           ! end iwhila
+           170 continue
+           ! move q's to the front.
+           do k = 2, n
+              z( k ) = z( 4*k-3 )
+           end do
+           ! sort and compute sum of eigenvalues.
+           call stdlib_qlasrt( 'D', n, z, iinfo )
+           e = zero
+           do k = n, 1, -1
+              e = e + z( k )
+           end do
+           ! store trace, sum(eigenvalues) and information on performance.
+           z( 2*n+1 ) = trace
+           z( 2*n+2 ) = e
+           z( 2*n+3 ) = real( iter,KIND=qp)
+           z( 2*n+4 ) = real( ndiv,KIND=qp) / real( n**2,KIND=qp)
+           z( 2*n+5 ) = hundrd*nfail / real( iter,KIND=qp)
+           return
+     end subroutine stdlib_qlasq2
+
+     !> DLASQ3: checks for deflation, computes a shift (TAU) and calls dqds.
+     !> In case of failure it changes shifts, and tries again until output
+     !> is positive.
+
+     pure subroutine stdlib_qlasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+               ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0
+           integer(ilp), intent(inout) :: iter, n0, ndiv, nfail, pp
+           real(qp), intent(inout) :: desig, dmin1, dmin2, dn, dn1, dn2, g, qmax, tau
+           real(qp), intent(out) :: dmin, sigma
+           ! Array Arguments 
+           real(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: cbias = 1.50_qp
+           real(qp), parameter :: qurtr = 0.250_qp
+           real(qp), parameter :: hundrd = 100.0_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ipn4, j4, n0in, nn
+           integer(ilp), intent(inout) :: ttype
+           real(qp) :: eps, s, t, temp, tol, tol2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           n0in = n0
+           eps = stdlib_qlamch( 'PRECISION' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           ! check for deflation.
+           10 continue
+           if( n0<i0 )return
+           if( n0==i0 )go to 20
+           nn = 4*n0 + pp
+           if( n0==( i0+1 ) )go to 40
+           ! check whether e(n0-1) is negligible, 1 eigenvalue.
+           if( z( nn-5 )>tol2*( sigma+z( nn-3 ) ) .and.z( nn-2*pp-4 )>tol2*z( nn-7 ) )go to &
+                     30
+           20 continue
+           z( 4*n0-3 ) = z( 4*n0+pp-3 ) + sigma
+           n0 = n0 - 1
+           go to 10
+           ! check  whether e(n0-2) is negligible, 2 eigenvalues.
+           30 continue
+           if( z( nn-9 )>tol2*sigma .and.z( nn-2*pp-8 )>tol2*z( nn-11 ) )go to 50
+           40 continue
+           if( z( nn-3 )>z( nn-7 ) ) then
+              s = z( nn-3 )
+              z( nn-3 ) = z( nn-7 )
+              z( nn-7 ) = s
+           end if
+           t = half*( ( z( nn-7 )-z( nn-3 ) )+z( nn-5 ) )
+           if( z( nn-5 )>z( nn-3 )*tol2.and.t/=zero ) then
+              s = z( nn-3 )*( z( nn-5 ) / t )
+              if( s<=t ) then
+                 s = z( nn-3 )*( z( nn-5 ) /( t*( one+sqrt( one+s / t ) ) ) )
+              else
+                 s = z( nn-3 )*( z( nn-5 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+              end if
+              t = z( nn-7 ) + ( s+z( nn-5 ) )
+              z( nn-3 ) = z( nn-3 )*( z( nn-7 ) / t )
+              z( nn-7 ) = t
+           end if
+           z( 4*n0-7 ) = z( nn-7 ) + sigma
+           z( 4*n0-3 ) = z( nn-3 ) + sigma
+           n0 = n0 - 2
+           go to 10
+           50 continue
+           if( pp==2 )pp = 0
+           ! reverse the qd-array, if warranted.
+           if( dmin<=zero .or. n0<n0in ) then
+              if( cbias*z( 4*i0+pp-3 )<z( 4*n0+pp-3 ) ) then
+                 ipn4 = 4*( i0+n0 )
+                 do j4 = 4*i0, 2*( i0+n0-1 ), 4
+                    temp = z( j4-3 )
+                    z( j4-3 ) = z( ipn4-j4-3 )
+                    z( ipn4-j4-3 ) = temp
+                    temp = z( j4-2 )
+                    z( j4-2 ) = z( ipn4-j4-2 )
+                    z( ipn4-j4-2 ) = temp
+                    temp = z( j4-1 )
+                    z( j4-1 ) = z( ipn4-j4-5 )
+                    z( ipn4-j4-5 ) = temp
+                    temp = z( j4 )
+                    z( j4 ) = z( ipn4-j4-4 )
+                    z( ipn4-j4-4 ) = temp
+                 end do
+                 if( n0-i0<=4 ) then
+                    z( 4*n0+pp-1 ) = z( 4*i0+pp-1 )
+                    z( 4*n0-pp ) = z( 4*i0-pp )
+                 end if
+                 dmin2 = min( dmin2, z( 4*n0+pp-1 ) )
+                 z( 4*n0+pp-1 ) = min( z( 4*n0+pp-1 ), z( 4*i0+pp-1 ),z( 4*i0+pp+3 ) )
+                 z( 4*n0-pp ) = min( z( 4*n0-pp ), z( 4*i0-pp ),z( 4*i0-pp+4 ) )
+                 qmax = max( qmax, z( 4*i0+pp-3 ), z( 4*i0+pp+1 ) )
+                 dmin = -zero
+              end if
+           end if
+           ! choose a shift.
+           call stdlib_qlasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn, dn1,dn2, tau, ttype, &
+                     g )
+           ! call dqds until dmin > 0.
+           70 continue
+           call stdlib_qlasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2, dn,dn1, dn2, ieee, &
+                     eps )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           ! check status.
+           if( dmin>=zero .and. dmin1>=zero ) then
+              ! success.
+              go to 90
+           else if( dmin<zero .and. dmin1>zero .and.z( 4*( n0-1 )-pp )<tol*( sigma+dn1 ) .and.abs(&
+                      dn )<tol*sigma ) then
+              ! convergence hidden by negative dn.
+              z( 4*( n0-1 )-pp+2 ) = zero
+              dmin = zero
+              go to 90
+           else if( dmin<zero ) then
+              ! tau too big. select new tau and try again.
+              nfail = nfail + 1
+              if( ttype<-22 ) then
+                 ! failed twice. play it safe.
+                 tau = zero
+              else if( dmin1>zero ) then
+                 ! late failure. gives excellent shift.
+                 tau = ( tau+dmin )*( one-two*eps )
+                 ttype = ttype - 11
+              else
+                 ! early failure. divide by 4.
+                 tau = qurtr*tau
+                 ttype = ttype - 12
+              end if
+              go to 70
+           else if( stdlib_qisnan( dmin ) ) then
+              ! nan.
+              if( tau==zero ) then
+                 go to 80
+              else
+                 tau = zero
+                 go to 70
+              end if
+           else
+              ! possible underflow. play it safe.
+              go to 80
+           end if
+           ! risk of underflow.
+           80 continue
+           call stdlib_qlasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn, dn1, dn2 )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           tau = zero
+           90 continue
+           if( tau<sigma ) then
+              desig = desig + tau
+              t = sigma + desig
+              desig = desig - ( t-sigma )
+           else
+              t = sigma + tau
+              desig = sigma - ( t-tau ) + desig
+           end if
+           sigma = t
+           return
+     end subroutine stdlib_qlasq3
+
+     !> DLASQ4: computes an approximation TAU to the smallest eigenvalue
+     !> using values of d from the previous transform.
+
+     pure subroutine stdlib_qlasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn,dn1, dn2, tau, &
+               ttype, g )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, n0in, pp
+           integer(ilp), intent(out) :: ttype
+           real(qp), intent(in) :: dmin, dmin1, dmin2, dn, dn1, dn2
+           real(qp), intent(inout) :: g
+           real(qp), intent(out) :: tau
+           ! Array Arguments 
+           real(qp), intent(in) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: cnst1 = 0.5630_qp
+           real(qp), parameter :: cnst2 = 1.010_qp
+           real(qp), parameter :: cnst3 = 1.050_qp
+           real(qp), parameter :: qurtr = 0.250_qp
+           real(qp), parameter :: third = 0.3330_qp
+           real(qp), parameter :: hundrd = 100.0_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i4, nn, np
+           real(qp) :: a2, b1, b2, gam, gap1, gap2, s
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! a negative dmin forces the shift to take that absolute value
+           ! ttype records the type of shift.
+           if( dmin<=zero ) then
+              tau = -dmin
+              ttype = -1
+              return
+           end if
+           nn = 4*n0 + pp
+           if( n0in==n0 ) then
+              ! no eigenvalues deflated.
+              if( dmin==dn .or. dmin==dn1 ) then
+                 b1 = sqrt( z( nn-3 ) )*sqrt( z( nn-5 ) )
+                 b2 = sqrt( z( nn-7 ) )*sqrt( z( nn-9 ) )
+                 a2 = z( nn-7 ) + z( nn-5 )
+                 ! cases 2 and 3.
+                 if( dmin==dn .and. dmin1==dn1 ) then
+                    gap2 = dmin2 - a2 - dmin2*qurtr
+                    if( gap2>zero .and. gap2>b2 ) then
+                       gap1 = a2 - dn - ( b2 / gap2 )*b2
+                    else
+                       gap1 = a2 - dn - ( b1+b2 )
+                    end if
+                    if( gap1>zero .and. gap1>b1 ) then
+                       s = max( dn-( b1 / gap1 )*b1, half*dmin )
+                       ttype = -2
+                    else
+                       s = zero
+                       if( dn>b1 )s = dn - b1
+                       if( a2>( b1+b2 ) )s = min( s, a2-( b1+b2 ) )
+                       s = max( s, third*dmin )
+                       ttype = -3
+                    end if
+                 else
+                    ! case 4.
+                    ttype = -4
+                    s = qurtr*dmin
+                    if( dmin==dn ) then
+                       gam = dn
+                       a2 = zero
+                       if( z( nn-5 ) > z( nn-7 ) )return
+                       b2 = z( nn-5 ) / z( nn-7 )
+                       np = nn - 9
+                    else
+                       np = nn - 2*pp
+                       gam = dn1
+                       if( z( np-4 ) > z( np-2 ) )return
+                       a2 = z( np-4 ) / z( np-2 )
+                       if( z( nn-9 ) > z( nn-11 ) )return
+                       b2 = z( nn-9 ) / z( nn-11 )
+                       np = nn - 13
+                    end if
+                    ! approximate contribution to norm squared from i < nn-1.
+                    a2 = a2 + b2
+                    do i4 = np, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 20
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 20
+                    end do
+                    20 continue
+                    a2 = cnst3*a2
+                    ! rayleigh quotient residual bound.
+                    if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+                 end if
+              else if( dmin==dn2 ) then
+                 ! case 5.
+                 ttype = -5
+                 s = qurtr*dmin
+                 ! compute contribution to norm squared from i > nn-2.
+                 np = nn - 2*pp
+                 b1 = z( np-2 )
+                 b2 = z( np-6 )
+                 gam = dn2
+                 if( z( np-8 )>b2 .or. z( np-4 )>b1 )return
+                 a2 = ( z( np-8 ) / b2 )*( one+z( np-4 ) / b1 )
+                 ! approximate contribution to norm squared from i < nn-2.
+                 if( n0-i0>2 ) then
+                    b2 = z( nn-13 ) / z( nn-15 )
+                    a2 = a2 + b2
+                    do i4 = nn - 17, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 40
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 40
+                    end do
+                    40 continue
+                    a2 = cnst3*a2
+                 end if
+                 if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+              else
+                 ! case 6, no information to guide us.
+                 if( ttype==-6 ) then
+                    g = g + third*( one-g )
+                 else if( ttype==-18 ) then
+                    g = qurtr*third
+                 else
+                    g = qurtr
+                 end if
+                 s = g*dmin
+                 ttype = -6
+              end if
+           else if( n0in==( n0+1 ) ) then
+              ! one eigenvalue just deflated. use dmin1, dn1 for dmin and dn.
+              if( dmin1==dn1 .and. dmin2==dn2 ) then
+                 ! cases 7 and 8.
+                 ttype = -7
+                 s = third*dmin1
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 60
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    a2 = b1
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*max( b1, a2 )<b2 )go to 60
+                 end do
+                 60 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin1 / ( one+b2**2 )
+                 gap2 = half*dmin2 - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                    ttype = -8
+                 end if
+              else
+                 ! case 9.
+                 s = qurtr*dmin1
+                 if( dmin1==dn1 )s = half*dmin1
+                 ttype = -9
+              end if
+           else if( n0in==( n0+2 ) ) then
+              ! two eigenvalues deflated. use dmin2, dn2 for dmin and dn.
+              ! cases 10 and 11.
+              if( dmin2==dn2 .and. two*z( nn-5 )<z( nn-7 ) ) then
+                 ttype = -10
+                 s = third*dmin2
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 80
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*b1<b2 )go to 80
+                 end do
+                 80 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin2 / ( one+b2**2 )
+                 gap2 = z( nn-7 ) + z( nn-9 ) -sqrt( z( nn-11 ) )*sqrt( z( nn-9 ) ) - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                 end if
+              else
+                 s = qurtr*dmin2
+                 ttype = -11
+              end if
+           else if( n0in>( n0+2 ) ) then
+              ! case 12, more than two eigenvalues deflated. no information.
+              s = zero
+              ttype = -12
+           end if
+           tau = s
+           return
+     end subroutine stdlib_qlasq4
+
+     !> DLASQ5: computes one dqds transform in ping-pong form, one
+     !> version for IEEE machines another for non IEEE machines.
+
+     pure subroutine stdlib_qlasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2,dn, dnm1, dnm2, &
+               ieee, eps )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(qp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           real(qp), intent(inout) :: tau
+           real(qp), intent(in) :: sigma, eps
+           ! Array Arguments 
+           real(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(qp) :: d, emin, temp, dthresh
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           dthresh = eps*(sigma+tau)
+           if( tau<dthresh*half ) tau = zero
+           if( tau/=zero ) then
+           j4 = 4*i0 + pp - 3
+           emin = z( j4+4 )
+           d = z( j4 ) - tau
+           dmin = d
+           dmin1 = -z( j4 )
+           if( ieee ) then
+              ! code for ieee arithmetic.
+              if( pp==0 ) then
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-2 ) = d + z( j4-1 )
+                    temp = z( j4+1 ) / z( j4-2 )
+                    d = d*temp - tau
+                    dmin = min( dmin, d )
+                    z( j4 ) = z( j4-1 )*temp
+                    emin = min( z( j4 ), emin )
+                 end do
+              else
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-3 ) = d + z( j4 )
+                    temp = z( j4+2 ) / z( j4-3 )
+                    d = d*temp - tau
+                    dmin = min( dmin, d )
+                    z( j4-1 ) = z( j4 )*temp
+                    emin = min( z( j4-1 ), emin )
+                 end do
+              end if
+              ! unroll last two steps.
+              dnm2 = d
+              dmin2 = dmin
+              j4 = 4*( n0-2 ) - pp
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm2 + z( j4p2 )
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+              dmin = min( dmin, dnm1 )
+              dmin1 = dmin
+              j4 = j4 + 4
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm1 + z( j4p2 )
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+              dmin = min( dmin, dn )
+           else
+              ! code for non ieee arithmetic.
+              if( pp==0 ) then
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-2 ) = d + z( j4-1 )
+                    if( d<zero ) then
+                       return
+                    else
+                       z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                       d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                    end if
+                    dmin = min( dmin, d )
+                    emin = min( emin, z( j4 ) )
+                 end do
+              else
+                 do j4 = 4*i0, 4*( n0-3 ), 4
+                    z( j4-3 ) = d + z( j4 )
+                    if( d<zero ) then
+                       return
+                    else
+                       z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                       d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                    end if
+                    dmin = min( dmin, d )
+                    emin = min( emin, z( j4-1 ) )
+                 end do
+              end if
+              ! unroll last two steps.
+              dnm2 = d
+              dmin2 = dmin
+              j4 = 4*( n0-2 ) - pp
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm2 + z( j4p2 )
+              if( dnm2<zero ) then
+                 return
+              else
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+              end if
+              dmin = min( dmin, dnm1 )
+              dmin1 = dmin
+              j4 = j4 + 4
+              j4p2 = j4 + 2*pp - 1
+              z( j4-2 ) = dnm1 + z( j4p2 )
+              if( dnm1<zero ) then
+                 return
+              else
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+              end if
+              dmin = min( dmin, dn )
+           end if
+           else
+           ! this is the version that sets d's to zero if they are small enough
+              j4 = 4*i0 + pp - 3
+              emin = z( j4+4 )
+              d = z( j4 ) - tau
+              dmin = d
+              dmin1 = -z( j4 )
+              if( ieee ) then
+           ! code for ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       temp = z( j4+1 ) / z( j4-2 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4 ) = z( j4-1 )*temp
+                       emin = min( z( j4 ), emin )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       temp = z( j4+2 ) / z( j4-3 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4-1 ) = z( j4 )*temp
+                       emin = min( z( j4-1 ), emin )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dn )
+              else
+           ! code for non ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                          d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                       end if
+                       if( d<dthresh) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4 ) )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                          d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                       end if
+                       if( d<dthresh) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4-1 ) )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 if( dnm2<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 if( dnm1<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dn )
+              end if
+           end if
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_qlasq5
+
+     !> DLASQ6: computes one dqd (shift equal to zero) transform in
+     !> ping-pong form, with protection against underflow and overflow.
+
+     pure subroutine stdlib_qlasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn,dnm1, dnm2 )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(qp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           ! Array Arguments 
+           real(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(qp) :: d, emin, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           j4 = 4*i0 + pp - 3
+           emin = z( j4+4 )
+           d = z( j4 )
+           dmin = d
+           if( pp==0 ) then
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-2 ) = d + z( j4-1 )
+                 if( z( j4-2 )==zero ) then
+                    z( j4 ) = zero
+                    d = z( j4+1 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+1 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4+1 ) ) &
+                           then
+                    temp = z( j4+1 ) / z( j4-2 )
+                    z( j4 ) = z( j4-1 )*temp
+                    d = d*temp
+                 else
+                    z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                    d = z( j4+1 )*( d / z( j4-2 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4 ) )
+              end do
+           else
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-3 ) = d + z( j4 )
+                 if( z( j4-3 )==zero ) then
+                    z( j4-1 ) = zero
+                    d = z( j4+2 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+2 )<z( j4-3 ) .and.safmin*z( j4-3 )<z( j4+2 ) ) &
+                           then
+                    temp = z( j4+2 ) / z( j4-3 )
+                    z( j4-1 ) = z( j4 )*temp
+                    d = d*temp
+                 else
+                    z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                    d = z( j4+2 )*( d / z( j4-3 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4-1 ) )
+              end do
+           end if
+           ! unroll last two steps.
+           dnm2 = d
+           dmin2 = dmin
+           j4 = 4*( n0-2 ) - pp
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm2 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dnm1 = z( j4p2+2 )
+              dmin = dnm1
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dnm1 = dnm2*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dnm1 )
+           dmin1 = dmin
+           j4 = j4 + 4
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm1 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dn = z( j4p2+2 )
+              dmin = dn
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dn = dnm1*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dn )
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_qlasq6
+
+     !> DLASR: applies a sequence of plane rotations to a real matrix A,
+     !> from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_qlasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: c(*), s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(qp) :: ctemp, stemp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qlasr
+
+     !> Sort the numbers in D in increasing order (if ID = 'I') or
+     !> in decreasing order (if ID = 'D' ).
+     !> Use Quick Sort, reverting to Insertion sort on arrays of
+     !> size <= 20. Dimension of STACK limits N to about 2**32.
+
+     pure subroutine stdlib_qlasrt( id, n, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: id
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: select = 20
+           
+           ! Local Scalars 
+           integer(ilp) :: dir, endd, i, j, start, stkpnt
+           real(qp) :: d1, d2, d3, dmnmx, tmp
+           ! Local Arrays 
+           integer(ilp) :: stack(2,32)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           dir = -1
+           if( stdlib_lsame( id, 'D' ) ) then
+              dir = 0
+           else if( stdlib_lsame( id, 'I' ) ) then
+              dir = 1
+           end if
+           if( dir==-1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLASRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           stkpnt = 1
+           stack( 1, 1 ) = 1
+           stack( 2, 1 ) = n
+           10 continue
+           start = stack( 1, stkpnt )
+           endd = stack( 2, stkpnt )
+           stkpnt = stkpnt - 1
+           if( endd-start<=select .and. endd-start>0 ) then
+              ! do insertion sort on d( start:endd )
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 loop_30: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )>d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_30
+                       end if
+                    end do
+                 end do loop_30
+              else
+                 ! sort into increasing order
+                 loop_50: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )<d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_50
+                       end if
+                    end do
+                 end do loop_50
+              end if
+           else if( endd-start>select ) then
+              ! partition d( start:endd ) and stack parts, largest one first
+              ! choose partition entry as median of 3
+              d1 = d( start )
+              d2 = d( endd )
+              i = ( start+endd ) / 2
+              d3 = d( i )
+              if( d1<d2 ) then
+                 if( d3<d1 ) then
+                    dmnmx = d1
+                 else if( d3<d2 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d2
+                 end if
+              else
+                 if( d3<d2 ) then
+                    dmnmx = d2
+                 else if( d3<d1 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d1
+                 end if
+              end if
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 i = start - 1
+                 j = endd + 1
+                 60 continue
+                 70 continue
+                 j = j - 1
+                 if( d( j )<dmnmx )go to 70
+                 80 continue
+                 i = i + 1
+                 if( d( i )>dmnmx )go to 80
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 60
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              else
+                 ! sort into increasing order
+                 i = start - 1
+                 j = endd + 1
+                 90 continue
+                 100 continue
+                 j = j - 1
+                 if( d( j )>dmnmx )go to 100
+                 110 continue
+                 i = i + 1
+                 if( d( i )<dmnmx )go to 110
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 90
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              end if
+           end if
+           if( stkpnt>0 )go to 10
+           return
+     end subroutine stdlib_qlasrt
+
+     !> !
+     !>
+     !> DLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_qlassq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(qp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        real(qp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(qp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_qlassq
+
+     !> DLASV2: computes the singular value decomposition of a 2-by-2
+     !> triangular matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, abs(SSMAX) is the larger singular value, abs(SSMIN) is the
+     !> smaller singular value, and (CSL,SNL) and (CSR,SNR) are the left and
+     !> right singular vectors for abs(SSMAX), giving the decomposition
+     !> [ CSL  SNL ] [  F   G  ] [ CSR -SNR ]  =  [ SSMAX   0   ]
+     !> [-SNL  CSL ] [  0   H  ] [ SNR  CSR ]     [  0    SSMIN ].
+
+     pure subroutine stdlib_qlasv2( f, g, h, ssmin, ssmax, snr, csr, snl, csl )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(out) :: csl, csr, snl, snr, ssmax, ssmin
+           real(qp), intent(in) :: f, g, h
+       ! =====================================================================
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: gasmal, swap
+           integer(ilp) :: pmax
+           real(qp) :: a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m, mm, r, s, slt, srt, t, temp, &
+                     tsign, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ft = f
+           fa = abs( ft )
+           ht = h
+           ha = abs( h )
+           ! pmax points to the maximum absolute element of matrix
+             ! pmax = 1 if f largest in absolute values
+             ! pmax = 2 if g largest in absolute values
+             ! pmax = 3 if h largest in absolute values
+           pmax = 1
+           swap = ( ha>fa )
+           if( swap ) then
+              pmax = 3
+              temp = ft
+              ft = ht
+              ht = temp
+              temp = fa
+              fa = ha
+              ha = temp
+              ! now fa .ge. ha
+           end if
+           gt = g
+           ga = abs( gt )
+           if( ga==zero ) then
+              ! diagonal matrix
+              ssmin = ha
+              ssmax = fa
+              clt = one
+              crt = one
+              slt = zero
+              srt = zero
+           else
+              gasmal = .true.
+              if( ga>fa ) then
+                 pmax = 2
+                 if( ( fa / ga )<stdlib_qlamch( 'EPS' ) ) then
+                    ! case of very large ga
+                    gasmal = .false.
+                    ssmax = ga
+                    if( ha>one ) then
+                       ssmin = fa / ( ga / ha )
+                    else
+                       ssmin = ( fa / ga )*ha
+                    end if
+                    clt = one
+                    slt = ht / gt
+                    srt = one
+                    crt = ft / gt
+                 end if
+              end if
+              if( gasmal ) then
+                 ! normal case
+                 d = fa - ha
+                 if( d==fa ) then
+                    ! copes with infinite f or h
+                    l = one
+                 else
+                    l = d / fa
+                 end if
+                 ! note that 0 .le. l .le. 1
+                 m = gt / ft
+                 ! note that abs(m) .le. 1/macheps
+                 t = two - l
+                 ! note that t .ge. 1
+                 mm = m*m
+                 tt = t*t
+                 s = sqrt( tt+mm )
+                 ! note that 1 .le. s .le. 1 + 1/macheps
+                 if( l==zero ) then
+                    r = abs( m )
+                 else
+                    r = sqrt( l*l+mm )
+                 end if
+                 ! note that 0 .le. r .le. 1 + 1/macheps
+                 a = half*( s+r )
+                 ! note that 1 .le. a .le. 1 + abs(m)
+                 ssmin = ha / a
+                 ssmax = fa*a
+                 if( mm==zero ) then
+                    ! note that m is very tiny
+                    if( l==zero ) then
+                       t = sign( two, ft )*sign( one, gt )
+                    else
+                       t = gt / sign( d, ft ) + m / t
+                    end if
+                 else
+                    t = ( m / ( s+t )+m / ( r+l ) )*( one+a )
+                 end if
+                 l = sqrt( t*t+four )
+                 crt = two / l
+                 srt = t / l
+                 clt = ( crt+srt*m ) / a
+                 slt = ( ht / ft )*srt / a
+              end if
+           end if
+           if( swap ) then
+              csl = srt
+              snl = crt
+              csr = slt
+              snr = clt
+           else
+              csl = clt
+              snl = slt
+              csr = crt
+              snr = srt
+           end if
+           ! correct signs of ssmax and ssmin
+           if( pmax==1 )tsign = sign( one, csr )*sign( one, csl )*sign( one, f )
+           if( pmax==2 )tsign = sign( one, snr )*sign( one, csl )*sign( one, g )
+           if( pmax==3 )tsign = sign( one, snr )*sign( one, snl )*sign( one, h )
+           ssmax = sign( ssmax, tsign )
+           ssmin = sign( ssmin, tsign*sign( one, f )*sign( one, h ) )
+           return
+     end subroutine stdlib_qlasv2
+
+     !> DLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a real M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_qlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_qgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_qgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_qtplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1, ctr * m + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_qtplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1, ctr * m + 1), &
+                       ldt,work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_qlaswlq
+
+     !> DLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_qlaswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           real(qp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_qlaswp
+
+     !> DLASY2: solves for the N1 by N2 matrix X, 1 <= N1,N2 <= 2, in
+     !> op(TL)*X + ISGN*X*op(TR) = SCALE*B,
+     !> where TL is N1 by N1, TR is N2 by N2, B is N1 by N2, and ISGN = 1 or
+     !> -1.  op(T) = T or T**T, where T**T denotes the transpose of T.
+
+     pure subroutine stdlib_qlasy2( ltranl, ltranr, isgn, n1, n2, tl, ldtl, tr,ldtr, b, ldb, &
+               scale, x, ldx, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltranl, ltranr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, ldb, ldtl, ldtr, ldx, n1, n2
+           real(qp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(qp), intent(in) :: b(ldb,*), tl(ldtl,*), tr(ldtr,*)
+           real(qp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bswap, xswap
+           integer(ilp) :: i, ip, ipiv, ipsv, j, jp, jpsv, k
+           real(qp) :: bet, eps, gam, l21, sgn, smin, smlnum, tau1, temp, u11, u12, u22, &
+                     xmax
+           ! Local Arrays 
+           logical(lk) :: bswpiv(4), xswpiv(4)
+           integer(ilp) :: jpiv(4), locl21(4), locu12(4), locu22(4)
+           real(qp) :: btmp(4), t16(4,4), tmp(4), x2(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Data Statements 
+           locu12 = [3,4,1,2]
+           locl21 = [2,1,4,3]
+           locu22 = [4,3,2,1]
+           xswpiv = [.false.,.false.,.true.,.true.]
+           bswpiv = [.false.,.true.,.false.,.true.]
+           ! Executable Statements 
+           ! do not check the input parameters for errors
+           info = 0
+           ! quick return if possible
+           if( n1==0 .or. n2==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           sgn = isgn
+           k = n1 + n1 + n2 - 2
+           go to ( 10, 20, 30, 50 )k
+           ! 1 by 1: tl11*x + sgn*x*tr11 = b11
+           10 continue
+           tau1 = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           bet = abs( tau1 )
+           if( bet<=smlnum ) then
+              tau1 = smlnum
+              bet = smlnum
+              info = 1
+           end if
+           scale = one
+           gam = abs( b( 1, 1 ) )
+           if( smlnum*gam>bet )scale = one / gam
+           x( 1, 1 ) = ( b( 1, 1 )*scale ) / tau1
+           xnorm = abs( x( 1, 1 ) )
+           return
+           ! 1 by 2:
+           ! tl11*[x11 x12] + isgn*[x11 x12]*op[tr11 tr12]  = [b11 b12]
+                                             ! [tr21 tr22]
+                                             20 continue
+           smin = max( eps*max( abs( tl( 1, 1 ) ), abs( tr( 1, 1 ) ),abs( tr( 1, 2 ) ), abs( tr( &
+                     2, 1 ) ), abs( tr( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           if( ltranr ) then
+              tmp( 2 ) = sgn*tr( 2, 1 )
+              tmp( 3 ) = sgn*tr( 1, 2 )
+           else
+              tmp( 2 ) = sgn*tr( 1, 2 )
+              tmp( 3 ) = sgn*tr( 2, 1 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 1, 2 )
+           go to 40
+           ! 2 by 1:
+                ! op[tl11 tl12]*[x11] + isgn* [x11]*tr11  = [b11]
+                  ! [tl21 tl22] [x21]         [x21]         [b21]
+                  30 continue
+           smin = max( eps*max( abs( tr( 1, 1 ) ), abs( tl( 1, 1 ) ),abs( tl( 1, 2 ) ), abs( tl( &
+                     2, 1 ) ), abs( tl( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           if( ltranl ) then
+              tmp( 2 ) = tl( 1, 2 )
+              tmp( 3 ) = tl( 2, 1 )
+           else
+              tmp( 2 ) = tl( 2, 1 )
+              tmp( 3 ) = tl( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           40 continue
+           ! solve 2 by 2 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           ipiv = stdlib_iqamax( 4, tmp, 1 )
+           u11 = tmp( ipiv )
+           if( abs( u11 )<=smin ) then
+              info = 1
+              u11 = smin
+           end if
+           u12 = tmp( locu12( ipiv ) )
+           l21 = tmp( locl21( ipiv ) ) / u11
+           u22 = tmp( locu22( ipiv ) ) - u12*l21
+           xswap = xswpiv( ipiv )
+           bswap = bswpiv( ipiv )
+           if( abs( u22 )<=smin ) then
+              info = 1
+              u22 = smin
+           end if
+           if( bswap ) then
+              temp = btmp( 2 )
+              btmp( 2 ) = btmp( 1 ) - l21*temp
+              btmp( 1 ) = temp
+           else
+              btmp( 2 ) = btmp( 2 ) - l21*btmp( 1 )
+           end if
+           scale = one
+           if( ( two*smlnum )*abs( btmp( 2 ) )>abs( u22 ) .or.( two*smlnum )*abs( btmp( 1 ) )>abs(&
+                      u11 ) ) then
+              scale = half / max( abs( btmp( 1 ) ), abs( btmp( 2 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+           end if
+           x2( 2 ) = btmp( 2 ) / u22
+           x2( 1 ) = btmp( 1 ) / u11 - ( u12 / u11 )*x2( 2 )
+           if( xswap ) then
+              temp = x2( 2 )
+              x2( 2 ) = x2( 1 )
+              x2( 1 ) = temp
+           end if
+           x( 1, 1 ) = x2( 1 )
+           if( n1==1 ) then
+              x( 1, 2 ) = x2( 2 )
+              xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+           else
+              x( 2, 1 ) = x2( 2 )
+              xnorm = max( abs( x( 1, 1 ) ), abs( x( 2, 1 ) ) )
+           end if
+           return
+           ! 2 by 2:
+           ! op[tl11 tl12]*[x11 x12] +isgn* [x11 x12]*op[tr11 tr12] = [b11 b12]
+             ! [tl21 tl22] [x21 x22]        [x21 x22]   [tr21 tr22]   [b21 b22]
+           ! solve equivalent 4 by 4 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           50 continue
+           smin = max( abs( tr( 1, 1 ) ), abs( tr( 1, 2 ) ),abs( tr( 2, 1 ) ), abs( tr( 2, 2 ) ) )
+                     
+           smin = max( smin, abs( tl( 1, 1 ) ), abs( tl( 1, 2 ) ),abs( tl( 2, 1 ) ), abs( tl( 2, &
+                     2 ) ) )
+           smin = max( eps*smin, smlnum )
+           btmp( 1 ) = zero
+           call stdlib_qcopy( 16, btmp, 0, t16, 1 )
+           t16( 1, 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           t16( 2, 2 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           t16( 3, 3 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           t16( 4, 4 ) = tl( 2, 2 ) + sgn*tr( 2, 2 )
+           if( ltranl ) then
+              t16( 1, 2 ) = tl( 2, 1 )
+              t16( 2, 1 ) = tl( 1, 2 )
+              t16( 3, 4 ) = tl( 2, 1 )
+              t16( 4, 3 ) = tl( 1, 2 )
+           else
+              t16( 1, 2 ) = tl( 1, 2 )
+              t16( 2, 1 ) = tl( 2, 1 )
+              t16( 3, 4 ) = tl( 1, 2 )
+              t16( 4, 3 ) = tl( 2, 1 )
+           end if
+           if( ltranr ) then
+              t16( 1, 3 ) = sgn*tr( 1, 2 )
+              t16( 2, 4 ) = sgn*tr( 1, 2 )
+              t16( 3, 1 ) = sgn*tr( 2, 1 )
+              t16( 4, 2 ) = sgn*tr( 2, 1 )
+           else
+              t16( 1, 3 ) = sgn*tr( 2, 1 )
+              t16( 2, 4 ) = sgn*tr( 2, 1 )
+              t16( 3, 1 ) = sgn*tr( 1, 2 )
+              t16( 4, 2 ) = sgn*tr( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           btmp( 3 ) = b( 1, 2 )
+           btmp( 4 ) = b( 2, 2 )
+           ! perform elimination
+           loop_100: do i = 1, 3
+              xmax = zero
+              do ip = i, 4
+                 do jp = i, 4
+                    if( abs( t16( ip, jp ) )>=xmax ) then
+                       xmax = abs( t16( ip, jp ) )
+                       ipsv = ip
+                       jpsv = jp
+                    end if
+                 end do
+              end do
+              if( ipsv/=i ) then
+                 call stdlib_qswap( 4, t16( ipsv, 1 ), 4, t16( i, 1 ), 4 )
+                 temp = btmp( i )
+                 btmp( i ) = btmp( ipsv )
+                 btmp( ipsv ) = temp
+              end if
+              if( jpsv/=i )call stdlib_qswap( 4, t16( 1, jpsv ), 1, t16( 1, i ), 1 )
+              jpiv( i ) = jpsv
+              if( abs( t16( i, i ) )<smin ) then
+                 info = 1
+                 t16( i, i ) = smin
+              end if
+              do j = i + 1, 4
+                 t16( j, i ) = t16( j, i ) / t16( i, i )
+                 btmp( j ) = btmp( j ) - t16( j, i )*btmp( i )
+                 do k = i + 1, 4
+                    t16( j, k ) = t16( j, k ) - t16( j, i )*t16( i, k )
+                 end do
+              end do
+           end do loop_100
+           if( abs( t16( 4, 4 ) )<smin ) then
+              info = 1
+              t16( 4, 4 ) = smin
+           end if
+           scale = one
+           if( ( eight*smlnum )*abs( btmp( 1 ) )>abs( t16( 1, 1 ) ) .or.( eight*smlnum )*abs( &
+           btmp( 2 ) )>abs( t16( 2, 2 ) ) .or.( eight*smlnum )*abs( btmp( 3 ) )>abs( t16( 3, 3 ) )&
+                      .or.( eight*smlnum )*abs( btmp( 4 ) )>abs( t16( 4, 4 ) ) ) then
+              scale = ( one / eight ) / max( abs( btmp( 1 ) ),abs( btmp( 2 ) ), abs( btmp( 3 ) ), &
+                        abs( btmp( 4 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+              btmp( 3 ) = btmp( 3 )*scale
+              btmp( 4 ) = btmp( 4 )*scale
+           end if
+           do i = 1, 4
+              k = 5 - i
+              temp = one / t16( k, k )
+              tmp( k ) = btmp( k )*temp
+              do j = k + 1, 4
+                 tmp( k ) = tmp( k ) - ( temp*t16( k, j ) )*tmp( j )
+              end do
+           end do
+           do i = 1, 3
+              if( jpiv( 4-i )/=4-i ) then
+                 temp = tmp( 4-i )
+                 tmp( 4-i ) = tmp( jpiv( 4-i ) )
+                 tmp( jpiv( 4-i ) ) = temp
+              end if
+           end do
+           x( 1, 1 ) = tmp( 1 )
+           x( 2, 1 ) = tmp( 2 )
+           x( 1, 2 ) = tmp( 3 )
+           x( 2, 2 ) = tmp( 4 )
+           xnorm = max( abs( tmp( 1 ) )+abs( tmp( 3 ) ),abs( tmp( 2 ) )+abs( tmp( 4 ) ) )
+           return
+     end subroutine stdlib_qlasy2
+
+     !> DLASYF: computes a partial factorization of a real symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF is an auxiliary routine called by DSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_qlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(qp) :: absakk, alpha, colmax, d11, d21, d22, r1, rowmax, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_qcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ), lda,w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_qcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_qcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( &
+                              imax, kw+1 ), ldw, one,w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_iqamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = abs( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iqamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_qcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_qcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_qcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_qswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_qswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_qcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = one / a( k, k )
+                    call stdlib_qscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k, -one,a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw, one,a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_qswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_qcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ), lda,w( k, 1 ), ldw, &
+                        one, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_qcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_qcopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, one, w( k, k+1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_iqamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = abs( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iqamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_qcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_qcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_qcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_qswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_qswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_qcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = one / a( k, k )
+                       call stdlib_qscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_qswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_qlasyf
+
+     !> DLATRF_AA factorizes a panel of a real symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_qlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), h(ldh,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           real(qp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_qsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_qgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_qcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_qaxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_qaxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_iqamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_qswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_qswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_qswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_qswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_qcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=zero ) then
+                       alpha = one / a( k, j+1 )
+                       call stdlib_qcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_qscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_qlaset( 'FULL', 1, m-j-1, zero, zero,a( k, j+2 ), lda)
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_qsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_qgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( j, 1 ), lda,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_qcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_qaxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_qaxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_iqamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_qswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_qswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_qswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_qswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_qcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=zero ) then
+                       alpha = one / a( j+1, k )
+                       call stdlib_qcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_qscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_qlaset( 'FULL', m-j-1, 1, zero, zero,a( j+2, k ), lda )
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_qlasyf_aa
+
+     !> DLASYF_RK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF_RK is an auxiliary routine called by DSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_qlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, dtemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_qcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_qcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_qcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_qcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iqamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iqamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = abs( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_qcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_qcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_qcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_qcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_qswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_qswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_qcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_qcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_qswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_qswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_qcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_qscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = zero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, one, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_qcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_qcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_qcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_qcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iqamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iqamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = abs( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_qcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_qcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_qcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_qcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_qswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_qswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_qcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_qcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_qswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_qswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_qcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_qscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = zero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ),ldw, one, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_qlasyf_rk
+
+     !> DLASYF_ROOK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> DLASYF_ROOK is an auxiliary routine called by DSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_qlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, dtemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_qcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_qcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_qcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_qcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_qgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iqamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iqamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = abs( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_qcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_qcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_qcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_qcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_qswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_qswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_qcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_qcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_qswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_qswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_qcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_qscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,one, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_qswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_qswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_qcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_qcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_qcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_qcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_qgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iqamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iqamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = abs( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_qcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_qcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_qcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_qcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_qswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_qswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_qcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_qcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_qswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_qswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_qcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_qscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_qgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_qswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_qswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_qlasyf_rook
+
+     !> DLAT2S: converts a DOUBLE PRECISION triangular matrix, SA, to a SINGLE
+     !> PRECISION triangular matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> DLAS2S checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_qlat2s( uplo, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, n
+           ! Array Arguments 
+           real(dp), intent(out) :: sa(ldsa,*)
+           real(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: rmax
+           logical(lk) :: upper
+           ! Executable Statements 
+           rmax = stdlib_dlamch( 'O' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) )then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    if( ( a( i, j )<-rmax ) .or. ( a( i, j )>rmax ) )then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           50 continue
+           return
+     end subroutine stdlib_qlat2s
+
+     !> DLATBS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine DTBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_qlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(qp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_qasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_qasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_qtbsv can be used.
+           j = stdlib_iqamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_qtbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_qscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_qscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_qaxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_iqamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_qaxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_iqamax( n-j, x( j+1 ), 1 )
+                       xmax = abs( x( i ) )
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_qdot to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          sumj = stdlib_qdot( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>0 )sumj = stdlib_qdot( jlen, ab( 2, j ), 1, x( j+1 ), 1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_qscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_qlatbs
+
+     !> DLATDF: uses the LU factorization of the n-by-n matrix Z computed by
+     !> DGETC2 and computes a contribution to the reciprocal Dif-estimate
+     !> by solving Z * x = b for x, and choosing the r.h.s. b such that
+     !> the norm of x is as large as possible. On entry RHS = b holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by DGETC2 has the form Z = P*L*U*Q,
+     !> where P and Q are permutation matrices. L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_qlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(qp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(qp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 8
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(qp) :: bm, bp, pmone, sminu, splus, temp
+           ! Local Arrays 
+           integer(ilp) :: iwork(maxdim)
+           real(qp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_qlaswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -one
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + one
+                 bm = rhs( j ) - one
+                 splus = one
+                 ! look-ahead for l-part rhs(1:n-1) = + or -1, splus and
+                 ! smin computed more efficiently than in bsolve [1].
+                 splus = splus + stdlib_qdot( n-j, z( j+1, j ), 1, z( j+1, j ), 1 )
+                 sminu = stdlib_qdot( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 )
+                 splus = splus*rhs( j )
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens
+                    ! we choose -1, thereafter +1. this is a simple way to
+                    ! get good estimates of matrices like byers well-known
+                    ! example (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = one
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_qaxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u-part, look-ahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_qcopy( n-1, rhs, 1, xp, 1 )
+              xp( n ) = rhs( n ) + one
+              rhs( n ) = rhs( n ) - one
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = one / z( i, i )
+                 xp( i ) = xp( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    xp( i ) = xp( i ) - xp( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( xp( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_qcopy( n, xp, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_qlaswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_qlassq( n, rhs, 1, rdscal, rdsum )
+           else
+              ! ijob = 2, compute approximate nullvector xm of z
+              call stdlib_qgecon( 'I', n, z, ldz, one, temp, work, iwork, info )
+              call stdlib_qcopy( n, work( n+1 ), 1, xm, 1 )
+              ! compute rhs
+              call stdlib_qlaswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+              temp = one / sqrt( stdlib_qdot( n, xm, 1, xm, 1 ) )
+              call stdlib_qscal( n, temp, xm, 1 )
+              call stdlib_qcopy( n, xm, 1, xp, 1 )
+              call stdlib_qaxpy( n, one, rhs, 1, xp, 1 )
+              call stdlib_qaxpy( n, -one, xm, 1, rhs, 1 )
+              call stdlib_qgesc2( n, z, ldz, rhs, ipiv, jpiv, temp )
+              call stdlib_qgesc2( n, z, ldz, xp, ipiv, jpiv, temp )
+              if( stdlib_qasum( n, xp, 1 )>stdlib_qasum( n, rhs, 1 ) )call stdlib_qcopy( n, xp, 1,&
+                         rhs, 1 )
+              ! compute the sum of squares
+              call stdlib_qlassq( n, rhs, 1, rdscal, rdsum )
+           end if
+           return
+     end subroutine stdlib_qlatdf
+
+     !> DLATPS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, x and b are n-element vectors, and s is a scaling
+     !> factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> DTPSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_qlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(qp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_qasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_qasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_qtpsv can be used.
+           j = stdlib_iqamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_qtpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_qscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_qscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_qaxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_iqamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_qaxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_iqamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_qdot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_qdot( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_qdot( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             sumj = sumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_qscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_qlatps
+
+     !> DLATRD: reduces NB rows and columns of a real symmetric matrix A to
+     !> symmetric tridiagonal form by an orthogonal similarity
+     !> transformation Q**T * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', DLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', DLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by DSYTRD.
+
+     pure subroutine stdlib_qlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: e(*), tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           real(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    call stdlib_qgemv( 'NO TRANSPOSE', i, n-i, -one, a( 1, i+1 ),lda, w( i, iw+1 )&
+                              , ldw, one, a( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', i, n-i, -one, w( 1, iw+1 ),ldw, a( i, i+1 )&
+                              , lda, one, a( 1, i ), 1 )
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    call stdlib_qlarfg( i-1, a( i-1, i ), a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = a( i-1, i )
+                    a( i-1, i ) = one
+                    ! compute w(1:i-1,i)
+                    call stdlib_qsymv( 'UPPER', i-1, one, a, lda, a( 1, i ), 1,zero, w( 1, iw ), &
+                              1 )
+                    if( i<n ) then
+                       call stdlib_qgemv( 'TRANSPOSE', i-1, n-i, one, w( 1, iw+1 ),ldw, a( 1, i ),&
+                                  1, zero, w( i+1, iw ), 1 )
+                       call stdlib_qgemv( 'NO TRANSPOSE', i-1, n-i, -one,a( 1, i+1 ), lda, w( i+1,&
+                                  iw ), 1, one,w( 1, iw ), 1 )
+                       call stdlib_qgemv( 'TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( 1, i ), &
+                                 1, zero, w( i+1, iw ), 1 )
+                       call stdlib_qgemv( 'NO TRANSPOSE', i-1, n-i, -one,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, one,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_qscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -half*tau( i-1 )*stdlib_qdot( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_qaxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              do i = 1, nb
+                 ! update a(i:n,i)
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, one, a( i, i ), 1 )
+                 call stdlib_qgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, one, a( i, i ), 1 )
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    call stdlib_qlarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute w(i+1:n,i)
+                    call stdlib_qsymv( 'LOWER', n-i, one, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero,&
+                               w( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', n-i, i-1, one, w( i+1, 1 ), ldw,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-i, i-1, -one, a( i+1, 1 ),lda, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-i, i-1, -one, w( i+1, 1 ),ldw, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_qscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -half*tau( i )*stdlib_qdot( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_qaxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qlatrd
+
+     !> DLATRS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T *x = s*b
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, x and b are
+     !> n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine DTRSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_qlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(qp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_qasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_qasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_qtrsv can be used.
+           j = stdlib_iqamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_qtrsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_qscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_110: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 100
+                    end if
+                    tjj = abs( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = x( j ) / tjjs
+                       xj = abs( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    100 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_qscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_qaxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_iqamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_qaxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_iqamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    end if
+                 end do loop_110
+              else
+                 ! solve a**t * x = b
+                 loop_160: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = abs( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = uscal / tjjs
+                       end if
+                       if( rec<one ) then
+                          call stdlib_qscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_qdot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_qdot( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_qdot( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 150
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_qscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_qscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       150 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_160
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_qlatrs
+
+     !> DLATRZ: factors the M-by-(M+L) real upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z, by means
+     !> of orthogonal transformations.  Z is an (M+L)-by-(M+L) orthogonal
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_qlatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! test the input arguments
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_qlarfg( l+1, a( i, i ), a( i, n-l+1 ), lda, tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_qlarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,tau( i ), a( 1, i ), &
+                        lda, work )
+           end do
+           return
+     end subroutine stdlib_qlatrz
+
+     !> DLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a real M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_qlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'DLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_qgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_qgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_qtpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1, ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_qtpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1, ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_qlatsqr
+
+     !> DLAUU2: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qlauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**t.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_qdot( n-i+1, a( i, i ), lda, a( i, i ), lda )
+                    call stdlib_qgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, aii, a( 1, i ), 1 )
+                 else
+                    call stdlib_qscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**t * l.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_qdot( n-i+1, a( i, i ), 1, a( i, i ), 1 )
+                    call stdlib_qgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, aii, a( i, 1 ), lda )
+                 else
+                    call stdlib_qscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qlauu2
+
+     !> DLAUUM: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qlauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_qlauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**t.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, ib, one, a( &
+                              i, i ), lda, a( 1, i ),lda )
+                    call stdlib_qlauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', i-1, ib,n-i-ib+1, one, a( &
+                                 1, i+ib ), lda,a( i, i+ib ), lda, one, a( 1, i ), lda )
+                       call stdlib_qsyrk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**t * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_qtrmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', ib,i-1, one, a( &
+                              i, i ), lda, a( i, 1 ), lda )
+                    call stdlib_qlauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', ib, i-1,n-i-ib+1, one, a( &
+                                 i+ib, i ), lda,a( i+ib, 1 ), lda, one, a( i, 1 ), lda )
+                       call stdlib_qsyrk( 'LOWER', 'TRANSPOSE', ib, n-i-ib+1, one,a( i+ib, i ), &
+                                 lda, one, a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qlauum
+
+     !> DOPGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> DSPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_qopgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*), tau(*)
+           real(qp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DOPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_qsptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = zero
+              end do
+              q( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_qorg2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_qsptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = one
+              do i = 2, n
+                 q( i, 1 ) = zero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = zero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_qorg2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_qopgtr
+
+     !> DOPMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by DSPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_qopmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DOPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_qsptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i)
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 call stdlib_qlarf( side, mi, ni, ap( ii-i+1 ), 1, tau( i ), c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_qsptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 if( left ) then
+                    ! h(i) is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i)
+                 call stdlib_qlarf( side, mi, ni, ap( ii ), 1, tau( i ),c( ic, jc ), ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qopmtr
+
+     !> DORBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned orthogonal matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**T
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See DORCSD
+     !> for details.)
+     !> The orthogonal matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_qorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           real(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(qp), parameter :: realone = 1.0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(qp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,sin
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_qscal( p-i+1, z1, x11(i,i), 1 )
+                 else
+                    call stdlib_qscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), 1 )
+                    call stdlib_qaxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i,i-1),1, x11(i,i), 1 )
+                              
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_qscal( m-p-i+1, z2, x21(i,i), 1 )
+                 else
+                    call stdlib_qscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), 1 )
+                    call stdlib_qaxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i,i-1),1, x21(i,i), &
+                              1 )
+                 end if
+                 theta(i) = atan2( stdlib_qnrm2( m-p-i+1, x21(i,i), 1 ),stdlib_qnrm2( p-i+1, x11(&
+                           i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if( p == i ) then
+                    call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = one
+                 if ( m-p > i ) then
+                    call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i,i), 1, taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_qlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i),x11(i,i+1), ldx11, &
+                              work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_qlarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1, taup1(i),x12(i,i), ldx12,&
+                               work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_qlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21,&
+                               work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_qlarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1, taup2(i),x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_qscal( q-i, -z1*z3*sin(theta(i)), x11(i,i+1),ldx11 )
+                    call stdlib_qaxpy( q-i, z2*z3*cos(theta(i)), x21(i,i+1), ldx21,x11(i,i+1), &
+                              ldx11 )
+                 end if
+                 call stdlib_qscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), ldx12 )
+                 call stdlib_qaxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), ldx22,x12(i,i), ldx12 &
+                           )
+                 if( i < q )phi(i) = atan2( stdlib_qnrm2( q-i, x11(i,i+1), ldx11 ),stdlib_qnrm2( &
+                           m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    if ( q-i == 1 ) then
+                       call stdlib_qlarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_qlarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = one
+                 end if
+                 if ( q+i-1 < m ) then
+                    if ( m-q == i ) then
+                       call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_qlarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_qlarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_qlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_qlarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_qscal( m-q-i+1, -z1*z4, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_qlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_qlarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_qscal( m-p-q-i+1, z2*z4, x22(q+i,p+i), ldx22 )
+                 if ( i == m-p-q ) then
+                    call stdlib_qlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i),ldx22, tauq2(p+i) )
+                              
+                 else
+                    call stdlib_qlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i)&
+                               )
+                 end if
+                 x22(q+i,p+i) = one
+                 if ( i < m-p-q ) then
+                    call stdlib_qlarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), &
+                              x22(q+i+1,p+i), ldx22, work )
+                 end if
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_qscal( p-i+1, z1, x11(i,i), ldx11 )
+                 else
+                    call stdlib_qscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), ldx11 )
+                    call stdlib_qaxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i-1,i),ldx12, x11(i,i),&
+                               ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_qscal( m-p-i+1, z2, x21(i,i), ldx21 )
+                 else
+                    call stdlib_qscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), ldx21 )
+                    call stdlib_qaxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i-1,i),ldx22, x21(i,&
+                              i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_qnrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_qnrm2( p-i+1, &
+                           x11(i,i), ldx11 ) )
+                 call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = one
+                 if ( i == m-p ) then
+                    call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_qlarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), &
+                              ldx11, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_qlarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11,taup1(i), x12(i,i), &
+                              ldx12, work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_qlarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                              ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_qlarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_qscal( q-i, -z1*z3*sin(theta(i)), x11(i+1,i), 1 )
+                    call stdlib_qaxpy( q-i, z2*z3*cos(theta(i)), x21(i+1,i), 1,x11(i+1,i), 1 )
+                              
+                 end if
+                 call stdlib_qscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), 1 )
+                 call stdlib_qaxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), 1,x12(i,i), 1 )
+                           
+                 if( i < q )phi(i) = atan2( stdlib_qnrm2( q-i, x11(i+1,i), 1 ),stdlib_qnrm2( m-q-&
+                           i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    if ( q-i == 1) then
+                       call stdlib_qlarfgp( q-i, x11(i+1,i), x11(i+1,i), 1,tauq1(i) )
+                    else
+                       call stdlib_qlarfgp( q-i, x11(i+1,i), x11(i+2,i), 1,tauq1(i) )
+                    end if
+                    x11(i+1,i) = one
+                 end if
+                 if ( m-q > i ) then
+                    call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1,tauq2(i) )
+                 else
+                    call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i,i), 1,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_qlarf( 'L', q-i, p-i, x11(i+1,i), 1, tauq1(i),x11(i+1,i+1), ldx11,&
+                               work )
+                    call stdlib_qlarf( 'L', q-i, m-p-i, x11(i+1,i), 1, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 call stdlib_qlarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12, &
+                           work )
+                 if ( m-p-i > 0 ) then
+                    call stdlib_qlarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1, tauq2(i),x22(i,i+1), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_qscal( m-q-i+1, -z1*z4, x12(i,i), 1 )
+                 call stdlib_qlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_qlarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12,&
+                               work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_qlarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1, tauq2(i),&
+                           x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_qscal( m-p-q-i+1, z2*z4, x22(p+i,q+i), 1 )
+                 if ( m-p-q == i ) then
+                    call stdlib_qlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i,q+i), 1,tauq2(p+i) )
+                              
+                 else
+                    call stdlib_qlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                              
+                    call stdlib_qlarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,tauq2(p+i), x22(p+&
+                              i,q+i+1), ldx22, work )
+                 end if
+                 x22(p+i,q+i) = one
+              end do
+           end if
+           return
+     end subroutine stdlib_qorbdb
+
+     !> DORBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines DORBDB2, DORBDB3, and DORBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_qorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           real(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( x21(i,i), x11(i,i) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = one
+              x21(i,i) = one
+              call stdlib_qlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+              call stdlib_qlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+              if( i < q ) then
+                 call stdlib_qrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c, s )
+                 call stdlib_qlarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = x21(i,i+1)
+                 x21(i,i+1) = one
+                 call stdlib_qlarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_qlarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 c = sqrt( stdlib_qnrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_qnrm2( m-p-i, x21(i+1,&
+                           i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_qorbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_qorbdb1
+
+     !> DORBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines DORBDB1, DORBDB3, and DORBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_qorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           real(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_qrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c, s )
+              end if
+              call stdlib_qlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = x11(i,i)
+              x11(i,i) = one
+              call stdlib_qlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_qlarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              s = sqrt( stdlib_qnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_qnrm2( m-p-i+1, x21(i,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_qorbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_qscal( p-i, negone, x11(i+1,i), 1 )
+              call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_qlarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( x11(i+1,i), x21(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = one
+                 call stdlib_qlarf( 'L', p-i, q-i, x11(i+1,i), 1, taup1(i),x11(i+1,i+1), ldx11, &
+                           work(ilarf) )
+              end if
+              x21(i,i) = one
+              call stdlib_qlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_qlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = one
+              call stdlib_qlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_qorbdb2
+
+     !> DORBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines DORBDB1, DORBDB2, and DORBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_qorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           real(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_qrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c, s )
+              end if
+              call stdlib_qlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = x21(i,i)
+              x21(i,i) = one
+              call stdlib_qlarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_qlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              c = sqrt( stdlib_qnrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_qnrm2( m-p-i, x21(i+1,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_qorbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_qlarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( x21(i+1,i), x11(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = one
+                 call stdlib_qlarf( 'L', m-p-i, q-i, x21(i+1,i), 1, taup2(i),x21(i+1,i+1), ldx21, &
+                           work(ilarf) )
+              end if
+              x11(i,i) = one
+              call stdlib_qlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_qlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = one
+              call stdlib_qlarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_qorbdb3
+
+     !> DORBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines DORBDB1, DORBDB2, and DORBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_qorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           real(qp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = zero
+                 end do
+                 call stdlib_qorbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_qscal( p, negone, phantom(1), 1 )
+                 call stdlib_qlarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_qlarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( phantom(1), phantom(p+1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = one
+                 phantom(p+1) = one
+                 call stdlib_qlarf( 'L', p, q, phantom(1), 1, taup1(1), x11, ldx11,work(ilarf) )
+                           
+                 call stdlib_qlarf( 'L', m-p, q, phantom(p+1), 1, taup2(1), x21,ldx21, work(ilarf)&
+                            )
+              else
+                 call stdlib_qorbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_qscal( p-i+1, negone, x11(i,i-1), 1 )
+                 call stdlib_qlarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_qlarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( x11(i,i-1), x21(i,i-1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = one
+                 x21(i,i-1) = one
+                 call stdlib_qlarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1, taup1(i),x11(i,i), ldx11, &
+                           work(ilarf) )
+                 call stdlib_qlarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1, taup2(i),x21(i,i), ldx21, &
+                           work(ilarf) )
+              end if
+              call stdlib_qrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_qlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = x21(i,i)
+              x21(i,i) = one
+              call stdlib_qlarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_qlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_qnrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_qnrm2( m-p-i, x21(i+1,i),&
+                            1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_qlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = one
+              call stdlib_qlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_qlarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_qlarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = one
+              call stdlib_qlarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_qorbdb4
+
+     !> DORBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_qorbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_qorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_qnrm2(m1,x1,incx1) /= zero.or. stdlib_qnrm2(m2,x2,incx2) /= zero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              x1(i) = one
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              call stdlib_qorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_qnrm2(m1,x1,incx1) /= zero.or. stdlib_qnrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              x2(i) = one
+              call stdlib_qorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_qnrm2(m1,x1,incx1) /= zero.or. stdlib_qnrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_qorbdb5
+
+     !> DORBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_qorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(qp), intent(out) :: work(*)
+           real(qp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: alphasq = 0.01_qp
+           real(qp), parameter :: realone = 1.0_qp
+           real(qp), parameter :: realzero = 0.0_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_qlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_qlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_qgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_qgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_qgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_qgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_qlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_qlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is zero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == zero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = zero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_qgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_qgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_qgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_qgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_qlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_qlassq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to zero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = zero
+              end do
+              do i = 1, m2
+                 x2(i) = zero
+              end do
+           end if
+           return
+     end subroutine stdlib_qorbdb6
+
+     !> DORCSD: computes the CS decomposition of an M-by-M partitioned
+     !> orthogonal matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_qorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: theta(*)
+           real(qp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           real(qp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_qorcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, iwork, info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_qorcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iphi = 2
+              itaup1 = iphi + max( 1, q - 1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_qorgqr( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_qorglq( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_qorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, theta, v1t, u1, u2, v1t,v2t, work, -1, childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              ib11d = itauq2 + max( 1, m - q )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_qbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, u1, u1, u1, u1, u1, u1, u1, u1, work, -1,&
+                        childinfo )
+              lbbcsdworkopt = int( work(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -22
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_qorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_qlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_qorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_qlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_qorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_qlacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_qorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_qlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 if (m-p > q) then
+                    call stdlib_qlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
+                              ldv2t )
+                 end if
+                 if (m > q) then
+                    call stdlib_qorglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                              lorglqwork, info )
+                 end if
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_qlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_qorglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_qlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_qorglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_qlacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_qorgqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_qlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 call stdlib_qlacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
+                           
+                 call stdlib_qorgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_qbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
+            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
+            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
+                      info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_qlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_qlapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_qlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_qlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_qorcsd
+     end subroutine stdlib_qorcsd
+
+     !> DORCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_qorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
+        ! -- lapack computational routine (3.5.0_qp) --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           ! Array Arguments 
+           real(qp), intent(out) :: theta(*)
+           real(qp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           real(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(qp) :: dum1(1), dum2(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-------------------------------------------------------|
+           ! | lworkopt (1)                                          |
+           ! |-------------------------------------------------------|
+           ! | phi (max(1,r-1))                                      |
+           ! |-------------------------------------------------------|
+           ! | taup1 (max(1,p))                        | b11d (r)    |
+           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
+           ! | tauq1 (max(1,q))                        | b12d (r)    |
+           ! |-----------------------------------------| b12e (r-1)  |
+           ! | stdlib_qorbdb work | stdlib_qorgqr work | stdlib_qorglq work | b21d (r)    |
+           ! |             |             |             | b21e (r-1)  |
+           ! |             |             |             | b22d (r)    |
+           ! |             |             |             | b22e (r-1)  |
+           ! |             |             |             | stdlib_qbbcsd work |
+           ! |-------------------------------------------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = iphi + max( 1, r-1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_qorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, work,-1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_qorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_qorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_qorglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_qbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
+                 ldu1, u2, ldu2, v1t, ldv1t,dum2, 1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
+                            work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_qorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_qorgqr( p-1, p-1, p-1, u1(2,2), ldu1, dum1,work(1), -1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_qorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_qorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_qbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
+                 ldv1t, dum2, 1, u1, ldu1,u2, ldu2, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1, &
+                           work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_qorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_qorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_qorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,dum1, work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_qorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_qbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
+                 dum2, 1, v1t, ldv1t, u2,ldu2, u1, ldu1, dum1, dum1, dum1,dum1, dum1, dum1, dum1,&
+                           dum1, work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else
+                 call stdlib_qorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1,dum1, work(1), -1, childinfo )
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_qorgqr( p, p, m-q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_qorgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_qorglq( q, q, q, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_qbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
+                 ldu2, u1, ldu1, dum2,1, v1t, ldv1t, dum1, dum1, dum1,dum1, dum1, dum1, dum1,dum1,&
+                            work(1), -1, childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              end if
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
+                        1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
+                        1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'DORCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_qorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_qlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_qorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_qlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_qorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_qlacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_qorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_qbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place zero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_qlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_qorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = one
+                 do j = 2, p
+                    u1(1,j) = zero
+                    u1(j,1) = zero
+                 end do
+                 call stdlib_qlacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_qorgqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_qlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_qorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_qlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_qorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_qbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
+              ldv1t, dum2, 1, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
+              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
+                        )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_qlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_qorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_qlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_qorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = one
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                    u2(j,1) = zero
+                 end do
+                 call stdlib_qlacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_qorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_qlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_qorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_qbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
+              dum2, 1, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_qlapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_qlapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_qorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
+                        
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_qcopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_qcopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = zero
+                 end do
+                 call stdlib_qlacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_qorgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                 end do
+                 call stdlib_qlacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_qorgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_qlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_qlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_qlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_qorglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_qbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
+              u2, ldu2, u1, ldu1, dum2,1, v1t, ldv1t, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_qlapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_qlapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qorcsd2by1
+
+     !> DORG2L: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF.
+
+     pure subroutine stdlib_qorg2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( m-n+j, j ) = one
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = one
+              call stdlib_qlarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_qscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = one - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to zero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_qorg2l
+
+     !> DORG2R: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF.
+
+     pure subroutine stdlib_qorg2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = one
+                 call stdlib_qlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_qscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = one - tau( i )
+              ! set a(1:i-1,i) to zero
+              do l = 1, i - 1
+                 a( l, i ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_qorg2r
+
+     !> DORGBR: generates one of the real orthogonal matrices Q or P**T
+     !> determined by DGEBRD when reducing a real matrix A to bidiagonal
+     !> form: A = Q * B * P**T.  Q and P**T are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and DORGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and DORGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+     !> is of order N:
+     !> if k < n, P**T = G(k) . . . G(2) G(1) and DORGBR returns the first m
+     !> rows of P**T, where n >= m >= k;
+     !> if k >= n, P**T = G(n-1) . . . G(2) G(1) and DORGBR returns P**T as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_qorgbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_qorgqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_qorgqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_qorglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_qorglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = work( 1 )
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_qgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_qorgqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = zero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = one
+                 do i = 2, m
+                    a( i, 1 ) = zero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_qorgqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**t, determined by a call to stdlib_qgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_qorglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! row downward, and set the first row and column of p**t to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = one
+                 do i = 2, n
+                    a( i, 1 ) = zero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = zero
+                 end do
+                 if( n>1 ) then
+                    ! form p**t(2:n,2:n)
+                    call stdlib_qorglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qorgbr
+
+     !> DORGHR: generates a real orthogonal matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> DGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_qorghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DORGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors one
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = zero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = zero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_qorgqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qorghr
+
+     !> DORGL2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGELQF.
+
+     pure subroutine stdlib_qorgl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = zero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = one
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the right
+              if( i<n ) then
+                 if( i<m ) then
+                    a( i, i ) = one
+                    call stdlib_qlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,tau( i ), a( i+1, i ), &
+                              lda, work )
+                 end if
+                 call stdlib_qscal( n-i, -tau( i ), a( i, i+1 ), lda )
+              end if
+              a( i, i ) = one - tau( i )
+              ! set a(i,1:i-1) to zero
+              do l = 1, i - 1
+                 a( i, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_qorgl2
+
+     !> DORGLQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGELQF.
+
+     pure subroutine stdlib_qorglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DORGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to zero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_qorgl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_qlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**t to a(i+ib:m,i:n) from the right
+                    call stdlib_qlarfb( 'RIGHT', 'TRANSPOSE', 'FORWARD', 'ROWWISE',m-i-ib+1, n-i+&
+                    1, ib, a( i, i ), lda, work,ldwork, a( i+ib, i ), lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**t to columns i:n of current block
+                 call stdlib_qorgl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to zero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qorglq
+
+     !> DORGQL: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF.
+
+     pure subroutine stdlib_qorgql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to zero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_qorg2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_qlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_qlarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_qorg2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to zero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qorgql
+
+     !> DORGQR: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF.
+
+     pure subroutine stdlib_qorgqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DORGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to zero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_qorg2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_qlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_qlarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_qorg2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to zero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qorgqr
+
+     !> DORGR2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGERQF.
+
+     pure subroutine stdlib_qorgr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = zero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = one
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the right
+              a( ii, n-m+ii ) = one
+              call stdlib_qlarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda, tau( i ),a, lda, work )
+                        
+              call stdlib_qscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = one - tau( i )
+              ! set a(m-k+i,n-k+i+1:n) to zero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_qorgr2
+
+     !> DORGRQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by DGERQF.
+
+     pure subroutine stdlib_qorgrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DORGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DORGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to zero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_qorgr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_qlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_qlarfb( 'RIGHT', 'TRANSPOSE', 'BACKWARD', 'ROWWISE',ii-1, n-k+i+&
+                              ib-1, ib, a( ii, 1 ), lda, work,ldwork, a, lda, work( ib+1 ), ldwork )
+                 end if
+                 ! apply h**t to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_qorgr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to zero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_qorgrq
+
+     !> DORGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> DSYTRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_qorgtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 nb = stdlib_ilaenv( 1, 'DORGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'DORGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_qsytrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = zero
+              end do
+              a( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_qorgql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_qsytrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = zero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = one
+              do i = 2, n
+                 a( i, 1 ) = zero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_qorgqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qorgtr
+
+     !> DORGTSQR: generates an M-by-N real matrix Q_out with orthonormal columns,
+     !> which are the first N columns of a product of real orthogonal
+     !> matrices of order M which are returned by DLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for DLATSQR.
+
+     pure subroutine stdlib_qorgtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: t(ldt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_qlamtsqr. see the documentation for stdlib_qlamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_qlamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_qlamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_qlamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_qlaset( 'F', m, n, zero, one, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_qlamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_qcopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = real( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_qorgtsqr
+
+     !> DORGTSQR_ROW: generates an M-by-N real matrix Q_out with
+     !> orthonormal columns from the output of DLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by DLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of DLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which DLATSQR generates the output blocks.
+
+     pure subroutine stdlib_qorgtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: t(ldt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           real(qp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=qp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_qlaset('U', m, n, zero, one, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_qlarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_dlarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_qlarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_qlarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_qorgtsqr_row
+
+     !> DORHR_COL: takes an M-by-N real matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as DGEQRT).
+
+     pure subroutine stdlib_qorhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the orthogonal
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_qlaorhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_qtrsm( 'R', 'U', 'N', 'N', m-n, n, one, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_qcopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==one ) then
+                    call stdlib_qscal( j-jbtemp1, -one, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_qtrsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_qtrsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = zero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_qtrsm( 'R', 'L', 'T', 'U', jnb, jnb, one,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_qorhr_col
+
+
+     pure subroutine stdlib_qorm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: q(ldq,*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = real( lwkopt,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_qtrmm.
+           if( n1==0 ) then
+              call stdlib_qtrmm( side, 'UPPER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           else if( n2==0 ) then
+              call stdlib_qtrmm( side, 'LOWER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_qlacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_qtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, one, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q, ldq, c(&
+                               1, i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_qlacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_qtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, one, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,one, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_qlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**t.
+                    call stdlib_qlacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_qtrmm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',n2, len, one, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**t.
+                    call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q, ldq, c( 1,&
+                               i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q12**t.
+                    call stdlib_qlacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_qtrmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n1, len, one, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**t.
+                    call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,one, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_qlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_qlacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, one, &
+                              q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,one, c( i, 1 ),&
+                               ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_qlacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, one, &
+                              q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,one, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_qlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**t.
+                    call stdlib_qlacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_qtrmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',len, n1, one, q( &
+                              1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**t.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n1, n2,one, c( i, 1 ), &
+                              ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q21**t.
+                    call stdlib_qlacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_qtrmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',len, n2, one, q( &
+                              n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**t.
+                    call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n2, n1,one, c( i, n2+1 ),&
+                               ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_qlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = real( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_qorm22
+
+     !> DORM2L: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T * C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qorm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = one
+              call stdlib_qlarf( side, mi, ni, a( 1, i ), 1, tau( i ), c, ldc,work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_qorm2l
+
+     !> DORM2R: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qorm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_qlarf( side, mi, ni, a( i, i ), 1, tau( i ), c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_qorm2r
+
+     !> If VECT = 'Q', DORMBR: overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> If VECT = 'P', DORMBR overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'T':      P**T * C       C * P**T
+     !> Here Q and P**T are the orthogonal matrices determined by DGEBRD when
+     !> reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+     !> P**T are defined as products of elementary reflectors H(i) and G(i)
+     !> respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the orthogonal matrix Q or P**T that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_qormbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( applyq ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMLQ', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMLQ', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           work( 1 ) = 1
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_qgebrd with nq >= k
+                 call stdlib_qormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_qgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_qormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_qgebrd with nq > k
+                 call stdlib_qormlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_qgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_qormlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormbr
+
+     !> DORMHR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by DGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_qormhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_qormqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormhr
+
+     !> DORML2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qorml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_qlarf( side, mi, ni, a( i, i ), lda, tau( i ),c( ic, jc ), ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_qorml2
+
+     !> DORMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DORMLQ', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_qorml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_qlarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_qlarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormlq
+
+     !> DORMQL: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by DGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_qorm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_qlarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_qlarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormql
+
+     !> DORMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_qorm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_qlarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_qlarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormqr
+
+     !> DORMR2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = one
+              call stdlib_qlarf( side, mi, ni, a( i, 1 ), lda, tau( i ), c, ldc,work )
+              a( i, nq-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_qormr2
+
+     !> DORMR3: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DTZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), tau(*)
+           real(qp), intent(inout) :: c(ldc,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**t is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**t is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**t
+              call stdlib_qlarz( side, mi, ni, l, a( i, ja ), lda, tau( i ),c( ic, jc ), ldc, &
+                        work )
+           end do
+           return
+     end subroutine stdlib_qormr3
+
+     !> DORMRQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_qormr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_qlarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_qlarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormrq
+
+     !> DORMRZ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by DTZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_qormrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'DORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_qormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_qlarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_qlarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormrz
+
+     !> DORMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by DSYTRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_qormtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(qp), intent(in) :: tau(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'DORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DORMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_qsytrd with uplo = 'u'
+              call stdlib_qormql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_qsytrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_qormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qormtr
+
+     !> DPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite band matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by DPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_qlatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_qlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_qlatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_qlatbs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iqamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_qrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_qpbcon
+
+     !> DPBEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_qpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ab( j, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = ab( j, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_qpbequ
+
+     !> DPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_qpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qsbmv( uplo, n, kd, -one, ab, ldab, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( ab( kd+1, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ab( 1, k ) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_qpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_qpbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qpbrfs
+
+     !> DPBSTF: computes a split Cholesky factorization of a real
+     !> symmetric positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with DSBGST.
+     !> The factorization has the form  A = S**T*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_qpbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_qscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_qsyr( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_qscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_qsyr( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_qscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_qsyr( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_qscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_qsyr( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_qpbstf
+
+     !> DPBSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_qpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_qpbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_qpbsv
+
+     !> DPBSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qpbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), s(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_qpbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_qlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_qcopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_qcopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_qpbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlansb( '1', uplo, n, kd, ab, ldab, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qpbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, iwork,info )
+           ! compute the solution matrix x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qpbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_qpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qpbsvx
+
+     !> DPBTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**T is the transpose of U, and
+     !> L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qpbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_qscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_qsyr( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_qscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_qsyr( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_qpbtf2
+
+     !> DPBTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_qpbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           real(qp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'DPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_qpbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_qpotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_qtrsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i2, one,&
+                                     ab( kd+1, i ),ldab-1, ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_qsyrk( 'UPPER', 'TRANSPOSE', i2, ib, -one,ab( kd+1-ib, i+ib &
+                                    ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_qtrsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i3, one,&
+                                     ab( kd+1, i ),ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', i2, i3,ib, -&
+                          one, ab( kd+1-ib, i+ib ),ldab-1, work, ldwork, one,ab( 1+ib, i+kd ), &
+                                    ldab-1 )
+                          ! update a33
+                          call stdlib_qsyrk( 'UPPER', 'TRANSPOSE', i3, ib, -one,work, ldwork, one,&
+                                     ab( kd+1, i+kd ),ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_qpotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_qtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i2, ib, &
+                                    one, ab( 1, i ),ldab-1, ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_qsyrk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_qtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i3, ib, &
+                                    one, ab( 1, i ),ldab-1, work, ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', i3, i2,ib, -&
+                          one, work, ldwork,ab( 1+ib, i ), ldab-1, one,ab( 1+kd-ib, i+ib ), ldab-&
+                                    1 )
+                          ! update a33
+                          call stdlib_qsyrk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_qpbtrf
+
+     !> DPBTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPBTRF.
+
+     pure subroutine stdlib_qpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              do j = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**t.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**t *x = b, overwriting b with x.
+                 call stdlib_qtbsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_qpbtrs
+
+     !> DPFTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qpftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_qpotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'R', 'L', 'T', 'N', n2, n1, one, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_qsyrk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_qpotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_qpotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'L', 'L', 'N', 'N', n1, n2, one, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_qsyrk( 'U', 'T', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_qpotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_qpotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'L', 'U', 'T', 'N', n1, n2, one, a( 0 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_qsyrk( 'L', 'T', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_qpotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_qpotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'R', 'U', 'N', 'N', n2, n1, one, a( n2*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_qsyrk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_qpotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_qpotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'R', 'L', 'T', 'N', k, k, one, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_qsyrk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_qpotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_qpotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'L', 'L', 'N', 'N', k, k, one, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_qsyrk( 'U', 'T', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_qpotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_qpotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'L', 'U', 'T', 'N', k, k, one, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_qsyrk( 'L', 'T', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_qpotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_qpotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_qtrsm( 'R', 'U', 'N', 'N', k, k, one,a( k*( k+1 ) ), k, a( 0 ), k &
+                              )
+                    call stdlib_qsyrk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_qpotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qpftrf
+
+     !> DPFTRI: computes the inverse of a (real) symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPFTRF.
+
+     pure subroutine stdlib_qpftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_qtftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_qlauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_qsyrk( 'L', 'T', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_qtrmm( 'L', 'U', 'N', 'N', n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_qlauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_qlauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_qsyrk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_qtrmm( 'R', 'U', 'T', 'N', n1, n2, one, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_qlauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_qlauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_qsyrk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_qtrmm( 'R', 'L', 'N', 'N', n1, n2, one, a( 1 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_qlauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_qlauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_qsyrk( 'U', 'T', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_qtrmm( 'L', 'L', 'T', 'N', n2, n1, one, a( n1*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_qlauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_qlauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_qsyrk( 'L', 'T', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_qtrmm( 'L', 'U', 'N', 'N', k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_qlauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_qlauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_qsyrk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_qtrmm( 'R', 'U', 'T', 'N', k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_qlauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_qlauum( 'U', k, a( k ), k, info )
+                    call stdlib_qsyrk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_qtrmm( 'R', 'L', 'N', 'N', k, k, one, a( 0 ), k,a( k*( k+1 ) ), k &
+                              )
+                    call stdlib_qlauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_qlauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_qsyrk( 'U', 'T', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_qtrmm( 'L', 'L', 'T', 'N', k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_qlauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qpftri
+
+     !> DPFTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPFTRF.
+
+     pure subroutine stdlib_qpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: a(0:*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_qtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_qtfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+           else
+              call stdlib_qtfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_qtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_qpftrs
+
+     !> DPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by DPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qpocon( uplo, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_qlatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_qlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_qlatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_qlatrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iqamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_qrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_qpocon
+
+     !> DPOEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_qpoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_qpoequ
+
+     !> DPOEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from DPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_qpoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_qlamch( 'B' )
+           tmp = -0.5e+0_qp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_qpoequb
+
+     !> DPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_qporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qsymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_qpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qpotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qporfs
+
+     !> DPOSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_qposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_qpotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_qposv
+
+     !> DPOSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), s(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_qpoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_qlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              call stdlib_qlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_qpotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlansy( '1', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qpocon( uplo, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qpotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_qporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qposvx
+
+     !> DPOTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qpotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_qdot( j-1, a( 1, j ), 1, a( 1, j ), 1 )
+                 if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_qgemv( 'TRANSPOSE', j-1, n-j, -one, a( 1, j+1 ),lda, a( 1, j ), 1,&
+                               one, a( j, j+1 ), lda )
+                    call stdlib_qscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_qdot( j-1, a( j, 1 ), lda, a( j, 1 ),lda )
+                 if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_qgemv( 'NO TRANSPOSE', n-j, j-1, -one, a( j+1, 1 ),lda, a( j, 1 ),&
+                               lda, one, a( j+1, j ), 1 )
+                    call stdlib_qscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_qpotf2
+
+     !> DPOTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qpotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_qpotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**t*u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_qsyrk( 'UPPER', 'TRANSPOSE', jb, j-1, -one,a( 1, j ), lda, one, a(&
+                               j, j ), lda )
+                    call stdlib_qpotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_qgemm( 'TRANSPOSE', 'NO TRANSPOSE', jb, n-j-jb+1,j-1, -one, a( &
+                                 1, j ), lda, a( 1, j+jb ),lda, one, a( j, j+jb ), lda )
+                       call stdlib_qtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',jb, n-j-jb+1, &
+                                 one, a( j, j ), lda,a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**t.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_qsyrk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_qpotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,j-1, -one, a( &
+                                 j+jb, 1 ), lda, a( j, 1 ),lda, one, a( j+jb, j ), lda )
+                       call stdlib_qtrsm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n-j-jb+1, jb, &
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_qpotrf
+
+     !> DPOTRF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then calls itself to factor A22.
+
+     pure recursive subroutine stdlib_qpotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              if( a( 1, 1 )<=zero.or.stdlib_qisnan( a( 1, 1 ) ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( a( 1, 1 ) )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_qpotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**t*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_qtrsm( 'L', 'U', 'T', 'N', n1, n2, one,a( 1, 1 ), lda, a( 1, n1+1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_qsyrk( uplo, 'T', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_qpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**t
+              else
+                 ! update and scale a21
+                 call stdlib_qtrsm( 'R', 'L', 'T', 'N', n2, n1, one,a( 1, 1 ), lda, a( n1+1, 1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_qsyrk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_qpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qpotrf2
+
+     !> DPOTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPOTRF.
+
+     pure subroutine stdlib_qpotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_qtrtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**t or inv(l)**t * inv(l).
+           call stdlib_qlauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_qpotri
+
+     !> DPOTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by DPOTRF.
+
+     pure subroutine stdlib_qpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a*x = b where a = l*l**t.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_qtrsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+           end if
+           return
+     end subroutine stdlib_qpotrs
+
+     !> DPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite packed matrix using
+     !> the Cholesky factorization A = U**T*U or A = L*L**T computed by
+     !> DPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qppcon( uplo, n, ap, anorm, rcond, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_qlatps( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scalel,&
+                            work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_qlatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_qlatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_qlatps( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scaleu,&
+                            work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iqamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_qrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_qppcon
+
+     !> DPPEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_qppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ap( 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_qppequ
+
+     !> DPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_qpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_qpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qpptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qpprfs
+
+     !> DPPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_qppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_qpptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qpptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_qppsv
+
+     !> DPPSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: afp(*), ap(*), b(ldb,*), s(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_qppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_qlaqsp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t * u or a = l * l**t.
+              call stdlib_qcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_qpptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qppcon( uplo, n, afp, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qpptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_qpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, iwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qppsvx
+
+     !> DPPTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_qpptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_qtpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', j-1, ap,ap( jc ), &
+                           1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj ) - stdlib_qdot( j-1, ap( jc ), 1, ap( jc ), 1 )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_qscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_qspr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_qpptrf
+
+     !> DPPTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by DPPTRF.
+
+     pure subroutine stdlib_qpptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(qp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_qtptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**t.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_qspr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = ap( jj )
+                 call stdlib_qscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**t * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = stdlib_qdot( n-j+1, ap( jj ), 1, ap( jj ), 1 )
+                 if( j<n )call stdlib_qtpmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n-j,ap( jjn ), ap( &
+                           jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_qpptri
+
+     !> DPPTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**T*U or A = L*L**T computed by DPPTRF.
+
+     pure subroutine stdlib_qpptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t * u.
+              do i = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_qtpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_qtpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**t.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_qtpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**t *x = y, overwriting b with x.
+                 call stdlib_qtpsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_qpptrs
+
+     !> DPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_qpstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           pvt = 1
+           ajj = a( pvt, pvt )
+           do i = 2, n
+              if( a( i, i )>ajj ) then
+                 pvt = i
+                 ajj = a( pvt, pvt )
+              end if
+           end do
+           if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 170
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              dstop = n * stdlib_qlamch( 'EPSILON' ) * ajj
+           else
+              dstop = tol
+           end if
+           ! set first half of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**t * u
+              loop_130: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( j-1, i )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_qswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_qswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    call stdlib_qswap( pvt-j-1, a( j, j+1 ), lda, a( j+1, pvt ), 1 )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_qgemv( 'TRANS', j-1, n-j, -one, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              one, a( j, j+1 ), lda )
+                    call stdlib_qscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_130
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( i, j-1 )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_qswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_qswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    call stdlib_qswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ), lda )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_qgemv( 'NO TRANS', n-j, j-1, -one, a( j+1, 1 ), lda,a( j, 1 ), &
+                              lda, one, a( j+1, j ), 1 )
+                    call stdlib_qscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_150
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 170
+           160 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           170 continue
+           return
+     end subroutine stdlib_qpstf2
+
+     !> DPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_qpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'DPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_qpstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 200
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              pvt = 1
+              ajj = a( pvt, pvt )
+              do i = 2, n
+                 if( a( i, i )>ajj ) then
+                    pvt = i
+                    ajj = a( pvt, pvt )
+                 end if
+              end do
+              if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 200
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 dstop = n * stdlib_qlamch( 'EPSILON' ) * ajj
+              else
+                 dstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**t * u
+                 loop_140: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_130: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( j-1, i )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_qswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_qswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          call stdlib_qswap( pvt-j-1, a( j, j+1 ), lda,a( j+1, pvt ), 1 )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_qgemv( 'TRANS', j-k, n-j, -one, a( k, j+1 ),lda, a( k, j ), &
+                                    1, one, a( j, j+1 ),lda )
+                          call stdlib_qscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_130
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_qsyrk( 'UPPER', 'TRANS', n-j+1, jb, -one,a( k, j ), lda, one, &
+                                 a( j, j ), lda )
+                    end if
+                 end do loop_140
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+                 loop_180: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_170: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( i, j-1 )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_qswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_qswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          call stdlib_qswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ),lda )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_qgemv( 'NO TRANS', n-j, j-k, -one,a( j+1, k ), lda, a( j, k &
+                                    ), lda, one,a( j+1, j ), 1 )
+                          call stdlib_qscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_170
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_qsyrk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_180
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_qpstrf
+
+     !> DPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**T or A = U**T*D*U computed by
+     !> DPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qptcon( n, d, e, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*), e(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(qp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+           ! solve m(l) * x = e.
+           work( 1 ) = one
+           do i = 2, n
+              work( i ) = one + work( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**t * x = b.
+           work( n ) = work( n ) / d( n )
+           do i = n - 1, 1, -1
+              work( i ) = work( i ) / d( i ) + work( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_iqamax( n, work, 1 )
+           ainvnm = abs( work( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_qptcon
+
+     !> DPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using DPTTRF, and then calling DBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band symmetric positive definite matrix
+     !> can also be found if DSYTRD, DSPTRD, or DSBTRD has been used to
+     !> reduce this matrix to tridiagonal form. (The reduction to tridiagonal
+     !> form, however, may preclude the possibility of obtaining high
+     !> relative accuracy in the small eigenvalues of the original matrix, if
+     !> these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_qpteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           real(qp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = one
+              return
+           end if
+           if( icompz==2 )call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+           ! call stdlib_qpttrf to factor the matrix.
+           call stdlib_qpttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_qbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_qbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_qpteqr
+
+     !> DPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_qptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: b(ldb,*), d(*), df(*), e(*), ef(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: count, i, ix, j, nz
+           real(qp) :: bi, cx, dx, eps, ex, lstres, s, safe1, safe2, safmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_90: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( n==1 ) then
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 work( n+1 ) = bi - dx
+                 work( 1 ) = abs( bi ) + abs( dx )
+              else
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 ex = e( 1 )*x( 2, j )
+                 work( n+1 ) = bi - dx - ex
+                 work( 1 ) = abs( bi ) + abs( dx ) + abs( ex )
+                 do i = 2, n - 1
+                    bi = b( i, j )
+                    cx = e( i-1 )*x( i-1, j )
+                    dx = d( i )*x( i, j )
+                    ex = e( i )*x( i+1, j )
+                    work( n+i ) = bi - cx - dx - ex
+                    work( i ) = abs( bi ) + abs( cx ) + abs( dx ) + abs( ex )
+                 end do
+                 bi = b( n, j )
+                 cx = e( n-1 )*x( n-1, j )
+                 dx = d( n )*x( n, j )
+                 work( n+n ) = bi - cx - dx
+                 work( n ) = abs( bi ) + abs( cx ) + abs( dx )
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qpttrs( n, 1, df, ef, work( n+1 ), n, info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              ix = stdlib_iqamax( n, work, 1 )
+              ferr( j ) = work( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+              ! solve m(l) * x = e.
+              work( 1 ) = one
+              do i = 2, n
+                 work( i ) = one + work( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**t * x = b.
+              work( n ) = work( n ) / df( n )
+              do i = n - 1, 1, -1
+                 work( i ) = work( i ) / df( i ) + work( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_iqamax( n, work, 1 )
+              ferr( j ) = ferr( j )*abs( work( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_90
+           return
+     end subroutine stdlib_qptrfs
+
+     !> DPTSV: computes the solution to a real system of linear equations
+     !> A*X = B, where A is an N-by-N symmetric positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**T, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_qptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: b(ldb,*), d(*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           call stdlib_qpttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qpttrs( n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_qptsv
+
+     !> DPTSVX: uses the factorization A = L*D*L**T to compute the solution
+     !> to a real system of linear equations A*X = B, where A is an N-by-N
+     !> symmetric positive definite tridiagonal matrix and X and B are
+     !> N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_qptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(in) :: b(ldb,*), d(*), e(*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(qp), intent(inout) :: df(*), ef(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+              call stdlib_qcopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_qcopy( n-1, e, 1, ef, 1 )
+              call stdlib_qpttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlanst( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qptcon( n, df, ef, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qpttrs( n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_qptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr, berr,work, info )
+                     
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qptsvx
+
+     !> DPTTRF: computes the L*D*L**T factorization of a real symmetric
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**T*D*U.
+
+     pure subroutine stdlib_qpttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(qp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+           end do
+           loop_20: do i = i4 + 1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ! solve for e(i) and d(i+1).
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+              if( d( i+1 )<=zero ) then
+                 info = i + 1
+                 go to 30
+              end if
+              ! solve for e(i+1) and d(i+2).
+              ei = e( i+1 )
+              e( i+1 ) = ei / d( i+1 )
+              d( i+2 ) = d( i+2 ) - e( i+1 )*ei
+              if( d( i+2 )<=zero ) then
+                 info = i + 2
+                 go to 30
+              end if
+              ! solve for e(i+2) and d(i+3).
+              ei = e( i+2 )
+              e( i+2 ) = ei / d( i+2 )
+              d( i+3 ) = d( i+3 ) - e( i+2 )*ei
+              if( d( i+3 )<=zero ) then
+                 info = i + 3
+                 go to 30
+              end if
+              ! solve for e(i+3) and d(i+4).
+              ei = e( i+3 )
+              e( i+3 ) = ei / d( i+3 )
+              d( i+4 ) = d( i+4 ) - e( i+3 )*ei
+           end do loop_20
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           30 continue
+           return
+     end subroutine stdlib_qpttrf
+
+     !> DPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by DPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_qpttrs( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'DPTTRS', ' ', n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_qptts2( n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_qptts2( n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_qpttrs
+
+     !> DPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by DPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_qptts2( n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_qscal( nrhs, 1._qp / d( 1 ), b, ldb )
+              return
+           end if
+           ! solve a * x = b using the factorization a = l*d*l**t,
+           ! overwriting each right hand side vector with its solution.
+           do j = 1, nrhs
+                 ! solve l * x = b.
+              do i = 2, n
+                 b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+              end do
+                 ! solve d * l**t * x = b.
+              b( n, j ) = b( n, j ) / d( n )
+              do i = n - 1, 1, -1
+                 b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+              end do
+           end do
+           return
+     end subroutine stdlib_qptts2
+
+     !> DRSCL: multiplies an n-element real vector x by the real scalar 1/a.
+     !> This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_qrscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(qp), intent(in) :: sa
+           ! Array Arguments 
+           real(qp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(qp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_qscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_qrscl
+
+     !> DSB2ST_KERNELS: is an internal routine used by the DSYTRD_SB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_qsb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           real(qp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = ( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = zero
+                   end do
+                   ctmp = ( a( ofdpos, st ) )
+                   call stdlib_qlarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_qlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_qlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_qlarfx( 'LEFT', ln, lm, v( vpos ),( tau( taupos ) ),a( dpos-nb,&
+                                  j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )          =( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = zero
+                       end do
+                       ctmp = ( a( dpos-nb, j1 ) )
+                       call stdlib_qlarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_qlarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = zero
+                   end do
+                   call stdlib_qlarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_qlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_qlarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_qlarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = zero
+                       end do
+                       call stdlib_qlarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_qlarfx( 'LEFT', lm, ln-1, v( vpos ),( tau( taupos ) ),a( dpos+&
+                                 nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_qsb2st_kernels
+
+     !> DSBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A.
+
+     subroutine stdlib_qsbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indwrk, iscale
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = ab( 1, 1 )
+              else
+                 w( 1 ) = ab( kd+1, 1 )
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_qlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_qlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_qsbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_qsbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_dsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qsteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_qsbev
+
+     !> DSBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A. If eigenvectors are desired, it uses
+     !> a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qsbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, iwork, liwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, iscale, liwmin, llwrk2, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else
+              if( wantz ) then
+                 liwmin = 3 + 5*n
+                 lwmin = 1 + 5*n + 2*n**2
+              else
+                 liwmin = 1
+                 lwmin = 2*n
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ab( 1, 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_qlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_qlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_qsbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_qsbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_dstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_qgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_qlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_qscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qsbevd
+
+     !> DSBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric band matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_qsbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indwrk, &
+                     iscale, itmp1, j, jj, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 tmp1 = ab( 1, 1 )
+              else
+                 tmp1 = ab( kd+1, 1 )
+              end if
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = tmp1
+                 if( wantz )z( 1, 1 ) = one
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_qlansb( 'M', uplo, n, kd, ab, ldab, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_qlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_qlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_qsbtrd to reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           call stdlib_qsbtrd( jobz, uplo, n, kd, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_dsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_qcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_qlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_qstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstein.
+              do j = 1, m
+                 call stdlib_qcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_qgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qsbevx
+
+     !> DSBGST: reduces a real symmetric-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**T*S by DPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**T*A*X, where
+     !> X = S**(-1)*Q and Q is an orthogonal matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_qsbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(in) :: bb(ldbb,*)
+           real(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(qp) :: bii, ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_qlaset( 'FULL', n, n, zero, one, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_qpbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**t*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**t*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the cosines and sines of the rotations are stored in the array
+           ! work. the cosines of the 1st set of rotations are stored in
+           ! elements n+2:n+m-kb-1 and the sines of the 1st set in elements
+           ! 2:m-kb-1; the cosines of the 2nd set are stored in elements
+           ! n+m-kb+1:2*n and the sines of the second set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*ab( k-i+ka1, i ) -bb(&
+                        k-i+kb1, i )*ab( j-i+ka1, i ) +ab( ka1, i )*bb( j-i+kb1, i )*bb( k-i+kb1, &
+                                  i )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( k-i+kb1, i )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_qscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_qger( n-m, kbt, -one, x( m+1, i ), 1,bb( kb1-kbt, i ), &
+                              1, x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_qlartg( ab( k+1, i-k+ka ), ra1,work( n+i-k+ka-m ), work( i-k+&
+                                 ka-m ),ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_qlargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,work( &
+                           n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2-m ),work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_qrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, work( n+j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_qlargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,work( n+j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2 ),work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_qrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*ab( i-k+1, k ) -bb( i-k+1, &
+                                 k )*ab( i-j+1, j ) +ab( 1, i )*bb( i-j+1, j )*bb( i-k+1, k )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-k+1, k )*ab( i-j+1, j )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_qscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_qger( n-m, kbt, -one, x( m+1, i ), 1,bb( kbt+1, i-kbt )&
+                              , ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_qlartg( ab( ka1-k, i ), ra1, work( n+i-k+ka-m ),work( i-k+ka-m &
+                                 ), ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( ka1, i-k )
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_qlargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           work( n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2-m ), work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_qrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,work( n+j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_qlargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,work( n+j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2 ), work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_qrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i1, i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*ab( i-k+ka1, k ) -bb(&
+                        i-k+kb1, k )*ab( i-j+ka1, j ) +ab( ka1, i )*bb( i-j+kb1, j )*bb( i-k+kb1, &
+                                  k )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-k+kb1, k )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_qscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_qger( nx, kbt, -one, x( 1, i ), 1, bb( kb, i+1 ),ldbb-&
+                              1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_qlartg( ab( k+1, i ), ra1, work( n+i+k-ka ),work( i+k-ka ), ra &
+                                 )
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( 1, i+k )
+                       ab( 1, i+k ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_qlargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,work( &
+                           n+j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 work( n+j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_qrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_qlargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca,&
+                                 work( n+m-kb+j1 ), work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_qrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i1, i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*ab( k-i+1, i ) -bb( k-i+1, &
+                                 i )*ab( j-i+1, i ) +ab( 1, i )*bb( j-i+1, i )*bb( k-i+1, i )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( k-i+1, i )*ab( j-i+1, i )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_qscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_qger( nx, kbt, -one, x( 1, i ), 1, bb( 2, i ), 1,x( 1, &
+                              i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_qlartg( ab( ka1-k, i+k-ka ), ra1,work( n+i+k-ka ), work( i+k-&
+                                 ka ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_qlargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,work( n+&
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_qrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_qlargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_qlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_qlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_qrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_qlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_qsbgst
+
+     !> DSBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_qsbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_qpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_qsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work( indwrk ), &
+                     iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_qsbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_dsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qsteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           return
+     end subroutine stdlib_qsbgv
+
+     !> DSBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of the
+     !> form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric and
+     !> banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_qsbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llwrk2, lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 5*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_qpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_qsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, iinfo )
+                     
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_qsbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_qsterf. for eigenvectors, call stdlib_dstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_qgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_qlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qsbgvd
+
+     !> DSBGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_qsbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(qp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwo, indwrk, itmp1, j, &
+                     jj, nsplit
+           real(qp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_qpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_qsbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, iinfo )
+                     
+           ! reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_qsbtrd( vect, uplo, n, ka, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_dsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_qcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_qsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_qlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_qsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired,
+           ! call stdlib_qstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_qstebz( range, order, n, vl, vu, il, iu, abstol,work( indd ), work( inde ),&
+            m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info )
+                      
+           if( wantz ) then
+              call stdlib_qstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply transformation matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstein.
+              do j = 1, m
+                 call stdlib_qcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_qgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qsbgvx
+
+     !> DSBTRD: reduces a real symmetric band matrix A to symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_qsbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           real(qp), intent(out) :: d(*), e(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_qlaset( 'FULL', n, n, zero, one, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the cosines and sines of the plane rotations are stored in the
+           ! arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_qlargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_qlartv or stdlib_qrot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_qlartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_qrot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_qlartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_qrot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_qlar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_qlartv or stdlib_qrot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_qlartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_qrot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_qrot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_qrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_qrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( kd, i+1 )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( kd1, i )
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_qlargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_qlartv or stdlib_qrot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_qlartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_qrot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_qlartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_qrot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_qlar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_qlartv or stdlib_qrot is used
+                       if( nr>0 ) then
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_qlartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_qrot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_qrot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_qrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_qrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( 2, i )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_qsbtrd
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> DSFRK: performs one of the symmetric rank--k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n symmetric
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_qsfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_qsyrk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = zero
+              end do
+              return
+           end if
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_qsyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_qsyrk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_qgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_qsyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_qsyrk( 'U', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_qgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_qsyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_qsyrk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_qgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, beta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_qsyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_qsyrk( 'U', 'T', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_qgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, beta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_qsyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_qsyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_qgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n1+1, 1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_qsyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_qsyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_qgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n1+1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_qsyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_qsyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_qgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_qsyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_qsyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_qgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_qsyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_qsyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_qgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_qsyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_qsyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_qgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_qsyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_qsyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_qgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_qsyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_qsyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_qgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_qsyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_qsyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_qgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_qsyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_qsyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_qgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_qsyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_qsyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_qgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_qsyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_qsyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_qgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qsfrk
+
+     !> DSGESV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> DSGESV first attempts to factorize the matrix in SINGLE PRECISION
+     !> and use this factorization within an iterative refinement procedure
+     !> to produce a solution with DOUBLE PRECISION normwise backward error
+     !> quality (see below). If the approach fails the method switches to a
+     !> DOUBLE PRECISION factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio SINGLE PRECISION performance over DOUBLE PRECISION
+     !> performance is too small. A reasonable strategy should take the
+     !> number of right-hand sides and the size of the matrix into account.
+     !> This might be done with a call to ILAENV in the future. Up to now, we
+     !> always try iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_qsgesv( n, nrhs, a, lda, ipiv, b, ldb, x, ldx, work,swork, iter, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(out) :: swork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: b(ldb,*)
+           real(qp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(qp), parameter :: bwdmax = 1.0e+00_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(qp) :: anrm, cte, eps, rnrm, xnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSGESV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip double precision iterative refinement if a priori slower
+           ! than quad precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_qlange( 'I', n, n, a, lda, work )
+           eps = stdlib_qlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=qp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from quad precision to double precision and store the
+           ! result in sx.
+           call stdlib_qlag2s( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from quad precision to double precision and store the
+           ! result in sa.
+           call stdlib_qlag2s( n, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the lu factorization of sa.
+           call stdlib_dgetrf( n, n, swork( ptsa ), n, ipiv, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_dgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), n, &
+                     info )
+           ! convert sx back to quad precision
+           call stdlib_dlag2q( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, negone, a,lda, x, ldx, &
+                     one, work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = abs( x( stdlib_iqamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = abs( work( stdlib_iqamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from quad precision to double precision
+              ! and store the result in sx.
+              call stdlib_qlag2s( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_dgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), &
+                        n, info )
+              ! convert sx back to quad precision and update the current
+              ! iterate.
+              call stdlib_dlag2q( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_qaxpy( n, one, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, negone,a, lda, x, &
+                        ldx, one, work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = abs( x( stdlib_iqamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = abs( work( stdlib_iqamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow up
+           ! on quad precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to quad precision.
+           call stdlib_qgetrf( n, n, a, lda, ipiv, info )
+           if( info/=0 )return
+           call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, x, ldx,info )
+           return
+     end subroutine stdlib_qsgesv
+
+     !> DSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric packed matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qspcon( uplo, n, ap, ipiv, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_qsptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_qspcon
+
+     !> DSPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A in packed storage.
+
+     subroutine stdlib_qspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_qsptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_qsptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_qopgtr to generate the orthogonal matrix, then call stdlib_qsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_qopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              call stdlib_qsteqr( jobz, n, w, work( inde ), z, ldz, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_qspev
+
+     !> DSPEVD: computes all the eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage. If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,iwork, liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwrk, iscale, liwmin, llwork, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              iwork( 1 ) = liwmin
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_qsptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_qsptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_qstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_qopmtr to multiply it by the
+           ! householder transformations represented in ap.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              llwork = lwork - indwrk + 1
+              call stdlib_qstedc( 'I', n, w, work( inde ), z, ldz, work( indwrk ),llwork, iwork, &
+                        liwork, info )
+              call stdlib_qopmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_qscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qspevd
+
+     !> DSPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage.  Eigenvalues/vectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_qspevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, iwork, ifail,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = ap( 1 )
+              else
+                 if( vl<ap( 1 ) .and. vu>=ap( 1 ) ) then
+                    m = 1
+                    w( 1 ) = ap( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_qlansp( 'M', uplo, n, ap, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_qsptrd to reduce symmetric packed matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           call stdlib_qsptrd( uplo, n, ap, work( indd ), work( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_qopgtr and stdlib_dsteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_qcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_qopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_qstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstein.
+              call stdlib_qopmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qspevx
+
+     !> DSPGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by DPPTRF.
+
+     pure subroutine stdlib_qspgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(qp) :: ajj, akk, bjj, bkk, ct
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    bjj = bp( jj )
+                    call stdlib_qtpsv( uplo, 'TRANSPOSE', 'NONUNIT', j, bp,ap( j1 ), 1 )
+                    call stdlib_qspmv( uplo, j-1, -one, ap, bp( j1 ), 1, one,ap( j1 ), 1 )
+                    call stdlib_qscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_qdot( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_qscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_qaxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_qspr2( uplo, n-k, -one, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 )&
+                                  )
+                       call stdlib_qaxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_qtpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    call stdlib_qtpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_qaxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_qspr2( uplo, k-1, one, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_qaxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_qscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = ap( jj )
+                    bjj = bp( jj )
+                    ap( jj ) = ajj*bjj + stdlib_qdot( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_qscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_qspmv( uplo, n-j, one, ap( j1j1 ), bp( jj+1 ), 1,one, ap( jj+1 ), &
+                              1 )
+                    call stdlib_qtpmv( uplo, 'TRANSPOSE', 'NON-UNIT', n-j+1,bp( jj ), ap( jj ), 1 &
+                              )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qspgst
+
+     !> DSPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_qspgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*), bp(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_qpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_qspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_qtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_qtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qspgv
+
+     !> DSPGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qspgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: ap(*), bp(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of bp.
+           call stdlib_qpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_qspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, iwork,liwork, info )
+                     
+           lwmin = max( real( lwmin,KIND=qp), real( work( 1 ),KIND=qp) )
+           liwmin = max( real( liwmin,KIND=qp), real( iwork( 1 ),KIND=qp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_qtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t *y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_qtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qspgvd
+
+     !> DSPGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric, stored in packed storage, and B
+     !> is also positive definite.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_qspgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: ap(*), bp(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_qpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_qspevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, m
+                    call stdlib_qtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_qtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qspgvx
+
+     !> DSPOSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> DSPOSV first attempts to factorize the matrix in SINGLE PRECISION
+     !> and use this factorization within an iterative refinement procedure
+     !> to produce a solution with DOUBLE PRECISION normwise backward error
+     !> quality (see below). If the approach fails the method switches to a
+     !> DOUBLE PRECISION factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio SINGLE PRECISION performance over DOUBLE PRECISION
+     !> performance is too small. A reasonable strategy should take the
+     !> number of right-hand sides and the size of the matrix into account.
+     !> This might be done with a call to ILAENV in the future. Up to now, we
+     !> always try iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_qsposv( uplo, n, nrhs, a, lda, b, ldb, x, ldx, work,swork, iter, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: swork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: b(ldb,*)
+           real(qp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(qp), parameter :: bwdmax = 1.0e+00_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(qp) :: anrm, cte, eps, rnrm, xnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPOSV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip double precision iterative refinement if a priori slower
+           ! than quad precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_qlansy( 'I', uplo, n, a, lda, work )
+           eps = stdlib_qlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=qp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from quad precision to double precision and store the
+           ! result in sx.
+           call stdlib_qlag2s( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from quad precision to double precision and store the
+           ! result in sa.
+           call stdlib_qlat2s( uplo, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the cholesky factorization of sa.
+           call stdlib_dpotrf( uplo, n, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_dpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+           ! convert sx back to quad precision
+           call stdlib_dlag2q( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_qsymm( 'LEFT', uplo, n, nrhs, negone, a, lda, x, ldx, one,work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = abs( x( stdlib_iqamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = abs( work( stdlib_iqamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from quad precision to double precision
+              ! and store the result in sx.
+              call stdlib_qlag2s( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_dpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+              ! convert sx back to quad precision and update the current
+              ! iterate.
+              call stdlib_dlag2q( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_qaxpy( n, one, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_qsymm( 'L', uplo, n, nrhs, negone, a, lda, x, ldx, one,work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = abs( x( stdlib_iqamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = abs( work( stdlib_iqamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow
+           ! up on quad precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to quad precision.
+           call stdlib_qpotrf( uplo, n, a, lda, info )
+           if( info/=0 )return
+           call stdlib_qlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qpotrs( uplo, n, nrhs, a, lda, x, ldx, info )
+           return
+     end subroutine stdlib_qsposv
+
+     !> DSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_qsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n, info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_qsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qsptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qsprfs
+
+     !> DSPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_qspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_qsptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_qspsv
+
+     !> DSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: afp(*)
+           real(qp), intent(in) :: ap(*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_qcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_qsptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qspcon( uplo, n, afp, ipiv, anorm, rcond, work, iwork, info )
+           ! compute the solution vectors x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qsptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_qsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_qspsvx
+
+     !> DSPTRD: reduces a real symmetric matrix A stored in packed form to
+     !> symmetric tridiagonal form T by an orthogonal similarity
+     !> transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_qsptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           real(qp) :: alpha, taui
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_qlarfg( i, ap( i1+i-1 ), ap( i1 ), 1, taui )
+                 e( i ) = ap( i1+i-1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_qspmv( uplo, i, taui, ap, ap( i1 ), 1, zero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_qdot( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_qaxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_qspr2( uplo, i, -one, ap( i1 ), 1, tau, 1, ap )
+                    ap( i1+i-1 ) = e( i )
+                 end if
+                 d( i+1 ) = ap( i1+i )
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = ap( 1 )
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_qlarfg( n-i, ap( ii+1 ), ap( ii+2 ), 1, taui )
+                 e( i ) = ap( ii+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_qspmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,zero, tau( i ), &
+                              1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_qdot( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_qaxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_qspr2( uplo, n-i, -one, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                    ap( ii+1 ) = e( i )
+                 end if
+                 d( i ) = ap( ii )
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = ap( ii )
+           end if
+           return
+     end subroutine stdlib_qsptrd
+
+     !> DSPTRF: computes the factorization of a real symmetric matrix A stored
+     !> in packed format using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_qsptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, ap( kc ), 1 )
+                 colmax = abs( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_iqamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_qswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / ap( kc+k-1 )
+                    call stdlib_qspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_qscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, ap( kc+1 ), 1 )
+                 colmax = abs( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iqamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_qswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = one / ap( kc )
+                       call stdlib_qspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_qscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_qsptrf
+
+     !> DSPTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSPTRF.
+
+     pure subroutine stdlib_qsptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: ap(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_qdot( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_qdot( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_qdot( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_qcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, k-1, -one, ap, work, 1, zero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_qdot( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_qswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, n-k, -one, ap( kc+n-k+1 ), work, 1,zero, ap( kc+1 ), &
+                              1 )
+                    ap( kc ) = ap( kc ) - stdlib_qdot( n-k, work, 1, ap( kc+1 ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( kc+&
+                              1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_qdot( n-k, work, 1, ap( kc+1 ), 1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_qdot( n-k, ap( kc+1 ), 1,ap( kcnext+2 &
+                              ), 1 )
+                    call stdlib_qcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_qspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_qdot( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_qswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_qsptri
+
+     !> DSPTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSPTRF.
+
+     pure subroutine stdlib_qsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_qger( k-1, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_qswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_qger( k-2, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_qger( k-2, nrhs, -one, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, 1 &
+                           ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,ap( kc+k ), 1, one, b( k+&
+                           1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qger( n-k, nrhs, -one, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+1,&
+                            1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_qswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_qger( n-k-1, nrhs, -one, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, 1 )&
+                              , ldb )
+                    call stdlib_qger( n-k-1, nrhs, -one, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc+1 ), &
+                              1, one, b( k, 1 ), ldb )
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_qsptrs
+
+     !> DSTEBZ: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T.  The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_qstebz( range, order, n, vl, vu, il, iu, abstol, d, e,m, nsplit, w, &
+               iblock, isplit, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*)
+           real(qp), intent(in) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: fudge = 2.1_qp
+           real(qp), parameter :: relfac = 2.0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iorder, &
+                     iout, irange, itmax, itmp1, iw, iwoff, j, jb, jdisc, je, nb, nwl, nwu
+           real(qp) :: atoli, bnorm, gl, gu, pivmin, rtoli, safemn, tmp1, tmp2, tnorm, ulp, wkill,&
+                      wl, wlu, wu, wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = 1
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = 2
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = 3
+           else
+              irange = 0
+           end if
+           ! decode order
+           if( stdlib_lsame( order, 'B' ) ) then
+              iorder = 2
+           else if( stdlib_lsame( order, 'E' ) ) then
+              iorder = 1
+           else
+              iorder = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( iorder<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==2 ) then
+              if( vl>=vu )info = -5
+           else if( irange==3 .and. ( il<1 .or. il>max( 1, n ) ) )then
+              info = -6
+           else if( irange==3 .and. ( iu<min( n, il ) .or. iu>n ) )then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEBZ', -info )
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! simplifications:
+           if( irange==3 .and. il==1 .and. iu==n )irange = 1
+           ! get machine constants
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           safemn = stdlib_qlamch( 'S' )
+           ulp = stdlib_qlamch( 'P' )
+           rtoli = ulp*relfac
+           nb = stdlib_ilaenv( 1, 'DSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 )nb = 0
+           ! special case when n=1
+           if( n==1 ) then
+              nsplit = 1
+              isplit( 1 ) = 1
+              if( irange==2 .and. ( vl>=d( 1 ) .or. vu<d( 1 ) ) ) then
+                 m = 0
+              else
+                 w( 1 ) = d( 1 )
+                 iblock( 1 ) = 1
+                 m = 1
+              end if
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           work( n ) = zero
+           pivmin = one
+           do j = 2, n
+              tmp1 = e( j-1 )**2
+              if( abs( d( j )*d( j-1 ) )*ulp**2+safemn>tmp1 ) then
+                 isplit( nsplit ) = j - 1
+                 nsplit = nsplit + 1
+                 work( j-1 ) = zero
+              else
+                 work( j-1 ) = tmp1
+                 pivmin = max( pivmin, tmp1 )
+              end if
+           end do
+           isplit( nsplit ) = n
+           pivmin = pivmin*safemn
+           ! compute interval and atoli
+           if( irange==3 ) then
+              ! range='i': compute the interval containing eigenvalues
+                         ! il through iu.
+              ! compute gershgorin interval for entire (split) matrix
+              ! and use it as the initial interval
+              gu = d( 1 )
+              gl = d( 1 )
+              tmp1 = zero
+              do j = 1, n - 1
+                 tmp2 = sqrt( work( j ) )
+                 gu = max( gu, d( j )+tmp1+tmp2 )
+                 gl = min( gl, d( j )-tmp1-tmp2 )
+                 tmp1 = tmp2
+              end do
+              gu = max( gu, d( n )+tmp1 )
+              gl = min( gl, d( n )-tmp1 )
+              tnorm = max( abs( gl ), abs( gu ) )
+              gl = gl - fudge*tnorm*ulp*n - fudge*two*pivmin
+              gu = gu + fudge*tnorm*ulp*n + fudge*pivmin
+              ! compute iteration parameters
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_qlaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin, d, e,work, iwork( &
+                        5 ), work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           else
+              ! range='a' or 'v' -- set atoli
+              tnorm = max( abs( d( 1 ) )+abs( e( 1 ) ),abs( d( n ) )+abs( e( n-1 ) ) )
+              do j = 2, n - 1
+                 tnorm = max( tnorm, abs( d( j ) )+abs( e( j-1 ) )+abs( e( j ) ) )
+              end do
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              if( irange==2 ) then
+                 wl = vl
+                 wu = vu
+              else
+                 wl = zero
+                 wu = zero
+              end if
+           end if
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jb = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jb )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! special case -- in=1
+                 if( irange==1 .or. wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( irange==1 .or. wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==1 .or. ( wl<d( ibegin )-pivmin .and. wu>=d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    iblock( m ) = jb
+                 end if
+              else
+                 ! general case -- in > 1
+                 ! compute gershgorin interval
+                 ! and use it as the initial interval
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend - 1
+                    tmp2 = abs( e( j ) )
+                    gu = max( gu, d( j )+tmp1+tmp2 )
+                    gl = min( gl, d( j )-tmp1-tmp2 )
+                    tmp1 = tmp2
+                 end do
+                 gu = max( gu, d( iend )+tmp1 )
+                 gl = min( gl, d( iend )-tmp1 )
+                 bnorm = max( abs( gl ), abs( gu ) )
+                 gl = gl - fudge*bnorm*ulp*in - fudge*pivmin
+                 gu = gu + fudge*bnorm*ulp*in + fudge*pivmin
+                 ! compute atoli for the current submatrix
+                 if( abstol<=zero ) then
+                    atoli = ulp*max( abs( gl ), abs( gu ) )
+                 else
+                    atoli = abstol
+                 end if
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! set up initial interval
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_qlaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 &
+                           ), iblock( m+1 ), iinfo )
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_qlaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( &
+                            m+1 ), iblock( m+1 ), iinfo )
+                 ! copy eigenvalues into w and iblock
+                 ! use -jb for block number for unconverged eigenvalues.
+                 do j = 1, iout
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! flag non-convergence.
+                    if( j>iout-iinfo ) then
+                       ncnvrg = .true.
+                       ib = -jb
+                    else
+                       ib = jb
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==3 ) then
+              im = 0
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 .or. idiscu>0 ) then
+                 do je = 1, m
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic:
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by simply finding the smallest/largest
+                 ! eigenvalue(s).
+                 ! (if n(w) is monotone non-decreasing, this should never
+                     ! happen.)
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           ! if order='b', do nothing -- the eigenvalues are already sorted
+              ! by block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( iorder==1 .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    itmp1 = iblock( ie )
+                    w( ie ) = w( je )
+                    iblock( ie ) = iblock( je )
+                    w( je ) = tmp1
+                    iblock( je ) = itmp1
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_qstebz
+
+     !> DSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band real symmetric matrix can also be
+     !> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLAED3 for details.
+
+     pure subroutine stdlib_qstedc( compz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, lwmin, m, smlsiz, start, &
+                     storez, strtrw
+           real(qp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max,mod,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'DSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else if( n<=smlsiz ) then
+                 liwmin = 1
+                 lwmin = 2*( n - 1 )
+              else
+                 lgn = int( log( real( n,KIND=qp) )/log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( icompz==1 ) then
+                    lwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                    liwmin = 6 + 6*n + 5*n*lgn
+                 else if( icompz==2 ) then
+                    lwmin = 1 + 4*n + n**2
+                    liwmin = 3 + 5*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not. lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not. lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEDC', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_qsterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_qsterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_qsterf( n, d, e, info )
+              go to 50
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_qsteqr( compz, n, d, e, z, ldz, work, info )
+           else
+              ! if compz = 'v', the z matrix must be stored elsewhere for later
+              ! use.
+              if( icompz==1 ) then
+                 storez = 1 + n*n
+              else
+                 storez = 1
+              end if
+              if( icompz==2 ) then
+                 call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+              end if
+              ! scale.
+              orgnrm = stdlib_qlanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 50
+              eps = stdlib_qlamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              10 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 20 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 20
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m==1 ) then
+                    start = finish + 1
+                    go to 10
+                 end if
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_qlanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_qlascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_qlascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    if( icompz==1 ) then
+                       strtrw = 1
+                    else
+                       strtrw = start
+                    end if
+                    call stdlib_qlaed0( icompz, n, m, d( start ), e( start ),z( strtrw, start ), &
+                              ldz, work( 1 ), n,work( storez ), iwork, info )
+                    if( info/=0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 50
+                    end if
+                    ! scale back.
+                    call stdlib_qlascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    if( icompz==1 ) then
+                       ! since qr won't update a z matrix which is larger than
+                       ! the length of d, we must solve the sub-problem in a
+                       ! workspace and then multiply back into z.
+                       call stdlib_qsteqr( 'I', m, d( start ), e( start ), work, m,work( m*m+1 ), &
+                                 info )
+                       call stdlib_qlacpy( 'A', n, m, z( 1, start ), ldz,work( storez ), n )
+                                 
+                       call stdlib_qgemm( 'N', 'N', n, m, m, one,work( storez ), n, work, m, zero,&
+                                 z( 1, start ), ldz )
+                    else if( icompz==2 ) then
+                       call stdlib_qsteqr( 'I', m, d( start ), e( start ),z( start, start ), ldz, &
+                                 work, info )
+                    else
+                       call stdlib_qsterf( m, d( start ), e( start ), info )
+                    end if
+                    if( info/=0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 50
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 10
+              end if
+              ! endwhile
+              if( icompz==0 ) then
+                ! use quick sort
+                call stdlib_qlasrt( 'I', n, d, info )
+              else
+                ! use selection sort to minimize swaps of eigenvectors
+                do ii = 2, n
+                   i = ii - 1
+                   k = i
+                   p = d( i )
+                   do j = ii, n
+                      if( d( j )<p ) then
+                         k = j
+                         p = d( j )
+                      end if
+                   end do
+                   if( k/=i ) then
+                      d( k ) = d( i )
+                      d( i ) = p
+                      call stdlib_qswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                   end if
+                end do
+              end if
+           end if
+           50 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qstedc
+
+     !> DSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> DSTEGR is a compatibility wrapper around the improved DSTEMR routine.
+     !> See DSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : DSTEGR and DSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_qstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*)
+           real(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_qstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_qstegr
+
+     !> DSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+
+     pure subroutine stdlib_qstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(in) :: d(*), e(*), w(*)
+           real(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: odm3 = 1.0e-3_qp
+           real(qp), parameter :: odm1 = 1.0e-1_qp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, nblk, nrmchk
+           real(qp) :: dtpcrt, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, tol, xj, xjm, &
+                     ztr
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_qlamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_qlarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_160: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              dtpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_150: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_160
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 120
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_qlarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_qcopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_qcopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_qcopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_qlagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 100
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_qscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_qlagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 90
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ztr = -stdlib_qdot( blksiz, work( indrv1+1 ), 1, z( b1, i ),1 )
+                       call stdlib_qaxpy( blksiz, ztr, z( b1, i ), 1,work( indrv1+1 ), 1 )
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 90 continue
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<dtpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 110
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 100 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 110 continue
+                 scl = one / stdlib_qnrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_qscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 120 continue
+                 do i = 1, n
+                    z( i, j ) = zero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = work( indrv1+i )
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_150
+           end do loop_160
+           return
+     end subroutine stdlib_qstein
+
+     !> DSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.DSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+
+     pure subroutine stdlib_qstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*)
+           real(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: minrgp = 1.0e-3_qp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(qp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_qstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_qlarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_qlarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_qlarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_qlarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_qlae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_qlaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+           ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_qlarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_qlanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_qscal( n, scale, d, 1 )
+                 call stdlib_qscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_qlarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_qlarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_qcopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_qlarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_qlarre computes the eigenvalues to less than full precision.
+                 ! stdlib_qlarrv will refine the eigenvalue approximations, and we can
+                 ! need less accurate initial bisection in stdlib_qlarre.
+                 ! note: these settings do only affect the subset case and stdlib_qlarre
+                 rtol1 = sqrt(eps)
+                 rtol2 = max( sqrt(eps)*5.0e-3_qp, four * eps )
+              endif
+              call stdlib_qlarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_qlarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_qlarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_qlarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_qlarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_qlarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_qlarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_qscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_qlasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qstemr
+
+     !> DSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band symmetric matrix can also be found
+     !> if DSYTRD or DSPTRD or DSBTRD has been used to reduce this matrix to
+     !> tridiagonal form.
+
+     pure subroutine stdlib_qsteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(qp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = one
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_qlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_qlaset( 'FULL', n, n, zero, one, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_qlanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_qlae2 or stdlib_dlaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_qlaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_qlasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_qlae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_qlapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_qlartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_qlasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_qlae2 or stdlib_dlaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_qlaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_qlasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_qlae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_qlapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_qlartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_qlasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_qlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_qlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 190
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_qlasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           190 continue
+           return
+     end subroutine stdlib_qsteqr
+
+     !> DSTERF: computes all eigenvalues of a symmetric tridiagonal matrix
+     !> using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+
+     pure subroutine stdlib_qsterf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iscale, jtot, l, l1, lend, lendsv, lsv, m, nmaxit
+           real(qp) :: alpha, anorm, bb, c, eps, eps2, gamma, oldc, oldgam, p, r, rt1, rt2, rte, &
+                     s, safmax, safmin, sigma, ssfmax, ssfmin, rmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! quick return if possible
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'DSTERF', -info )
+              return
+           end if
+           if( n<=1 )return
+           ! determine the unit roundoff for this environment.
+           eps = stdlib_qlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           rmax = stdlib_qlamch( 'O' )
+           ! compute the eigenvalues of the tridiagonal matrix.
+           nmaxit = n*maxit
+           sigma = zero
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           10 continue
+           if( l1>n )go to 170
+           if( l1>1 )e( l1-1 ) = zero
+           do m = l1, n - 1
+              if( abs( e( m ) )<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) &
+                        then
+                 e( m ) = zero
+                 go to 30
+              end if
+           end do
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_qlanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( (anorm>ssfmax) ) then
+              iscale = 1
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           do i = l, lend - 1
+              e( i ) = e( i )**2
+           end do
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>=l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              50 continue
+              if( l/=lend ) then
+                 do m = l, lend - 1
+                    if( abs( e( m ) )<=eps2*abs( d( m )*d( m+1 ) ) )go to 70
+                 end do
+              end if
+              m = lend
+              70 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 90
+              ! if remaining matrix is 2 by 2, use stdlib_qlae2 to compute its
+              ! eigenvalues.
+              if( m==l+1 ) then
+                 rte = sqrt( e( l ) )
+                 call stdlib_qlae2( d( l ), rte, d( l+1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 50
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l ) )
+              sigma = ( d( l+1 )-p ) / ( two*rte )
+              r = stdlib_qlapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m - 1, l, -1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m-1 )e( i+1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i+1 ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l ) = s*p
+              d( l ) = sigma + gamma
+              go to 50
+              ! eigenvalue found.
+              90 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 50
+              go to 150
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              100 continue
+              do m = l, lend + 1, -1
+                 if( abs( e( m-1 ) )<=eps2*abs( d( m )*d( m-1 ) ) )go to 120
+              end do
+              m = lend
+              120 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 140
+              ! if remaining matrix is 2 by 2, use stdlib_qlae2 to compute its
+              ! eigenvalues.
+              if( m==l-1 ) then
+                 rte = sqrt( e( l-1 ) )
+                 call stdlib_qlae2( d( l ), rte, d( l-1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l-1 ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 100
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l-1 ) )
+              sigma = ( d( l-1 )-p ) / ( two*rte )
+              r = stdlib_qlapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m, l - 1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m )e( i-1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i+1 )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l-1 ) = s*p
+              d( l ) = sigma + gamma
+              go to 100
+              ! eigenvalue found.
+              140 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 100
+              go to 150
+           end if
+           ! undo scaling if necessary
+           150 continue
+           if( iscale==1 )call stdlib_qlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           if( iscale==2 )call stdlib_qlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 180
+           ! sort eigenvalues in increasing order.
+           170 continue
+           call stdlib_qlasrt( 'I', n, d, info )
+           180 continue
+           return
+     end subroutine stdlib_qsterf
+
+     !> DSTEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure subroutine stdlib_qstev( jobz, n, d, e, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: imax, iscale
+           real(qp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_qlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( n, sigma, d, 1 )
+              call stdlib_qscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_qsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, d, e, info )
+           else
+              call stdlib_qsteqr( 'I', n, d, e, z, ldz, work, info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, d, 1 )
+           end if
+           return
+     end subroutine stdlib_qstev
+
+     !> DSTEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix. If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_qstevd( jobz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iscale, liwmin, lwmin
+           real(qp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           liwmin = 1
+           lwmin = 1
+           if( n>1 .and. wantz ) then
+              lwmin = 1 + 4*n + n**2
+              liwmin = 3 + 5*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_qlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( n, sigma, d, 1 )
+              call stdlib_qscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_qstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, d, e, info )
+           else
+              call stdlib_qstedc( 'I', n, d, e, z, ldz, work, lwork, iwork, liwork,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_qscal( n, one / sigma, d, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qstevd
+
+     !> DSTEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+     !> Whenever possible, DSTEVR calls DSTEMR to compute the
+     !> eigenspectrum using Relatively Robust Representations.  DSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows. For the i-th
+     !> unreduced block of T,
+     !> (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T
+     !> is a relatively robust representation,
+     !> (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high
+     !> relative accuracy by the dqds algorithm,
+     !> (c) If there is a cluster of close eigenvalues, "choose" sigma_i
+     !> close to the cluster, and go to step (a),
+     !> (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T,
+     !> compute the corresponding eigenvector by forming a
+     !> rank-revealing twisted factorization.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon,
+     !> Computer Science Division Technical Report No. UCB//CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : DSTEVR calls DSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> DSTEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of DSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     pure subroutine stdlib_qstevr( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, lquery, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, imax, indibl, indifl, indisp, indiwo, iscale, itmp1, j, jj, &
+                     liwmin, lwmin, nsplit
+           real(qp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'DSTEVR', 'N', 1, 2, 3, 4 )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 20*n )
+           liwmin = max( 1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -14
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -17
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           tnrm = stdlib_qlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( n, sigma, d, 1 )
+              call stdlib_qscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: these indices are used only
+           ! if stdlib_qsterf or stdlib_qstemr fail.
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_qstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_qstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_qstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indisp + n
+           ! if all eigenvalues are desired, then
+           ! call stdlib_qsterf or stdlib_qstemr.  if this fails for some eigenvalue, then
+           ! try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ieeeok==1 ) then
+              call stdlib_qcopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n, d, 1, w, 1 )
+                 call stdlib_qsterf( n, w, work, info )
+              else
+                 call stdlib_qcopy( n, d, 1, work( n+1 ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_qstemr( jobz, 'A', n, work( n+1 ), work, vl, vu, il,iu, m, w, z, ldz,&
+                            n, isuppz, tryrac,work( 2*n+1 ), lwork-2*n, iwork, liwork, info )
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 10
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_qstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ), work,iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_qstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work, &
+                        iwork( indiwo ), iwork( indifl ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           10 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( i )
+                    w( i ) = w( j )
+                    iwork( i ) = iwork( j )
+                    w( j ) = tmp1
+                    iwork( j ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+            ! causes problems with tests 19
+            ! if (wantz .and. indeig ) z( 1,1) = z(1,1) / 1.002_qp + .002
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qstevr
+
+     !> DSTEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix A.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_qstevx( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, imax, indibl, indisp, indiwo, indwrk, iscale, itmp1, j, jj, &
+                     nsplit
+           real(qp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSTEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           tnrm = stdlib_qlanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_qscal( n, sigma, d, 1 )
+              call stdlib_qscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! if all eigenvalues are desired and abstol is less than zero, then
+           ! call stdlib_qsterf or stdlib_dsteqr.  if this fails for some eigenvalue, then
+           ! try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_qcopy( n, d, 1, w, 1 )
+              call stdlib_qcopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              indwrk = n + 1
+              if( .not.wantz ) then
+                 call stdlib_qsterf( n, w, work, info )
+              else
+                 call stdlib_qsteqr( 'I', n, w, work, z, ldz, work( indwrk ), info )
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indwrk = 1
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ),work( indwrk ), iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_qstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work( &
+                        indwrk ), iwork( indiwo ), ifail,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qstevx
+
+     !> DSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_qsytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_qsycon
+
+     !> DSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by DSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_qsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_qsytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_qsycon_rook
+
+     !> DSYCONV: convert A given by TRF into L and D and vice-versa.
+     !> Get Non-diag elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_qsyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           real(qp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+            ! a is upper
+            ! convert a (a is upper)
+              ! convert value
+              if ( convert ) then
+                 i=n
+                 e(1)=zero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=zero
+                       a(i-1,i)=zero
+                       i=i-1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i-1
+                 end do
+              ! convert permutations
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0) then
+                    ip=ipiv(i)
+                    if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                    if( i < n) then
+                  do j= i+1,n
+                      temp=a(ip,j)
+                      a(ip,j)=a(i-1,j)
+                      a(i-1,j)=temp
+                  end do
+                     endif
+                     i=i-1
+                endif
+                i=i-1
+             end do
+              else
+            ! revert a (a is upper)
+              ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+              ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+            ! a is lower
+              if ( convert ) then
+            ! convert a (a is lower)
+              ! convert value
+                 i=1
+                 e(n)=zero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=zero
+                       a(i+1,i)=zero
+                       i=i+1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i+1
+                 end do
+              ! convert permutations
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                    ip=ipiv(i)
+                    if (i > 1) then
+                    do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i,j)
+                      a(i,j)=temp
+                    end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                   if (i > 1) then
+                   do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i+1,j)
+                      a(i+1,j)=temp
+                   end do
+                   endif
+                   i=i+1
+                endif
+                i=i+1
+             end do
+              else
+            ! revert a (a is lower)
+              ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+              ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qsyconv
+
+     !> If parameter WAY = 'C':
+     !> DSYCONVF: converts the factorization output format used in
+     !> DSYTRF provided on entry in parameter A into the factorization
+     !> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in DSYTRF into
+     !> the format used in DSYTRF_RK (or DSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> DSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in DSYTRF_RK
+     !> (or DSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in DSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in DSYTRF_RK
+     !> (or DSYTRF_BK) into the format used in DSYTRF.
+
+     pure subroutine stdlib_qsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_qswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_qswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_qswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_qswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_qsyconvf
+
+     !> If parameter WAY = 'C':
+     !> DSYCONVF_ROOK: converts the factorization output format used in
+     !> DSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for DSYTRF_ROOK and
+     !> DSYTRF_RK (or DSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> DSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in DSYTRF_RK
+     !> (or DSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in DSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for DSYTRF_ROOK and
+     !> DSYTRF_RK (or DSYTRF_BK) is the same and is not converted.
+
+     pure subroutine stdlib_qsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_qswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_qswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_qswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_qswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_qswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_qswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_qsyconvf_rook
+
+     !> DSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_qsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(qp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'DSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_qp / s( j )
+           end do
+           tol = one / sqrt( 2.0_qp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + s( i )*work( i )
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_qlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = abs( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( work( i ) - t*si )
+                 c0 = -(t*si)*si + 2*work( i )*si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + work( i ) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_qlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_qlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_qsyequb
+
+     !> DSYEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A.
+
+     subroutine stdlib_qsyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 3*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              work( 1 ) = 2
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_qlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_qsytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_qsytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_qorgtr to generate the orthogonal matrix, then call stdlib_qsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qorgtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              call stdlib_qsteqr( jobz, n, w, work( inde ), a, lda, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsyev
+
+     !> DSYEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A. If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> Because of large use of BLAS of level 3, DSYEVD needs N**2 more
+     !> workspace than DSYEVX.
+
+     subroutine stdlib_qsyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwk2, indwrk, iscale, liopt, liwmin, llwork, &
+                     llwrk2, lopt, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+                 lopt = lwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n + 1
+                 end if
+                 lopt = max( lwmin, 2*n +stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 ) )
+                           
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_qlansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_qlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_qsytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_qsytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_qstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_qormtr to multiply it by the
+           ! householder transformations stored in a.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, work( inde ), info )
+           else
+              call stdlib_qstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_qormtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_qlacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_qscal( n, one / sigma, w, 1 )
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_qsyevd
+
+     !> DSYEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> DSYEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to DSYTRD.  Then, whenever possible, DSYEVR calls DSTEMR to compute
+     !> the eigenspectrum using Relatively Robust Representations.  DSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see DSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : DSYEVR calls DSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> DSYEVR calls DSTEBZ and DSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of DSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_qsyevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indd, inddd, inde, indee, indibl, indifl, &
+           indisp, indiwo, indtau, indwk, indwkn, iscale, j, jj, liwmin, llwork, llwrkn, lwkopt, &
+                     lwmin, nb, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'DSYEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 26*n )
+           liwmin = max( 1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'DORMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              iwork( 1 ) = liwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 7
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_qlansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_qscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_qscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_qsterf or stdlib_qstemr fail.
+           ! work(indtau:indtau+n-1) stores the scalar factors of the
+           ! elementary reflectors used in stdlib_qsytrd.
+           indtau = 1
+           ! work(indd:indd+n-1) stores the tridiagonal's diagonal entries.
+           indd = indtau + n
+           ! work(inde:inde+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_qsytrd.
+           inde = indd + n
+           ! work(inddd:inddd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_qstemr (the stdlib_qsterf path copies the diagonal to w).
+           inddd = inde + n
+           ! work(indee:indee+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_qsterf and stdlib_qstemr.
+           indee = inddd + n
+           ! indwk is the starting offset of the left-over workspace, and
+           ! llwork is the remaining workspace size.
+           indwk = indee + n
+           llwork = lwork - indwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_qstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_qstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_qstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_qsytrd to reduce symmetric matrix to tridiagonal form.
+           call stdlib_qsytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_qsterf or stdlib_qstemr and stdlib_qormtr.
+           if( ( alleig .or. ( indeig .and. il==1 .and. iu==n ) ) .and.ieeeok==1 ) then
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n, work( indd ), 1, w, 1 )
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qcopy( n, work( indd ), 1, work( inddd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_qstemr( jobz, 'A', n, work( inddd ), work( indee ),vl, vu, il, iu, m,&
+                            w, z, ldz, n, isuppz,tryrac, work( indwk ), lwork, iwork, liwork,info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = inde
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_qormtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 ! everything worked.  skip stdlib_qstebz/stdlib_qstein.  iwork(:) are
+                 ! undefined.
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_qstein.
+           ! also call stdlib_qstebz and stdlib_qstein if stdlib_qstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwk ),iwork( indiwo ), info )
+                     
+           if( wantz ) then
+              call stdlib_qstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_qormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+        ! jump here if stdlib_qstemr/stdlib_qstein succeeded.
+        30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.  note: we do not sort the ifail portion of iwork.
+           ! it may not be initialized (if stdlib_qstemr/stdlib_qstein succeeded), and we do
+           ! not return this detailed information to the user.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    w( i ) = w( j )
+                    w( j ) = tmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qsyevr
+
+     !> DSYEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_qsyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+           indwkn, indwrk, iscale, itmp1, j, jj, llwork, llwrkn, lwkmin, lwkopt, nb, &
+                     nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 8*n
+                 nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'DORMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( lwkmin, ( nb + 3 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_qlansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_qscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_qscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_qsytrd to reduce symmetric matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           llwork = lwork - indwrk + 1
+           call stdlib_qsytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_qsterf or stdlib_qorgtr and stdlib_dsteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_qcopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsterf( n, w, work( indee ), info )
+              else
+                 call stdlib_qlacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_qorgtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_qcopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_qsteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_dstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_qstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_qstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_qormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_qswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsyevx
+
+     !> DSYGS2: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T *A*L.
+     !> B must have been previously factorized as U**T *U or L*L**T by DPOTRF.
+
+     pure subroutine stdlib_qsygs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(qp) :: akk, bkk, ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_qscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_qaxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_qsyr2( uplo, n-k, -one, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_qaxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_qtrsv( uplo, 'TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), ldb, &
+                                 a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_qscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_qaxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_qsyr2( uplo, n-k, -one, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1, &
+                                 k+1 ), lda )
+                       call stdlib_qaxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_qtrsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_qtrmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_qaxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_qsyr2( uplo, k-1, one, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                    call stdlib_qaxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_qscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_qtrmv( uplo, 'TRANSPOSE', 'NON-UNIT', k-1, b, ldb,a( k, 1 ), lda )
+                              
+                    ct = half*akk
+                    call stdlib_qaxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_qsyr2( uplo, k-1, one, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_qaxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_qscal( k-1, bkk, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qsygs2
+
+     !> DSYGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by DPOTRF.
+
+     pure subroutine stdlib_qsygst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DSYGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_qsygs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**t)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_qsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_qtrsm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT',kb, n-k-kb+1, &
+                                    one, b( k, k ), ldb,a( k, k+kb ), lda )
+                          call stdlib_qsymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_qsyr2k( uplo, 'TRANSPOSE', n-k-kb+1, kb, -one,a( k, k+kb ), &
+                                    lda, b( k, k+kb ), ldb,one, a( k+kb, k+kb ), lda )
+                          call stdlib_qsymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_qtrsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, one,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**t)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_qsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_qtrsm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',n-k-kb+1, kb, &
+                                    one, b( k, k ), ldb,a( k+kb, k ), lda )
+                          call stdlib_qsymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_qsyr2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-one, a( k+kb, k &
+                                    ), lda, b( k+kb, k ),ldb, one, a( k+kb, k+kb ), lda )
+                          call stdlib_qsymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_qtrsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, one,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**t
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_qtrmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, one, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_qsymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_qsyr2k( uplo, 'NO TRANSPOSE', k-1, kb, one,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_qsymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_qtrmm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',k-1, kb, one, b( &
+                                 k, k ), ldb, a( 1, k ),lda )
+                       call stdlib_qsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**t*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_qtrmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, one, &
+                                 b, ldb, a( k, 1 ), lda )
+                       call stdlib_qsymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_qsyr2k( uplo, 'TRANSPOSE', k-1, kb, one,a( k, 1 ), lda, b( k, &
+                                 1 ), ldb, one, a,lda )
+                       call stdlib_qsymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_qtrmm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT', kb,k-1, one, b( &
+                                 k, k ), ldb, a( k, 1 ), lda )
+                       call stdlib_qsygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qsygst
+
+     !> DSYGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric and B is also
+     !> positive definite.
+
+     subroutine stdlib_qsygv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 3*n - 1 )
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_qpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_qsyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_qtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_qtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsygv
+
+     !> DSYGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_qsygvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 6*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n + 1
+           end if
+           lopt = lwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_qpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_qsyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork,info )
+           lopt = max( real( lopt,KIND=qp), real( work( 1 ),KIND=qp) )
+           liopt = max( real( liopt,KIND=qp), real( iwork( 1 ),KIND=qp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_qtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_qtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_qsygvd
+
+     !> DSYGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_qsygvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 8*n )
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 3 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_qpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_qsygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_qsyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_qtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_qtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsygvx
+
+     !> DSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_qsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+           real(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_qcopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qsymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_qsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_qaxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_qsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qsytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_qsyrfs
+
+     !> DSYSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_qsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_qsytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_qsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_qsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_qsytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsysv
+
+     !> DSYSV computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_qsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_qsytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_qsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_qsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_qsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsysv_aa
+
+     !> DSYSV_RK: computes the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> DSYTRF_RK is called to compute the factorization of a real
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine DSYTRS_3.
+
+     pure subroutine stdlib_qsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_qsytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**t)*(p**t) or
+           ! a = p*u*d*(u**t)*(p**t).
+           call stdlib_qsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_qsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsysv_rk
+
+     !> DSYSV_ROOK: computes the solution to a real system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> DSYTRF_ROOK is called to compute the factorization of a real
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling DSYTRS_ROOK.
+
+     pure subroutine stdlib_qsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_qsytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_qsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_qsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsysv_rook
+
+     !> DSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a real system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_qsysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(inout) :: af(ldaf,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 3*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'DSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_qlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_qsytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_qlansy( 'I', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_qsycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, iwork,info )
+           ! compute the solution vectors x.
+           call stdlib_qlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_qsytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_qsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsysvx
+
+     !> DSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_qsyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(qp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_qswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_qswap( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_qsyswapr
+
+     !> DSYTD2: reduces a real symmetric matrix A to symmetric tridiagonal
+     !> form T by an orthogonal similarity transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_qsytd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(qp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_qlarfg( i, a( i, i+1 ), a( 1, i+1 ), 1, taui )
+                 e( i ) = a( i, i+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = one
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_qsymv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, zero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_qdot( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_qaxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_qsyr2( uplo, i, -one, a( 1, i+1 ), 1, tau, 1, a,lda )
+                    a( i, i+1 ) = e( i )
+                 end if
+                 d( i+1 ) = a( i+1, i+1 )
+                 tau( i ) = taui
+              end do
+              d( 1 ) = a( 1, 1 )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_qlarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,taui )
+                 e( i ) = a( i+1, i )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = one
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_qsymv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_qdot( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_qaxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_qsyr2( uplo, n-i, -one, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 ),&
+                               lda )
+                    a( i+1, i ) = e( i )
+                 end if
+                 d( i ) = a( i, i )
+                 tau( i ) = taui
+              end do
+              d( n ) = a( n, n )
+           end if
+           return
+     end subroutine stdlib_qsytd2
+
+     !> DSYTF2: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qsytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_iqamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iqamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_qswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_qswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / a( k, k )
+                    call stdlib_qsyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_qscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_iqamax( imax-k, a( imax, k ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iqamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_qswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       d11 = one / a( k, k )
+                       call stdlib_qsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_qscal( n-k, d11, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( (a(k) a(k+1))*d(k)**(-1) ) * (a(k) a(k+1))**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_qsytf2
+
+     !> DSYTF2_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_qsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, dtemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iqamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iqamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_qswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_qswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_qswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_qswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_qswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_qswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_qsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_qscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_qsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = zero
+                    a( k-1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iqamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iqamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_qswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_qswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_qswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_qswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_qswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_qsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_qscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_qsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = zero
+                    a( k+1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_qsytf2_rk
+
+     !> DSYTF2_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_qsytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, dtemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iqamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iqamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iqamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_qswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_qswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_qswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_qswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_qsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_qscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_qsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iqamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iqamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iqamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = abs( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_qswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_qswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_qswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_qsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_qscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_qsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_qsytf2_rook
+
+     !> DSYTRD: reduces a real symmetric matrix A to real symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_qsytrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: d(*), e(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'DSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'DSYTRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'DSYTRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_qlatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_qsyr2k( uplo, 'NO TRANSPOSE', i-1, nb, -one, a( 1, i ),lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_qsytd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_qlatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+ib:n,i+ib:n), using
+                 ! an update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_qsyr2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -one,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_qsytd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsytrd
+
+     !> DSYTRD_SB2ST: reduces a real symmetric band matrix A to real symmetric
+     !> tridiagonal form T by a orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_qsytrd_sb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           real(qp), intent(inout) :: ab(ldab,*)
+           real(qp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sidev, sizetau, ldv, lhmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'DSYTRD_SB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'DSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'DSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD_SB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sidev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! real because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               if( upper ) then
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i+1 ) )
+                   end do
+               else
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i ) )
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the symmetric band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=qp)/real(kd,KIND=qp) )
+           stepercol = ceiling( real(shift,KIND=qp)/real(grsiz,KIND=qp) )
+           thgrnb    = ceiling( real(n-1,KIND=qp)/real(thgrsiz,KIND=qp) )
+           call stdlib_qlacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_qlaset( "A", kd,   n, zero, zero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_qsb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_qsb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                             indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           do i = 1, n
+               d( i ) = ( work( dpos+(i-1)*lda ) )
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+i*lda ) )
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+(i-1)*lda ) )
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_qsytrd_sb2st
+
+     !> DSYTRD_SY2SB: reduces a real symmetric matrix A to real symmetric
+     !> band-diagonal form AB by a orthogonal similarity transformation:
+     !> Q**T * A * Q = AB.
+
+     pure subroutine stdlib_qsytrd_sy2sb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: rone = 1.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'DSYTRD_SY2SB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRD_SY2SB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_qcopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_qcopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to zero once such a
+           ! way every time t is generated the upper/lower portion will be always zero
+           call stdlib_qlaset( "A", ldt, kd, zero, zero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_qgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_qcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_qlaset( 'LOWER', pk, pk, zero, one,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_qlarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_qgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,one,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,zero, work( s2pos ), lds2 )
+                  call stdlib_qsymm( 'RIGHT', uplo, pk, pn,one,  a( i+kd, i+kd ), lda,work( s2pos &
+                            ), lds2,zero, work( wpos ), ldw )
+                  call stdlib_qgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,one,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,zero, work( s1pos ), lds1 )
+                  call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-half, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,one,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_qsyr2k( uplo, 'CONJUGATE', pn, pk,-one, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_qcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_qgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_qcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_qlaset( 'UPPER', pk, pk, zero, one,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_qlarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,one, a( i+kd, i ),&
+                             lda,work( tpos ), ldt,zero, work( s2pos ), lds2 )
+                  call stdlib_qsymm( 'LEFT', uplo, pn, pk,one, a( i+kd, i+kd ), lda,work( s2pos ),&
+                             lds2,zero, work( wpos ), ldw )
+                  call stdlib_qgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,one, work( s2pos ), &
+                            lds2,work( wpos ), ldw,zero, work( s1pos ), lds1 )
+                  call stdlib_qgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-half, a( i+kd, i &
+                            ), lda,work( s1pos ), lds1,one, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_qsyr2k( uplo, 'NO TRANSPOSE', pn, pk,-one, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_qcopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_qcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_qsytrd_sy2sb
+
+     !> DSYTRF: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U**T*D*U  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_qlasyf( uplo, k, nb, kb, a, lda, ipiv, work, ldwork,iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_qsytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_qlasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, ldwork, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_qsytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsytrf
+
+     !> DSYTRF_AA: computes the factorization of a real symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           real(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'DSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_qcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_qlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_qlasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_qswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = one
+                    call stdlib_qcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_qscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_qgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_qgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j1-k2, j3 ), 1,one, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_qgemm
+                       call stdlib_qgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-one, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,one, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_qcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_qcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_qlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_qlasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_qswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = one
+                    call stdlib_qcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_qscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_qgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_qgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j3, j1-k2 ), lda,one, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_qgemm
+                       call stdlib_qgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-one, work(&
+                                  j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,one, a( j3, j2 ), lda )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_qcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsytrf_aa
+
+     !> DSYTRF_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_qsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_qlasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_qsytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_qswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_qlasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_qsytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_qswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsytrf_rk
+
+     !> DSYTRF_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_qsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'DSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'DSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_qlasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_qsytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_qlasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_qlasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_qsytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qsytrf_rook
+
+     !> DSYTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> DSYTRF.
+
+     pure subroutine stdlib_qsytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_qdot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_qdot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_qdot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_qcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_qdot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_qswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_qswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_qdot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_qdot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_qdot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_qcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_qdot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_qswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_qswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_qsytri
+
+     !> DSYTRI_ROOK: computes the inverse of a real symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by DSYTRF_ROOK.
+
+     pure subroutine stdlib_qsytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_qdot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_qcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_qdot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_qdot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_qcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_qdot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_qswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_qswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_qswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_qswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_qswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_qswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_qdot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_qcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_qdot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_qdot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_qcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_qsymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_qdot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_qswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_qswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_qswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_qswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_qsytri_rook
+
+     !> DSYTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF.
+
+     pure subroutine stdlib_qsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_qger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_qswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_qger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_qger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,a( 1, k+1 ), 1, one, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_qswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_qger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_qger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_qsytrs
+
+     !> DSYTRS2: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF and converted by DSYCONV.
+
+     pure subroutine stdlib_qsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_qsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_qswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_qtrsm('L','U','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_qscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_qtrsm('L','U','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_qswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_qtrsm('L','L','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_qscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_qtrsm('L','L','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_qsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_qsytrs2
+
+     !> DSYTRS_3: solves a system of linear equations A * X = B with a real
+     !> symmetric matrix A using the factorization computed
+     !> by DSYTRF_RK or DSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_qsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*), e(*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_qtrsm( 'L', 'U', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_qscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - one
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_qtrsm( 'L', 'U', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_qtrsm( 'L', 'L', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_qscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - one
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_qtrsm('L', 'L', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_qsytrs_3
+
+     !> DSYTRS_AA: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by DSYTRF_AA.
+
+     pure subroutine stdlib_qsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_qtrsm('L', 'U', 'T', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b( 2, 1 ), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_qlacpy( 'F', 1, n, a( 1, 1 ), lda+1, work( n ), 1)
+              if( n>1 ) then
+                 call stdlib_qlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                 call stdlib_qlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_qgtsv( n, nrhs, work( 1 ), work( n ), work( 2*n ), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_qtrsm( 'L', 'U', 'N', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b( 2, 1 ), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (u \ (t \ (u**t \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_qtrsm( 'L', 'L', 'N', 'U', n-1, nrhs, one, a( 2, 1 ),lda, b( 2, 1 ), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_qlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                 call stdlib_qlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                 call stdlib_qlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_qgtsv( n, nrhs, work( 1 ), work(n), work( 2*n ), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute (l**t \ b) -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_qtrsm( 'L', 'L', 'T', 'U', n-1, nrhs, one, a( 2, 1 ),lda, b( 2, 1 ), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qsytrs_aa
+
+     !> DSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a real symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by DSYTRF_ROOK.
+
+     pure subroutine stdlib_qsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_qger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_qswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_qger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_qger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ),&
+                               ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, &
+                           one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, one, b( &
+                              k, 1 ), ldb )
+                    call stdlib_qgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k+1 ), 1, one, &
+                              b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_qswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_qscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_qswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_qger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_qger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_qgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_qswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_qswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_qsytrs_rook
+
+     !> DTBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_qtbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=qp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_qlantb( norm, uplo, diag, n, kd, ab, ldab, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_qlatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_qlatbs( uplo, 'TRANSPOSE', diag, normin, n, kd, ab,ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iqamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_qrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_qtbcon
+
+     !> DTBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTBTRS or some other
+     !> means before entering this routine.  DTBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_qtbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_qcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qtbmv( uplo, trans, diag, n, kd, ab, ldab, work( n+1 ),1 )
+              call stdlib_qaxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qtbsv( uplo, transt, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qtbsv( uplo, trans, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_qtbrfs
+
+     !> DTBTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_qtbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ab(ldab,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==zero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==zero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_qtbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_qtbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> DTFSM:  solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_qtfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           real(qp), intent(in) :: alpha
+           ! Array Arguments 
+           real(qp), intent(in) :: a(0:*)
+           real(qp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_qp)
+           if( alpha==zero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = zero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_qtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_qtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_qgemm( 'N', 'N', m2, n, m1, -one, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_qtrsm( 'L', 'U', 'T', diag, m2, n, one,a( m ), m, b( m1, &
+                                       0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_qtrsm( 'L', 'L', 'T', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_qtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_qgemm( 'T', 'N', m1, n, m2, -one, a( m1 ),m, b( m1, 0 ), &
+                                       ldb, alpha, b, ldb )
+                             call stdlib_qtrsm( 'L', 'L', 'T', diag, m1, n, one,a( 0 ), m, b, ldb &
+                                       )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_qgemm( 'T', 'N', m2, n, m1, -one, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, m2, n, one,a( m1 ), m, b( m1, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m1, n, m2, -one, a( 0 ), m,b( m1, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, m1, n, one,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_qtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_qtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_qgemm( 'T', 'N', m2, n, m1, -one,a( m1*m1 ), m1, b, ldb, &
+                                       alpha,b( m1, 0 ), ldb )
+                             call stdlib_qtrsm( 'L', 'L', 'N', diag, m2, n, one,a( 1 ), m1, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_qtrsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_qtrsm( 'L', 'L', 'T', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_qgemm( 'N', 'N', m1, n, m2, -one,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_qtrsm( 'L', 'U', 'N', diag, m1, n, one,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_qgemm( 'N', 'N', m2, n, m1, -one, a( 0 ), m2,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, m2, n, one,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_qgemm( 'T', 'N', m1, n, m2, -one, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, m1, n, one,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_qgemm( 'N', 'N', k, n, k, -one, a( k+1 ),m+1, b, ldb, alpha,&
+                                     b( k, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, k, n, one,a( 0 ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'T', 'N', k, n, k, -one, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, k, n, one,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_qgemm( 'T', 'N', k, n, k, -one, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, k, n, one,a( k ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', k, n, k, -one, a( 0 ), m+1,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, k, n, one,a( k+1 ), m+1, b, ldb &
+                                    )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_qgemm( 'T', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, k, n, one,a( 0 ), k, b( k, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_qgemm( 'N', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b( k, 0 ),&
+                                     ldb,alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, k, n, one,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'L', 'U', 'T', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_qgemm( 'N', 'N', k, n, k, -one, a( 0 ), k, b,ldb, alpha, b( &
+                                    k, 0 ), ldb )
+                          call stdlib_qtrsm( 'L', 'L', 'N', diag, k, n, one,a( k*k ), k, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'L', 'L', 'T', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'T', 'N', k, n, k, -one, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_qtrsm( 'L', 'U', 'N', diag, k, n, one,a( k*( k+1 ) ), k, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, n1, one,a( 0 ), n, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, n2, one,a( n ), n, b( 0, n1 )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n, &
+                                    alpha, b( 0, n1 ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, n2, one,a( n1 ), n, b( 0, n1 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, 0 ), ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, n1, one,a( n2 ), n, b( 0, 0 )&
+                                    , ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, n1, one,a( 0 ), n1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1*n1 ),&
+                                     n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, n2, one,a( 1 ), n1, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n2,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, n2, one,a( n1*n2 ), n2, b( 0,&
+                                     n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, n1, one,a( n2*n2 ), n2, b( 0,&
+                                     0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, k, one,a( 1 ), n+1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, k, one,a( 0 ), n+1, b( 0, k )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, k ),ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, k, one,a( k ), n+1, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, 0 ),ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, k, one,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_qgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, k, one,a( k ), k, b( 0, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_qgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, k, one,a( 0 ), k, b( 0, k ), &
+                                    ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_qtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_qgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_qtrsm( 'R', 'L', 'T', diag, m, k, one,a( k*k ), k, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_qtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_qgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_qtrsm( 'R', 'U', 'T', diag, m, k, one,a( ( k+1 )*k ), k, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtfsm
+
+     !> DTFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_qtftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_qtrtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'L', 'N', diag, n2, n1, -one, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_qtrtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'U', 'T', diag, n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_qtrtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'L', 'T', diag, n1, n2, -one, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_qtrtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'U', 'N', diag, n1, n2, one, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_qtrtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'U', 'N', diag, n1, n2, -one, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_qtrtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'L', 'T', diag, n1, n2, one, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_qtrtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'U', 'T', diag, n2, n1, -one,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_qtrtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'L', 'N', diag, n2, n1, one,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_qtrtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'L', 'N', diag, k, k, -one, a( 1 ),n+1, a( k+1 ), n+1 &
+                              )
+                    call stdlib_qtrtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'U', 'T', diag, k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_qtrtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'L', 'T', diag, k, k, -one, a( k+1 ),n+1, a( 0 ), n+1 &
+                              )
+                    call stdlib_qtrtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'U', 'N', diag, k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_qtrtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'U', 'N', diag, k, k, -one, a( k ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_qtrtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'L', 'T', diag, k, k, one, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_qtrtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'R', 'U', 'T', diag, k, k, -one,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_qtrtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_qtrmm( 'L', 'L', 'N', diag, k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtftri
+
+     !> DTFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_qtfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: ap(0:*)
+           real(qp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtfttp
+
+     !> DTFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_qtfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(qp), intent(out) :: a(0:lda-1,0:*)
+           real(qp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 a( 0, 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtfttr
+
+     !> DTGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of real matrices (S,P), where S is a quasi-triangular matrix
+     !> and P is upper triangular.  Matrix pairs of this type are produced by
+     !> the generalized Schur factorization of a matrix pair (A,B):
+     !> A = Q*S*Z**T,  B = Q*P*Z**T
+     !> as computed by DGGHRD + DHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal blocks of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the orthogonal factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_qtgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(qp), intent(in) :: p(ldp,*), s(lds,*)
+           real(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: safety = 1.0e+2_qp
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, il2by2, ilabad, ilall, ilback, ilbbad, ilcomp, ilcplx, &
+                     lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, iinfo, im, iside, j, ja, jc, je, jr, jw, &
+                     na, nw
+           real(qp) :: acoef, acoefa, anorm, ascale, bcoefa, bcoefi, bcoefr, big, bignum, bnorm, &
+           bscale, cim2a, cim2b, cimaga, cimagb, cre2a, cre2b, creala, crealb, dmin, safmin, &
+                     salfar, sbeta, scale, small, temp, temp2, temp2i, temp2r, ulp, xmax, xscale
+           ! Local Arrays 
+           real(qp) :: bdiag(2), sum(2,2), sums(2,2), sump(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+              ilall = .true.
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors to be computed
+           if( .not.ilall ) then
+              im = 0
+              ilcplx = .false.
+              loop_10: do j = 1, n
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_10
+                 end if
+                 if( j<n ) then
+                    if( s( j+1, j )/=zero )ilcplx = .true.
+                 end if
+                 if( ilcplx ) then
+                    if( select( j ) .or. select( j+1 ) )im = im + 2
+                 else
+                    if( select( j ) )im = im + 1
+                 end if
+              end do loop_10
+           else
+              im = n
+           end if
+           ! check 2-by-2 diagonal blocks of a, b
+           ilabad = .false.
+           ilbbad = .false.
+           do j = 1, n - 1
+              if( s( j+1, j )/=zero ) then
+                 if( p( j, j )==zero .or. p( j+1, j+1 )==zero .or.p( j, j+1 )/=zero )ilbbad = &
+                           .true.
+                 if( j<n-1 ) then
+                    if( s( j+2, j+1 )/=zero )ilabad = .true.
+                 end if
+              end if
+           end do
+           if( ilabad ) then
+              info = -5
+           else if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_qlabad( safmin, big )
+           ulp = stdlib_qlamch( 'EPSILON' )*stdlib_qlamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part (i.e., excluding all elements belonging to the diagonal
+           ! blocks) of a and b to check for possible overflow in the
+           ! triangular solver.
+           anorm = abs( s( 1, 1 ) )
+           if( n>1 )anorm = anorm + abs( s( 2, 1 ) )
+           bnorm = abs( p( 1, 1 ) )
+           work( 1 ) = zero
+           work( n+1 ) = zero
+           do j = 2, n
+              temp = zero
+              temp2 = zero
+              if( s( j, j-1 )==zero ) then
+                 iend = j - 1
+              else
+                 iend = j - 2
+              end if
+              do i = 1, iend
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              work( j ) = temp
+              work( n+j ) = temp2
+              do i = iend + 1, min( j+1, n )
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              anorm = max( anorm, temp )
+              bnorm = max( bnorm, temp2 )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_220: do je = 1, n
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at.
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_220
+                 end if
+                 nw = 1
+                 if( je<n ) then
+                    if( s( je+1, je )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je+1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_220
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       ieig = ieig + 1
+                       do jr = 1, n
+                          vl( jr, ieig ) = zero
+                       end do
+                       vl( ieig, ieig ) = one
+                       cycle loop_220
+                    end if
+                 end if
+                 ! clear vector
+                 do jr = 1, nw*n
+                    work( 2*n+jr ) = zero
+                 end do
+                                                       ! t
+                 ! compute coefficients in  ( a a - b b )  y = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                 else
+                    ! complex eigenvalue
+                    call stdlib_qlag2( s( je, je ), lds, p( je, je ), ldp,safmin*safety, acoef, &
+                              temp, bcoefr, temp2,bcoefi )
+                    bcoefi = -bcoefi
+                    if( bcoefi==zero ) then
+                       info = je
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    temp = acoef*s( je+1, je )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je+1 ) = -temp2r / temp
+                       work( 3*n+je+1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je+1 ) = one
+                       work( 3*n+je+1 ) = zero
+                       temp = acoef*s( je, je+1 )
+                       work( 2*n+je ) = ( bcoefr*p( je+1, je+1 )-acoef*s( je+1, je+1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je+1, je+1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je+1 ) &
+                              )+abs( work( 3*n+je+1 ) ) )
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                 ! t
+                 ! triangular solve of  (a a - b b)  y = 0
+                                         ! t
+                 ! (rowwise in  (a a - b b) , or columnwise in (a a - b b) )
+                 il2by2 = .false.
+                 loop_160: do j = je + nw, n
+                    if( il2by2 ) then
+                       il2by2 = .false.
+                       cycle loop_160
+                    end if
+                    na = 1
+                    bdiag( 1 ) = p( j, j )
+                    if( j<n ) then
+                       if( s( j+1, j )/=zero ) then
+                          il2by2 = .true.
+                          bdiag( 2 ) = p( j+1, j+1 )
+                          na = 2
+                       end if
+                    end if
+                    ! check whether scaling is necessary for dot products
+                    xscale = one / max( one, xmax )
+                    temp = max( work( j ), work( n+j ),acoefa*work( j )+bcoefa*work( n+j ) )
+                              
+                    if( il2by2 )temp = max( temp, work( j+1 ), work( n+j+1 ),acoefa*work( j+1 )+&
+                              bcoefa*work( n+j+1 ) )
+                    if( temp>bignum*xscale ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = xmax*xscale
+                    end if
+                    ! compute dot products
+                          ! j-1
+                    ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                          ! k=je
+                    ! to reduce the op count, this is done as
+                    ! _        j-1                  _        j-1
+                    ! a*conjg( sum  s(k,j)*x(k) ) - b*conjg( sum  p(k,j)*x(k) )
+                             ! k=je                          k=je
+                    ! which may cause underflow problems if a or b are close
+                    ! to underflow.  (e.g., less than small.)
+                    do jw = 1, nw
+                       do ja = 1, na
+                          sums( ja, jw ) = zero
+                          sump( ja, jw ) = zero
+                          do jr = je, j - 1
+                             sums( ja, jw ) = sums( ja, jw ) +s( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                             sump( ja, jw ) = sump( ja, jw ) +p( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                          end do
+                       end do
+                    end do
+                    do ja = 1, na
+                       if( ilcplx ) then
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 ) -bcoefi*sump( &
+                                    ja, 2 )
+                          sum( ja, 2 ) = -acoef*sums( ja, 2 ) +bcoefr*sump( ja, 2 ) +bcoefi*sump( &
+                                    ja, 1 )
+                       else
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 )
+                       end if
+                    end do
+                                        ! t
+                    ! solve  ( a a - b b )  y = sum(,)
+                    ! with scaling and perturbation of the denominator
+                    call stdlib_qlaln2( .true., na, nw, dmin, acoef, s( j, j ), lds,bdiag( 1 ), &
+                    bdiag( 2 ), sum, 2, bcoefr,bcoefi, work( 2*n+j ), n, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = scale*xmax
+                    end if
+                    xmax = max( xmax, temp )
+                 end do loop_160
+                 ! copy eigenvector to vl, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig + 1
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       call stdlib_qgemv( 'N', n, n+1-je, one, vl( 1, je ), ldvl,work( ( jw+2 )*n+&
+                                 je ), 1, zero,work( ( jw+4 )*n+1 ), 1 )
+                    end do
+                    call stdlib_qlacpy( ' ', n, nw, work( 4*n+1 ), n, vl( 1, je ),ldvl )
+                    ibeg = 1
+                 else
+                    call stdlib_qlacpy( ' ', n, nw, work( 2*n+1 ), n, vl( 1, ieig ),ldvl )
+                    ibeg = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) )+abs( vl( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = ibeg, n
+                          vl( jr, ieig+jw ) = xscale*vl( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+                 ieig = ieig + nw - 1
+              end do loop_220
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_500: do je = n, 1, -1
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at -- if complex, select(je)
+                 ! or select(je-1).
+                 ! if this is a complex pair, the 2-by-2 diagonal block
+                 ! corresponding to the eigenvalue is in rows/columns je-1:je
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_500
+                 end if
+                 nw = 1
+                 if( je>1 ) then
+                    if( s( je, je-1 )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je-1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_500
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- unit eigenvector
+                       ieig = ieig - 1
+                       do jr = 1, n
+                          vr( jr, ieig ) = zero
+                       end do
+                       vr( ieig, ieig ) = one
+                       cycle loop_500
+                    end if
+                 end if
+                 ! clear vector
+                 do jw = 0, nw - 1
+                    do jr = 1, n
+                       work( ( jw+2 )*n+jr ) = zero
+                    end do
+                 end do
+                 ! compute coefficients in  ( a a - b b ) x = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                    ! compute contribution from column je of a and b to sum
+                    ! (see "further details", above.)
+                    do jr = 1, je - 1
+                       work( 2*n+jr ) = bcoefr*p( jr, je ) -acoef*s( jr, je )
+                    end do
+                 else
+                    ! complex eigenvalue
+                    call stdlib_qlag2( s( je-1, je-1 ), lds, p( je-1, je-1 ), ldp,safmin*safety, &
+                              acoef, temp, bcoefr, temp2,bcoefi )
+                    if( bcoefi==zero ) then
+                       info = je - 1
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    ! and contribution to sums
+                    temp = acoef*s( je, je-1 )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>=abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je-1 ) = -temp2r / temp
+                       work( 3*n+je-1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je-1 ) = one
+                       work( 3*n+je-1 ) = zero
+                       temp = acoef*s( je-1, je )
+                       work( 2*n+je ) = ( bcoefr*p( je-1, je-1 )-acoef*s( je-1, je-1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je-1, je-1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je-1 ) &
+                              )+abs( work( 3*n+je-1 ) ) )
+                    ! compute contribution from columns je and je-1
+                    ! of a and b to the sums.
+                    creala = acoef*work( 2*n+je-1 )
+                    cimaga = acoef*work( 3*n+je-1 )
+                    crealb = bcoefr*work( 2*n+je-1 ) -bcoefi*work( 3*n+je-1 )
+                    cimagb = bcoefi*work( 2*n+je-1 ) +bcoefr*work( 3*n+je-1 )
+                    cre2a = acoef*work( 2*n+je )
+                    cim2a = acoef*work( 3*n+je )
+                    cre2b = bcoefr*work( 2*n+je ) - bcoefi*work( 3*n+je )
+                    cim2b = bcoefi*work( 2*n+je ) + bcoefr*work( 3*n+je )
+                    do jr = 1, je - 2
+                       work( 2*n+jr ) = -creala*s( jr, je-1 ) +crealb*p( jr, je-1 ) -cre2a*s( jr, &
+                                 je ) + cre2b*p( jr, je )
+                       work( 3*n+jr ) = -cimaga*s( jr, je-1 ) +cimagb*p( jr, je-1 ) -cim2a*s( jr, &
+                                 je ) + cim2b*p( jr, je )
+                    end do
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                 ! columnwise triangular solve of  (a a - b b)  x = 0
+                 il2by2 = .false.
+                 loop_370: do j = je - nw, 1, -1
+                    ! if a 2-by-2 block, is in position j-1:j, wait until
+                    ! next iteration to process it (when it will be j:j+1)
+                    if( .not.il2by2 .and. j>1 ) then
+                       if( s( j, j-1 )/=zero ) then
+                          il2by2 = .true.
+                          cycle loop_370
+                       end if
+                    end if
+                    bdiag( 1 ) = p( j, j )
+                    if( il2by2 ) then
+                       na = 2
+                       bdiag( 2 ) = p( j+1, j+1 )
+                    else
+                       na = 1
+                    end if
+                    ! compute x(j) (and x(j+1), if 2-by-2 block)
+                    call stdlib_qlaln2( .false., na, nw, dmin, acoef, s( j, j ),lds, bdiag( 1 ), &
+                    bdiag( 2 ), work( 2*n+j ),n, bcoefr, bcoefi, sum, 2, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = 1, je
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                    end if
+                    xmax = max( scale*xmax, temp )
+                    do jw = 1, nw
+                       do ja = 1, na
+                          work( ( jw+1 )*n+j+ja-1 ) = sum( ja, jw )
+                       end do
+                    end do
+                    ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                    if( j>1 ) then
+                       ! check whether scaling is necessary for sum.
+                       xscale = one / max( one, xmax )
+                       temp = acoefa*work( j ) + bcoefa*work( n+j )
+                       if( il2by2 )temp = max( temp, acoefa*work( j+1 )+bcoefa*work( n+j+1 ) )
+                                 
+                       temp = max( temp, acoefa, bcoefa )
+                       if( temp>bignum*xscale ) then
+                          do jw = 0, nw - 1
+                             do jr = 1, je
+                                work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                             end do
+                          end do
+                          xmax = xmax*xscale
+                       end if
+                       ! compute the contributions of the off-diagonals of
+                       ! column j (and j+1, if 2-by-2 block) of a and b to the
+                       ! sums.
+                       do ja = 1, na
+                          if( ilcplx ) then
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             cimaga = acoef*work( 3*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 ) -bcoefi*work( 3*n+j+ja-1 )
+                             cimagb = bcoefi*work( 2*n+j+ja-1 ) +bcoefr*work( 3*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                                work( 3*n+jr ) = work( 3*n+jr ) -cimaga*s( jr, j+ja-1 ) +cimagb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          else
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          end if
+                       end do
+                    end if
+                    il2by2 = .false.
+                 end do loop_370
+                 ! copy eigenvector to vr, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig - nw
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          work( ( jw+4 )*n+jr ) = work( ( jw+2 )*n+1 )*vr( jr, 1 )
+                       end do
+                       ! a series of compiler directives to defeat
+                       ! vectorization for the next loop
+                       do jc = 2, je
+                          do jr = 1, n
+                             work( ( jw+4 )*n+jr ) = work( ( jw+4 )*n+jr ) +work( ( jw+2 )*n+jc )&
+                                       *vr( jr, jc )
+                          end do
+                       end do
+                    end do
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+4 )*n+jr )
+                       end do
+                    end do
+                    iend = n
+                 else
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+2 )*n+jr )
+                       end do
+                    end do
+                    iend = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) )+abs( vr( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = 1, iend
+                          vr( jr, ieig+jw ) = xscale*vr( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+              end do loop_500
+           end if
+           return
+     end subroutine stdlib_qtgevc
+
+     !> DTGEX2: swaps adjacent diagonal blocks (A11, B11) and (A22, B22)
+     !> of size 1-by-1 or 2-by-2 in an upper (quasi) triangular matrix pair
+     !> (A, B) by an orthogonal equivalence transformation.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by DGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, n1, n2, &
+               work, lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, lwork, n, n1, n2
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_qcopy by calls to stdlib_qlaset, or by do
+        ! loops. sven hammarling, 1/5/02.
+           ! Parameters 
+           real(qp), parameter :: twenty = 2.0e+01_qp
+           integer(ilp), parameter :: ldst = 4
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, idum, linfo, m
+           real(qp) :: bqra21, brqa21, ddum, dnorma, dnormb, dscale, dsum, eps, f, g, sa, sb, &
+                     scale, smlnum, thresha, threshb
+           ! Local Arrays 
+           integer(ilp) :: iwork(ldst)
+           real(qp) :: ai(2), ar(2), be(2), ir(ldst,ldst), ircop(ldst,ldst), li(ldst,ldst), licop(&
+           ldst,ldst), s(ldst,ldst), scpy(ldst,ldst), t(ldst,ldst), taul(ldst), taur(ldst), tcpy(&
+                     ldst,ldst)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 .or. n1<=0 .or. n2<=0 )return
+           if( n1>n .or. ( j1+n1 )>n )return
+           m = n1 + n2
+           if( lwork<max( 1, n*m, m*m*2 ) ) then
+              info = -16
+              work( 1 ) = max( 1, n*m, m*m*2 )
+              return
+           end if
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block
+           call stdlib_qlaset( 'FULL', ldst, ldst, zero, zero, li, ldst )
+           call stdlib_qlaset( 'FULL', ldst, ldst, zero, zero, ir, ldst )
+           call stdlib_qlacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_qlacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute threshold for testing acceptance of swapping.
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           dscale = zero
+           dsum = one
+           call stdlib_qlacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_qlassq( m*m, work, 1, dscale, dsum )
+           dnorma = dscale*sqrt( dsum )
+           dscale = zero
+           dsum = one
+           call stdlib_qlacpy( 'FULL', m, m, t, ldst, work, m )
+           call stdlib_qlassq( m*m, work, 1, dscale, dsum )
+           dnormb = dscale*sqrt( dsum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*dnorma, smlnum )
+           threshb = max( twenty*eps*dnormb, smlnum )
+           if( m==2 ) then
+              ! case 1: swap 1-by-1 and 1-by-1 blocks.
+              ! compute orthogonal ql and rq that swap 1-by-1 and 1-by-1 blocks
+              ! using givens rotations and perform the swap tentatively.
+              f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+              g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+              sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+              sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+              call stdlib_qlartg( f, g, ir( 1, 2 ), ir( 1, 1 ), ddum )
+              ir( 2, 1 ) = -ir( 1, 2 )
+              ir( 2, 2 ) = ir( 1, 1 )
+              call stdlib_qrot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              call stdlib_qrot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              if( sa>=sb ) then
+                 call stdlib_qlartg( s( 1, 1 ), s( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              else
+                 call stdlib_qlartg( t( 1, 1 ), t( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              end if
+              call stdlib_qrot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              call stdlib_qrot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              li( 2, 2 ) = li( 1, 1 )
+              li( 1, 2 ) = -li( 2, 1 )
+              ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                 ! and  |t21| <= o(eps f-norm((b)))
+              weak = abs( s( 2, 1 ) ) <= thresha .and.abs( t( 2, 1 ) ) <= threshb
+              if( .not.weak )go to 70
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_qlacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_qgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_qgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_qlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_qlacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_qgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_qgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_qlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                     ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              call stdlib_qrot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_qrot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_qrot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              call stdlib_qrot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              ! set  n1-by-n2 (2,1) - blocks to zero.
+              a( j1+1, j1 ) = zero
+              b( j1+1, j1 ) = zero
+              ! accumulate transformations into q and z if requested.
+              if( wantz )call stdlib_qrot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 &
+                        ) )
+              if( wantq )call stdlib_qrot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, li( 1, 1 ),li( 2, 1 &
+                        ) )
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           else
+              ! case 2: swap 1-by-1 and 2-by-2 blocks, or 2-by-2
+                      ! and 2-by-2 blocks.
+              ! solve the generalized sylvester equation
+                       ! s11 * r - l * s22 = scale * s12
+                       ! t11 * r - l * t22 = scale * t12
+              ! for r and l. solutions in li and ir.
+              call stdlib_qlacpy( 'FULL', n1, n2, t( 1, n1+1 ), ldst, li, ldst )
+              call stdlib_qlacpy( 'FULL', n1, n2, s( 1, n1+1 ), ldst,ir( n2+1, n1+1 ), ldst )
+                        
+              call stdlib_qtgsy2( 'N', 0, n1, n2, s, ldst, s( n1+1, n1+1 ), ldst,ir( n2+1, n1+1 ),&
+               ldst, t, ldst, t( n1+1, n1+1 ),ldst, li, ldst, scale, dsum, dscale, iwork, idum,&
+                         linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix ql:
+                          ! ql**t * li = [ tl ]
+                                       ! [ 0  ]
+              ! where
+                          ! li =  [      -l              ]
+                                ! [ scale * identity(n2) ]
+              do i = 1, n2
+                 call stdlib_qscal( n1, -one, li( 1, i ), 1 )
+                 li( n1+i, i ) = scale
+              end do
+              call stdlib_qgeqr2( m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_qorg2r( m, m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix rq:
+                          ! ir * rq**t =   [ 0  tr],
+               ! where ir = [ scale * identity(n1), r ]
+              do i = 1, n1
+                 ir( n2+i, i ) = scale
+              end do
+              call stdlib_qgerq2( n1, m, ir( n2+1, 1 ), ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_qorgr2( m, m, n1, ir, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              ! perform the swapping tentatively:
+              call stdlib_qgemm( 'T', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+              call stdlib_qgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, s,ldst )
+              call stdlib_qgemm( 'T', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+              call stdlib_qgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, t,ldst )
+              call stdlib_qlacpy( 'F', m, m, s, ldst, scpy, ldst )
+              call stdlib_qlacpy( 'F', m, m, t, ldst, tcpy, ldst )
+              call stdlib_qlacpy( 'F', m, m, ir, ldst, ircop, ldst )
+              call stdlib_qlacpy( 'F', m, m, li, ldst, licop, ldst )
+              ! triangularize the b-part by an rq factorization.
+              ! apply transformation (from left) to a-part, giving s.
+              call stdlib_qgerq2( m, m, t, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_qormr2( 'R', 'T', m, m, m, t, ldst, taur, s, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_qormr2( 'L', 'N', m, m, m, t, ldst, taur, ir, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in brqa21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_qlassq( n1, s( n2+1, i ), 1, dscale, dsum )
+              end do
+              brqa21 = dscale*sqrt( dsum )
+              ! triangularize the b-part by a qr factorization.
+              ! apply transformation (from right) to a-part, giving s.
+              call stdlib_qgeqr2( m, m, tcpy, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_qorm2r( 'L', 'T', m, m, m, tcpy, ldst, taul, scpy, ldst,work, info )
+                        
+              call stdlib_qorm2r( 'R', 'N', m, m, m, tcpy, ldst, taul, licop, ldst,work, info )
+                        
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in bqra21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_qlassq( n1, scpy( n2+1, i ), 1, dscale, dsum )
+              end do
+              bqra21 = dscale*sqrt( dsum )
+              ! decide which method to use.
+                ! weak stability test:
+                   ! f-norm(s21) <= o(eps * f-norm((s)))
+              if( bqra21<=brqa21 .and. bqra21<=thresha ) then
+                 call stdlib_qlacpy( 'F', m, m, scpy, ldst, s, ldst )
+                 call stdlib_qlacpy( 'F', m, m, tcpy, ldst, t, ldst )
+                 call stdlib_qlacpy( 'F', m, m, ircop, ldst, ir, ldst )
+                 call stdlib_qlacpy( 'F', m, m, licop, ldst, li, ldst )
+              else if( brqa21>=thresha ) then
+                 go to 70
+              end if
+              ! set lower triangle of b-part to zero
+              call stdlib_qlaset( 'LOWER', m-1, m-1, zero, zero, t(2,1), ldst )
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_qlacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_qgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_qgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_qlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_qlacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_qgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_qgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_qlassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! if the swap is accepted ("weakly" and "strongly"), apply the
+              ! transformations and set n1-by-n2 (2,1)-block to zero.
+              call stdlib_qlaset( 'FULL', n1, n2, zero, zero, s(n2+1,1), ldst )
+              ! copy back m-by-m diagonal block starting at index j1 of (a, b)
+              call stdlib_qlacpy( 'F', m, m, s, ldst, a( j1, j1 ), lda )
+              call stdlib_qlacpy( 'F', m, m, t, ldst, b( j1, j1 ), ldb )
+              call stdlib_qlaset( 'FULL', ldst, ldst, zero, zero, t, ldst )
+              ! standardize existing 2-by-2 blocks.
+              call stdlib_qlaset( 'FULL', m, m, zero, zero, work, m )
+              work( 1 ) = one
+              t( 1, 1 ) = one
+              idum = lwork - m*m - 2
+              if( n2>1 ) then
+                 call stdlib_qlagv2( a( j1, j1 ), lda, b( j1, j1 ), ldb, ar, ai, be,work( 1 ), &
+                           work( 2 ), t( 1, 1 ), t( 2, 1 ) )
+                 work( m+1 ) = -work( 2 )
+                 work( m+2 ) = work( 1 )
+                 t( n2, n2 ) = t( 1, 1 )
+                 t( 1, 2 ) = -t( 2, 1 )
+              end if
+              work( m*m ) = one
+              t( m, m ) = one
+              if( n1>1 ) then
+                 call stdlib_qlagv2( a( j1+n2, j1+n2 ), lda, b( j1+n2, j1+n2 ), ldb,taur, taul, &
+                 work( m*m+1 ), work( n2*m+n2+1 ),work( n2*m+n2+2 ), t( n2+1, n2+1 ),t( m, m-1 ) )
+                           
+                 work( m*m ) = work( n2*m+n2+1 )
+                 work( m*m-1 ) = -work( n2*m+n2+2 )
+                 t( m, m ) = t( n2+1, n2+1 )
+                 t( m-1, m ) = -t( m, m-1 )
+              end if
+              call stdlib_qgemm( 'T', 'N', n2, n1, n2, one, work, m, a( j1, j1+n2 ),lda, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_qlacpy( 'FULL', n2, n1, work( m*m+1 ), n2, a( j1, j1+n2 ),lda )
+              call stdlib_qgemm( 'T', 'N', n2, n1, n2, one, work, m, b( j1, j1+n2 ),ldb, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_qlacpy( 'FULL', n2, n1, work( m*m+1 ), n2, b( j1, j1+n2 ),ldb )
+              call stdlib_qgemm( 'N', 'N', m, m, m, one, li, ldst, work, m, zero,work( m*m+1 ), m &
+                        )
+              call stdlib_qlacpy( 'FULL', m, m, work( m*m+1 ), m, li, ldst )
+              call stdlib_qgemm( 'N', 'N', n2, n1, n1, one, a( j1, j1+n2 ), lda,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_qlacpy( 'FULL', n2, n1, work, n2, a( j1, j1+n2 ), lda )
+              call stdlib_qgemm( 'N', 'N', n2, n1, n1, one, b( j1, j1+n2 ), ldb,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_qlacpy( 'FULL', n2, n1, work, n2, b( j1, j1+n2 ), ldb )
+              call stdlib_qgemm( 'T', 'N', m, m, m, one, ir, ldst, t, ldst, zero,work, m )
+              call stdlib_qlacpy( 'FULL', m, m, work, m, ir, ldst )
+              ! accumulate transformations into q and z if requested.
+              if( wantq ) then
+                 call stdlib_qgemm( 'N', 'N', n, m, m, one, q( 1, j1 ), ldq, li,ldst, zero, work, &
+                           n )
+                 call stdlib_qlacpy( 'FULL', n, m, work, n, q( 1, j1 ), ldq )
+              end if
+              if( wantz ) then
+                 call stdlib_qgemm( 'N', 'N', n, m, m, one, z( 1, j1 ), ldz, ir,ldst, zero, work, &
+                           n )
+                 call stdlib_qlacpy( 'FULL', n, m, work, n, z( 1, j1 ), ldz )
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                      ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              i = j1 + m
+              if( i<=n ) then
+                 call stdlib_qgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,a( j1, i ), lda, zero, &
+                           work, m )
+                 call stdlib_qlacpy( 'FULL', m, n-i+1, work, m, a( j1, i ), lda )
+                 call stdlib_qgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,b( j1, i ), ldb, zero, &
+                           work, m )
+                 call stdlib_qlacpy( 'FULL', m, n-i+1, work, m, b( j1, i ), ldb )
+              end if
+              i = j1 - 1
+              if( i>0 ) then
+                 call stdlib_qgemm( 'N', 'N', i, m, m, one, a( 1, j1 ), lda, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_qlacpy( 'FULL', i, m, work, i, a( 1, j1 ), lda )
+                 call stdlib_qgemm( 'N', 'N', i, m, m, one, b( 1, j1 ), ldb, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_qlacpy( 'FULL', i, m, work, i, b( 1, j1 ), ldb )
+              end if
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           end if
+           ! exit with info = 1 if swap was rejected.
+           70 continue
+           info = 1
+           return
+     end subroutine stdlib_qtgex2
+
+     !> DTGEXC: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A,B) using an orthogonal equivalence transformation
+     !> (A, B) = Q * (A, B) * Z**T,
+     !> so that the diagonal block of (A, B) with row index IFST is moved
+     !> to row ILST.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by DGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_qtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldq, ldz, lwork, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: here, lwmin, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+              else
+                 lwmin = 4*n + 16
+              end if
+              work(1) = lwmin
+              if (lwork<lwmin .and. .not.lquery) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGEXC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of the specified block and find out
+           ! if it is 1-by-1 or 2-by-2.
+           if( ifst>1 ) then
+              if( a( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( a( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out if it is 1-by-1 or 2-by-2.
+           if( ilst>1 ) then
+              if( a( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( a( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst.
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( a( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, nbf, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( a( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here+1, 1, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, 1, &
+                              1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, nbnext, work, lwork,info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, nbf, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, 1, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, &
+                              nbnext, 1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here - 1
+                 else
+                   ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here-1,&
+                                  2, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                       call stdlib_qtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_qtgexc
+
+     !> DTGSEN: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A, B) (in terms of an orthonormal equivalence trans-
+     !> formation Q**T * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the upper quasi-triangular
+     !> matrix A and the upper triangular B. The leading columns of Q and
+     !> Z form orthonormal bases of the corresponding left and right eigen-
+     !> spaces (deflating subspaces). (A, B) must be in generalized real
+     !> Schur canonical form (as returned by DGGES), i.e. A is block upper
+     !> triangular with 1-by-1 and 2-by-2 diagonal blocks. B is upper
+     !> triangular.
+     !> DTGSEN also computes the generalized eigenvalues
+     !> w(j) = (ALPHAR(j) + i*ALPHAI(j))/BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, DTGSEN computes the estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_qtgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alphar, alphai, &
+               beta, q, ldq, z, ldz, m, pl,pr, dif, work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(qp), intent(out) :: alphai(*), alphar(*), beta(*), dif(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, kk, ks, liwmin, lwmin, mn2, n1, n2
+           real(qp) :: dscale, dsum, eps, rdscal, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSEN', -info )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           pair = .false.
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )==zero ) then
+                       if( select( k ) )m = m + 1
+                    else
+                       pair = .true.
+                       if( select( k ) .or. select( k+1 ) )m = m + 2
+                    end if
+                 else
+                    if( select( n ) )m = m + 1
+                 end if
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 4*n+16, 2*m*( n-m ) )
+              liwmin = max( 1, n+6 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*n+16, 4*m*( n-m ) )
+              liwmin = max( 1, 2*m*( n-m ), n+6 )
+           else
+              lwmin = max( 1, 4*n+16 )
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -22
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_qlassq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_qlassq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 60
+           end if
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           pair = .false.
+           loop_30: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ! perform the reordering of diagonal blocks in (a, b)
+                    ! by orthogonal transformation matrices and update
+                    ! q and z accordingly (if requested):
+                    kk = k
+                    if( k/=ks )call stdlib_qtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz,&
+                               kk, ks, work, lwork, ierr )
+                    if( ierr>0 ) then
+                       ! swap is rejected: exit.
+                       info = 1
+                       if( wantp ) then
+                          pl = zero
+                          pr = zero
+                       end if
+                       if( wantd ) then
+                          dif( 1 ) = zero
+                          dif( 2 ) = zero
+                       end if
+                       go to 60
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_30
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l
+              ! and compute pl and pr.
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              ijb = 0
+              call stdlib_qlacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_qlacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              call stdlib_qtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto left
+              ! and right eigenspaces.
+              rdscal = zero
+              dsum = one
+              call stdlib_qlassq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_qlassq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates of difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu-estimate.
+                 call stdlib_qtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl-estimate.
+                 call stdlib_qtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_qlacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_qlacn2( mn2, work( mn2+1 ), work, iwork, dif( 1 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_qtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_qtgsyl( 'T', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_qlacn2( mn2, work( mn2+1 ), work, iwork, dif( 2 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_qtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_qtgsyl( 'T', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           60 continue
+           ! compute generalized eigenvalues of reordered pair (a, b) and
+           ! normalize the generalized schur form.
+           pair = .false.
+           loop_80: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                    end if
+                 end if
+                 if( pair ) then
+                   ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_qlag2( work, 2, work( 5 ), 2, smlnum*eps, beta( k ),beta( k+1 ), &
+                              alphar( k ), alphar( k+1 ),alphai( k ) )
+                    alphai( k+1 ) = -alphai( k )
+                 else
+                    if( sign( one, b( k, k ) )<zero ) then
+                       ! if b(k,k) is negative, make it positive
+                       do i = 1, n
+                          a( k, i ) = -a( k, i )
+                          b( k, i ) = -b( k, i )
+                          if( wantq ) q( i, k ) = -q( i, k )
+                       end do
+                    end if
+                    alphar( k ) = a( k, k )
+                    alphai( k ) = zero
+                    beta( k ) = b( k, k )
+                 end if
+              end if
+           end do loop_80
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qtgsen
+
+     !> DTGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two real upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine DGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**T *A*Q = D1*( 0 R ),    V**T *B*Q = D2*( 0 R ),
+     !> where U, V and Q are orthogonal matrices.
+     !> R is a nonsingular upper triangular matrix, and D1 and D2 are
+     !> ``diagonal'' matrices, which are of the following structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the orthogonal transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_qtgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(qp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           real(qp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(qp), parameter :: hugenum = huge(zero)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(qp) :: a1, a2, a3, b1, b2, b3, csq, csu, csv, error, gamma, rwk, snq, snu, snv, &
+                     ssmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_qlaset( 'FULL', m, m, zero, one, u, ldu )
+           if( initv )call stdlib_qlaset( 'FULL', p, p, zero, one, v, ldv )
+           if( initq )call stdlib_qlaset( 'FULL', n, n, zero, one, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = zero
+                    a3 = zero
+                    if( k+i<=m )a1 = a( k+i, n-l+i )
+                    if( k+j<=m )a3 = a( k+j, n-l+j )
+                    b1 = b( i, n-l+i )
+                    b3 = b( j, n-l+j )
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_qlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**t *a
+                    if( k+j<=m )call stdlib_qrot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, snu )
+                    ! update i-th and j-th rows of matrix b: v**t *b
+                    call stdlib_qrot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, snv )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_qrot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_qrot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = zero
+                       b( i, n-l+j ) = zero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = zero
+                       b( j, n-l+i ) = zero
+                    end if
+                    ! update orthogonal matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_qrot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_qrot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_qrot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_qcopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_qcopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_qlapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = a( k+i, n-l+i )
+              b1 = b( i, n-l+i )
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 ! change sign if necessary
+                 if( gamma<zero ) then
+                    call stdlib_qscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_qscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_qlartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_qscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_qscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_qcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_qcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_qtgsja
+
+     !> DTGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B) in
+     !> generalized real Schur canonical form (or of any matrix pair
+     !> (Q*A*Z**T, Q*B*Z**T) with orthogonal matrices Q and Z, where
+     !> Z**T denotes the transpose of Z.
+     !> (A, B) must be in generalized real Schur form (as returned by DGGES),
+     !> i.e. A is block upper triangular with 1-by-1 and 2-by-2 diagonal
+     !> blocks. B is upper triangular.
+
+     pure subroutine stdlib_qtgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           real(qp), intent(out) :: dif(*), s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: difdri = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, iz, k, ks, lwmin, n1, n2
+           real(qp) :: alphai, alphar, alprqt, beta, c1, c2, cond, eps, lnrm, rnrm, root1, root2, &
+                     scale, smlnum, tmpii, tmpir, tmpri, tmprr, uhav, uhavi, uhbv, uhbvi
+           ! Local Arrays 
+           real(qp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( a( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*( n + 2 ) + 16
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              ! determine whether a(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_20
+              else
+                 if( k<n )pair = a( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_20
+                 else
+                    if( .not.select( k ) )cycle loop_20
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( pair ) then
+                    ! complex eigenvalue pair.
+                    rnrm = stdlib_qlapy2( stdlib_qnrm2( n, vr( 1, ks ), 1 ),stdlib_qnrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_qlapy2( stdlib_qnrm2( n, vl( 1, ks ), 1 ),stdlib_qnrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    call stdlib_qgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_qdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_qgemv( 'N', n, n, one, a, lda, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_qdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    uhav = tmprr + tmpii
+                    uhavi = tmpir - tmpri
+                    call stdlib_qgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_qdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_qgemv( 'N', n, n, one, b, ldb, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_qdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    uhbv = tmprr + tmpii
+                    uhbvi = tmpir - tmpri
+                    uhav = stdlib_qlapy2( uhav, uhavi )
+                    uhbv = stdlib_qlapy2( uhbv, uhbvi )
+                    cond = stdlib_qlapy2( uhav, uhbv )
+                    s( ks ) = cond / ( rnrm*lnrm )
+                    s( ks+1 ) = s( ks )
+                 else
+                    ! real eigenvalue.
+                    rnrm = stdlib_qnrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_qnrm2( n, vl( 1, ks ), 1 )
+                    call stdlib_qgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhav = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    call stdlib_qgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhbv = stdlib_qdot( n, work, 1, vl( 1, ks ), 1 )
+                    cond = stdlib_qlapy2( uhav, uhbv )
+                    if( cond==zero ) then
+                       s( ks ) = -one
+                    else
+                       s( ks ) = cond / ( rnrm*lnrm )
+                    end if
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_qlapy2( a( 1, 1 ), b( 1, 1 ) )
+                    cycle loop_20
+                 end if
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvectors.
+                 if( pair ) then
+                    ! copy the  2-by 2 pencil beginning at (a(k,k), b(k, k)).
+                    ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_qlag2( work, 2, work( 5 ), 2, smlnum*eps, beta,dummy1( 1 ), &
+                              alphar, dummy( 1 ), alphai )
+                    alprqt = one
+                    c1 = two*( alphar*alphar+alphai*alphai+beta*beta )
+                    c2 = four*beta*beta*alphai*alphai
+                    root1 = c1 + sqrt( c1*c1-4.0_qp*c2 )
+                    root2 = c2 / root1
+                    root1 = root1 / two
+                    cond = min( sqrt( root1 ), sqrt( root2 ) )
+                 end if
+                 ! copy the matrix (a, b) to the array work and swap the
+                 ! diagonal block beginning at a(k,k) to the (1,1) position.
+                 call stdlib_qlacpy( 'FULL', n, n, a, lda, work, n )
+                 call stdlib_qlacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_qtgexc( .false., .false., n, work, n, work( n*n+1 ), n,dummy, 1, &
+                           dummy1, 1, ifst, ilst,work( n*n*2+1 ), lwork-2*n*n, ierr )
+                 if( ierr>0 ) then
+                    ! ill-conditioned problem - swap rejected.
+                    dif( ks ) = zero
+                 else
+                    ! reordering successful, solve generalized sylvester
+                    ! equation for r and l,
+                               ! a22 * r - l * a11 = a12
+                               ! b22 * r - l * b11 = b12,
+                    ! and compute estimate of difl((a11,b11), (a22, b22)).
+                    n1 = 1
+                    if( work( 2 )/=zero )n1 = 2
+                    n2 = n - n1
+                    if( n2==0 ) then
+                       dif( ks ) = cond
+                    else
+                       i = n*n + 1
+                       iz = 2*n*n + 1
+                       call stdlib_qtgsyl( 'N', difdri, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ),work( iz+1 ), lwork-2*n*n, iwork, ierr )
+                       if( pair )dif( ks ) = min( max( one, alprqt )*dif( ks ),cond )
+                    end if
+                 end if
+                 if( pair )dif( ks+1 ) = dif( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_qtgsna
+
+     !> DTGSY2: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                (1)
+     !> D * R - L * E = scale * F,
+     !> using Level 1 and 2 BLAS. where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively, with real entries. (A, D) and (B, E)
+     !> must be in generalized Schur canonical form, i.e. A, B are upper
+     !> quasi triangular and D, E are upper triangular. The solution (R, L)
+     !> overwrites (C, F). 0 <= SCALE <= 1 is an output scaling factor
+     !> chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Z*x = scale*b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ],
+     !> Ik is the identity matrix of size k and X**T is the transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> In the process of solving (1), we solve a number of such systems
+     !> where Dim(In), Dim(In) = 1 or 2.
+     !> If TRANS = 'T', solve the transposed system Z**T*y = scale*b for y,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R  + D**T * L   = scale * C           (3)
+     !> R  * B**T + L  * E**T  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> sigma_min(Z) using reverse communication with DLACON.
+     !> DTGSY2 also (IJOB >= 1) contributes to the computation in DTGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of the matrix pair in
+     !> DTGSYL. See DTGSYL for details.
+
+     pure subroutine stdlib_qtgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,iwork, pq, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info, pq
+           real(qp), intent(inout) :: rdscal, rdsum
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(qp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_qcopy by calls to stdlib_qlaset.
+        ! sven hammarling, 27/5/02.
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 8
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ie, ierr, ii, is, isp1, j, je, jj, js, jsp1, k, mb, nb, p, q, &
+                     zdim
+           real(qp) :: alpha, scaloc
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           real(qp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSY2', -info )
+              return
+           end if
+           ! determine block structure of a
+           pq = 0
+           p = 0
+           i = 1
+           10 continue
+           if( i>m )go to 20
+           p = p + 1
+           iwork( p ) = i
+           if( i==m )go to 20
+           if( a( i+1, i )/=zero ) then
+              i = i + 2
+           else
+              i = i + 1
+           end if
+           go to 10
+           20 continue
+           iwork( p+1 ) = m + 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           30 continue
+           if( j>n )go to 40
+           q = q + 1
+           iwork( q ) = j
+           if( j==n )go to 40
+           if( b( j+1, j )/=zero ) then
+              j = j + 2
+           else
+              j = j + 1
+           end if
+           go to 30
+           40 continue
+           iwork( q+1 ) = n + 1
+           pq = p*( q-p-1 )
+           if( notran ) then
+              ! solve (i, j) - subsystem
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+              scale = one
+              scaloc = one
+              loop_120: do j = p + 2, q
+                 js = iwork( j )
+                 jsp1 = js + 1
+                 je = iwork( j+1 ) - 1
+                 nb = je - js + 1
+                 loop_110: do i = p, 1, -1
+                    is = iwork( i )
+                    isp1 = is + 1
+                    ie = iwork( i+1 ) - 1
+                    mb = ie - is + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = d( is, is )
+                       z( 1, 2 ) = -b( js, js )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_qlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_qaxpy( is-1, alpha, a( 1, is ), 1, c( 1, js ),1 )
+                          call stdlib_qaxpy( is-1, alpha, d( 1, is ), 1, f( 1, js ),1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_qaxpy( n-je, rhs( 2 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_qaxpy( n-je, rhs( 2 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = d( is, is )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = -b( js, jsp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = -e( js, jsp1 )
+                       z( 1, 4 ) = -b( jsp1, js )
+                       z( 2, 4 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_qlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_qger( is-1, nb, -one, a( 1, is ), 1, rhs( 1 ),1, c( 1, js ),&
+                                     ldc )
+                          call stdlib_qger( is-1, nb, -one, d( 1, is ), 1, rhs( 1 ),1, f( 1, js ),&
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_qaxpy( n-je, rhs( 3 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_qaxpy( n-je, rhs( 3 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                          call stdlib_qaxpy( n-je, rhs( 4 ), b( jsp1, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_qaxpy( n-je, rhs( 4 ), e( jsp1, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = d( is, isp1 )
+                       z( 4, 2 ) = d( isp1, isp1 )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = -b( js, js )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_qlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_qgemv( 'N', is-1, mb, -one, a( 1, is ), lda,rhs( 1 ), 1, &
+                                    one, c( 1, js ), 1 )
+                          call stdlib_qgemv( 'N', is-1, mb, -one, d( 1, is ), ldd,rhs( 1 ), 1, &
+                                    one, f( 1, js ), 1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_qger( mb, n-je, one, rhs( 3 ), 1,b( js, je+1 ), ldb, c( is, &
+                                    je+1 ), ldc )
+                          call stdlib_qger( mb, n-je, one, rhs( 3 ), 1,e( js, je+1 ), lde, f( is, &
+                                    je+1 ), ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z * x = rhs
+                       call stdlib_qlaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 5, 1 ) = d( is, is )
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 5, 2 ) = d( is, isp1 )
+                       z( 6, 2 ) = d( isp1, isp1 )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( isp1, is )
+                       z( 7, 3 ) = d( is, is )
+                       z( 3, 4 ) = a( is, isp1 )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 7, 4 ) = d( is, isp1 )
+                       z( 8, 4 ) = d( isp1, isp1 )
+                       z( 1, 5 ) = -b( js, js )
+                       z( 3, 5 ) = -b( js, jsp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 7, 5 ) = -e( js, jsp1 )
+                       z( 2, 6 ) = -b( js, js )
+                       z( 4, 6 ) = -b( js, jsp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 8, 6 ) = -e( js, jsp1 )
+                       z( 1, 7 ) = -b( jsp1, js )
+                       z( 3, 7 ) = -b( jsp1, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 2, 8 ) = -b( jsp1, js )
+                       z( 4, 8 ) = -b( jsp1, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_qcopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_qcopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_qlatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_qcopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_qcopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_qgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, rhs( 1 &
+                                    ), mb, one,c( 1, js ), ldc )
+                          call stdlib_qgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, rhs( 1 &
+                                    ), mb, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          k = mb*nb + 1
+                          call stdlib_qgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, b( js, je+&
+                                    1 ), ldb, one,c( is, je+1 ), ldc )
+                          call stdlib_qgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, e( js, je+&
+                                    1 ), lde, one,f( is, je+1 ), ldf )
+                       end if
+                    end if
+                 end do loop_110
+              end do loop_120
+           else
+              ! solve (i, j) - subsystem
+                   ! a(i, i)**t * r(i, j) + d(i, i)**t * l(j, j)  =  c(i, j)
+                   ! r(i, i)  * b(j, j) + l(i, j)  * e(j, j)  = -f(i, j)
+              ! for i = 1, 2, ..., p, j = q, q - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_200: do i = 1, p
+                 is = iwork( i )
+                 isp1 = is + 1
+                 ie = iwork ( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_190: do j = q, p + 2, -1
+                    js = iwork( j )
+                    jsp1 = js + 1
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = -b( js, js )
+                       z( 1, 2 ) = d( is, is )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          alpha = rhs( 1 )
+                          call stdlib_qaxpy( js-1, alpha, b( 1, js ), 1, f( is, 1 ),ldf )
+                          alpha = rhs( 2 )
+                          call stdlib_qaxpy( js-1, alpha, e( 1, js ), 1, f( is, 1 ),ldf )
+                       end if
+                       if( i<p ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_qaxpy( m-ie, alpha, a( is, ie+1 ), lda,c( ie+1, js ), 1 )
+                                    
+                          alpha = -rhs( 2 )
+                          call stdlib_qaxpy( m-ie, alpha, d( is, ie+1 ), ldd,c( ie+1, js ), 1 )
+                                    
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = -b( js, jsp1 )
+                       z( 4, 2 ) = -b( jsp1, jsp1 )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( is, is )
+                       z( 3, 4 ) = -e( js, jsp1 )
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z**t * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_qaxpy( js-1, rhs( 1 ), b( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_qaxpy( js-1, rhs( 2 ), b( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_qaxpy( js-1, rhs( 3 ), e( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_qaxpy( js-1, rhs( 4 ), e( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                       end if
+                       if( i<p ) then
+                          call stdlib_qger( m-ie, nb, -one, a( is, ie+1 ), lda,rhs( 1 ), 1, c( ie+&
+                                    1, js ), ldc )
+                          call stdlib_qger( m-ie, nb, -one, d( is, ie+1 ), ldd,rhs( 3 ), 1, c( ie+&
+                                    1, js ), ldc )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = -b( js, js )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = d( is, isp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( isp1, isp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_qger( mb, js-1, one, rhs( 1 ), 1, b( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                          call stdlib_qger( mb, js-1, one, rhs( 3 ), 1, e( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_qgemv( 'T', mb, m-ie, -one, a( is, ie+1 ),lda, rhs( 1 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                          call stdlib_qgemv( 'T', mb, m-ie, -one, d( is, ie+1 ),ldd, rhs( 3 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z**t * x = rhs
+                       call stdlib_qlaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 5, 1 ) = -b( js, js )
+                       z( 7, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 6, 2 ) = -b( js, js )
+                       z( 8, 2 ) = -b( jsp1, js )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( is, isp1 )
+                       z( 5, 3 ) = -b( js, jsp1 )
+                       z( 7, 3 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = a( isp1, is )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 6, 4 ) = -b( js, jsp1 )
+                       z( 8, 4 ) = -b( jsp1, jsp1 )
+                       z( 1, 5 ) = d( is, is )
+                       z( 2, 5 ) = d( is, isp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 2, 6 ) = d( isp1, isp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 3, 7 ) = d( is, is )
+                       z( 4, 7 ) = d( is, isp1 )
+                       z( 5, 7 ) = -e( js, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 4, 8 ) = d( isp1, isp1 )
+                       z( 6, 8 ) = -e( js, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_qcopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_qcopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z**t * x = rhs
+                       call stdlib_qgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_qgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_qcopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_qcopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_qgemm( 'N', 'T', mb, js-1, nb, one,c( is, js ), ldc, b( 1, &
+                                    js ), ldb, one,f( is, 1 ), ldf )
+                          call stdlib_qgemm( 'N', 'T', mb, js-1, nb, one,f( is, js ), ldf, e( 1, &
+                                    js ), lde, one,f( is, 1 ), ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_qgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( &
+                                    is, js ), ldc,one, c( ie+1, js ), ldc )
+                          call stdlib_qgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( &
+                                    is, js ), ldf,one, c( ie+1, js ), ldc )
+                       end if
+                    end if
+                 end do loop_190
+              end do loop_200
+           end if
+           return
+     end subroutine stdlib_qtgsy2
+
+     !> DTGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                 (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with real entries. (A, D) and (B, E) must be in
+     !> generalized (real) Schur canonical form, i.e. A, B are upper quasi
+     !> triangular and D, E are upper triangular.
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve  Zx = scale b, where
+     !> Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]         (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ].
+     !> Here Ik is the identity matrix of size k and X**T is the transpose of
+     !> X. kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'T', DTGSYL solves the transposed system Z**T*y = scale*b,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R + D**T * L = scale * C           (3)
+     !> R * B**T + L * E**T = scale * -F
+     !> This case (TRANS = 'T') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using DLACON.
+     !> If IJOB >= 1, DTGSYL computes a Frobenius norm-based estimate
+     !> of Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z. See [1-2] for more
+     !> information.
+     !> This is a level 3 BLAS algorithm.
+
+     pure subroutine stdlib_qtgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(qp), intent(inout) :: c(ldc,*), f(ldf,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_qcopy by calls to stdlib_qlaset.
+        ! sven hammarling, 1/5/02.
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, ppqq, pq, q
+           real(qp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: real,max,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'DTGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'DTGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_qlaset( 'F', m, n, zero, zero, c, ldc )
+                 call stdlib_qlaset( 'F', m, n, zero, zero, f, ldf )
+              else if( ijob>=1 ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              loop_30: do iround = 1, isolve
+                 ! use unblocked level 2 solver
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 call stdlib_qtgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,iwork, pq, info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_qlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_qlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_qlaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_qlaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_qlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_qlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           if( a( i, i-1 )/=zero )i = i + 1
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           if( b( j, j-1 )/=zero )j = j + 1
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j)-subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1,..., 1; j = 1, 2,..., q
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 scale = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       ppqq = 0
+                       call stdlib_qtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + ppqq
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_qscal( is-1, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( is-1, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_qscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                             call stdlib_qscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_qgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, c( is, &
+                                    js ), ldc, one,c( 1, js ), ldc )
+                          call stdlib_qgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, c( is, &
+                                    js ), ldc, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_qgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, b( js, &
+                                    je+1 ), ldb,one, c( is, je+1 ), ldc )
+                          call stdlib_qgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, e( js, &
+                                    je+1 ), lde,one, f( is, je+1 ), ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_qlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_qlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_qlaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_qlaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_qlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_qlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                   ! a(i, i)**t * r(i, j)  + d(i, i)**t * l(i, j)  =  c(i, j)
+                   ! r(i, j)  * b(j, j)**t + l(i, j)  * e(j, j)**t = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_qtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_qscal( is-1, scaloc, c( 1, k ), 1 )
+                          call stdlib_qscal( is-1, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_qscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                          call stdlib_qscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                       end do
+                       do k = je + 1, n
+                          call stdlib_qscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_qscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_qgemm( 'N', 'T', mb, js-1, nb, one, c( is, js ),ldc, b( 1, js )&
+                                 , ldb, one, f( is, 1 ),ldf )
+                       call stdlib_qgemm( 'N', 'T', mb, js-1, nb, one, f( is, js ),ldf, e( 1, js )&
+                                 , lde, one, f( is, 1 ),ldf )
+                    end if
+                    if( i<p ) then
+                       call stdlib_qgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( is, &
+                                 js ), ldc, one,c( ie+1, js ), ldc )
+                       call stdlib_qgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( is, &
+                                 js ), ldf, one,c( ie+1, js ), ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_qtgsyl
+
+     !> DTPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_qtpcon( norm, uplo, diag, n, ap, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=qp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_qlantp( norm, uplo, diag, n, ap, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_qlatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_qlatps( uplo, 'TRANSPOSE', diag, normin, n, ap,work, scale, work( &
+                              2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iqamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_qrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_qtpcon
+
+     !> DTPLQT: computes a blocked LQ factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_qtplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_qtplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_qtprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_qtplqt
+
+     !> DTPLQT2: computes a LQ a factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_qtplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_qlarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              if( i<m ) then
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_qgemv( 'N', m-i, p, one, b( i+1, 1 ), ldb,b( i, 1 ), ldb, one, t( m, &
+                           1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_qger( m-i, p, alpha,  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n) * (alpha * c(i,i:n)^h)
+              alpha = -t( 1, i )
+              do j = 1, i-1
+                 t( i, j ) = zero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = alpha*b( i, n-l+j )
+              end do
+              call stdlib_qtrmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_qgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, zero, t(&
+                         i,mp ), ldt )
+              ! b1
+              call stdlib_qgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,one, t( i, 1 ), ldt &
+                        )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+             call stdlib_qtrmv( 'L', 'T', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = zero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=t(j,i)
+                 t(j,i)= zero
+              end do
+           end do
+     end subroutine stdlib_qtplqt2
+
+     !> DTPMQRT applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_qtpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           real(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_qtprfb( 'L', 'T', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_qtprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'R', 'T', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_qtpmlqt
+
+     !> DTPMQRT: applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_qtpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           real(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'L', 'T', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_qtprfb( 'R', 'T', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_qtpmqrt
+
+     !> DTPQRT: computes a blocked QR factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_qtpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_qtpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_qtprfb( 'L', 'T', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_qtpqrt
+
+     !> DTPQRT2: computes a QR factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_qtpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_qlarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)^h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = (a( i, i+j ))
+                 end do
+                 call stdlib_qgemv( 'T', p, n-i, one, b( 1, i+1 ), ldb,b( 1, i ), 1, one, t( 1, n &
+                           ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*(t( j, n ))
+                 end do
+                 call stdlib_qger( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)^h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = zero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_qtrmv( 'U', 'T', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_qgemv( 'T', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, zero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_qgemv( 'T', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,one, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_qtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = zero
+           end do
+     end subroutine stdlib_qtpqrt2
+
+     !> DTPRFB: applies a real "triangular-pentagonal" block reflector H or its
+     !> transpose H**T to a real matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_qtprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(qp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(qp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h**t c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a -   t (a + v**t b)  or  a = a -   t**t (a + v**t b)
+              ! b = b - v t (a + v**t b)  or  b = b - v t**t (a + v**t b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'U', 'T', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_qgemm( 'T', 'N', l, n, m-l, one, v, ldv, b, ldb,one, work, ldwork )
+                        
+              call stdlib_qgemm( 'T', 'N', k-l, n, m, one, v( 1, kp ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_qgemm( 'N', 'N', l, n, k-l, -one, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ),  ldb )
+              call stdlib_qtrmm( 'L', 'U', 'N', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h**t  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**t
+              ! b = b - (a + b v) t v**t  or  b = b - (a + b v) t**t v**t
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_qgemm( 'N', 'N', m, l, n-l, one, b, ldb,v, ldv, one, work, ldwork )
+                        
+              call stdlib_qgemm( 'N', 'N', m, k-l, n, one, b, ldb,v( 1, kp ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_qgemm( 'N', 'T', m, l, k-l, -one, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_qtrmm( 'R', 'U', 'T', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h**t c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a -   t (a + v**t b)  or  a = a -   t**t (a + v**t b)
+              ! b = b - v t (a + v**t b)  or  b = b - v t**t (a + v**t b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_qgemm( 'T', 'N', l, n, m-l, one, v( mp, kp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_qgemm( 'T', 'N', k-l, n, m, one, v, ldv,b, ldb, zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'L', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'N', m-l, n, k, -one, v( mp, 1 ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_qgemm( 'N', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b,  ldb )
+                        
+              call stdlib_qtrmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h**t  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w**t          or  h**t = i - w t**t w**t
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**t
+              ! b = b - (a + b v) t v**t  or  b = b - (a + b v) t**t v**t
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_qgemm( 'N', 'N', m, l, n-l, one, b( 1, np ), ldb,v( np, kp ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_qgemm( 'N', 'N', m, k-l, n, one, b, ldb,v, ldv, zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v( np, 1 ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_qgemm( 'N', 'T', m, l, k-l, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_qtrmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**t c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w**t t w          or  h**t = i - w**t t**t w
+              ! a = a -     t (a + v b)  or  a = a -     t**t (a + v b)
+              ! b = b - v**t t (a + v b)  or  b = b - v**t t**t (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_qgemm( 'N', 'N', l, n, m-l, one, v, ldv,b, ldb,one, work, ldwork )
+                        
+              call stdlib_qgemm( 'N', 'N', k-l, n, m, one, v( kp, 1 ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_qgemm( 'T', 'N', l, n, k-l, -one, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ), ldb )
+              call stdlib_qtrmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**t  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**t t w            or  h**t = i - w**t t**t w
+              ! a = a - (a + b v**t) t      or  a = a - (a + b v**t) t**t
+              ! b = b - (a + b v**t) t v    or  b = b - (a + b v**t) t**t v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_qgemm( 'N', 'T', m, l, n-l, one, b, ldb, v, ldv,one, work, ldwork )
+                        
+              call stdlib_qgemm( 'N', 'T', m, k-l, n, one, b, ldb,v( kp, 1 ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_qgemm( 'N', 'N', m, l, k-l, -one, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_qtrmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**t c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w**t t w          or  h**t = i - w**t t**t w
+              ! a = a -     t (a + v b)  or  a = a -     t**t (a + v b)
+              ! b = b - v**t t (a + v b)  or  b = b - v**t t**t (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'U', 'N', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_qgemm( 'N', 'N', l, n, m-l, one, v( kp, mp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_qgemm( 'N', 'N', k-l, n, m, one, v, ldv, b, ldb,zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'L', 'L ', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', m-l, n, k, -one, v( 1, mp ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_qgemm( 'T', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b, ldb )
+                        
+              call stdlib_qtrmm( 'L', 'U', 'T', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**t  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**t t w            or  h**t = i - w**t t**t w
+              ! a = a - (a + b v**t) t      or  a = a - (a + b v**t) t**t
+              ! b = b - (a + b v**t) t v    or  b = b - (a + b v**t) t**t v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'U', 'T', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_qgemm( 'N', 'T', m, l, n-l, one, b( 1, np ), ldb,v( kp, np ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_qgemm( 'N', 'T', m, k-l, n, one, b, ldb, v, ldv,zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_qtrmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_qgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v( 1, np ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_qgemm( 'N', 'N', m, l, k-l , -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_qtrmm( 'R', 'U', 'N', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_qtprfb
+
+     !> DTPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTPTRS or some other
+     !> means before entering this routine.  DTPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_qtprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_qcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qtpmv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+              call stdlib_qaxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qtpsv( uplo, transt, diag, n, ap, work( n+1 ), 1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qtpsv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_qtprfs
+
+     !> DTPTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_qtptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           real(qp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==zero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==zero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = one / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_qtpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_qscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = one / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_qtpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_qscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_qtptri
+
+     !> DTPTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_qtptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==zero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==zero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_qtpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_qtptrs
+
+     !> DTPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_qtpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: ap(0:*)
+           real(qp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = ap( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtpttf
+
+     !> DTPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_qtpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(qp), intent(out) :: a(lda,*)
+           real(qp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_qtpttr
+
+     !> DTRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_qtrcon( norm, uplo, diag, n, a, lda, rcond, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=qp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_qlantr( norm, uplo, diag, n, n, a, lda, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_qlacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_qlatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_qlatrs( uplo, 'TRANSPOSE', diag, normin, n, a, lda,work, scale, &
+                              work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iqamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_qrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_qtrcon
+
+     !> DTREVC: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by DHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+
+     pure subroutine stdlib_qtrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(qp), intent(in) :: t(ldt,*)
+           real(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, n2
+           real(qp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(qp) :: x(2,2)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_qlabad( unfl, ovfl )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+           n2 = 2*n
+           if( rightv ) then
+              ! compute right eigenvectors.
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==1 )go to 130
+                 if( ki==1 )go to 40
+                 if( t( ki, ki-1 )==zero )go to 40
+                 ip = -1
+                 40 continue
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )go to 130
+                    else
+                       if( .not.select( ki-1 ) )go to 130
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real right eigenvector
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = 1, ki - 1
+                       work( k+n ) = -t( k, ki )
+                    end do
+                    ! solve the upper quasi-triangular system:
+                       ! (t(1:ki-1,1:ki-1) - wr)*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_qlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( ki, scale, work( 1+n ), 1 )
+                          work( j+n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_qaxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_qlaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( ki, scale, work( 1+n ), 1 )
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_qaxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_qcopy( ki, work( 1+n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_iqamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_qscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>1 )call stdlib_qgemv( 'N', n, ki-1, one, vr, ldvr,work( 1+n ), 1, &
+                                 work( ki+n ),vr( 1, ki ), 1 )
+                       ii = stdlib_iqamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_qscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex right eigenvector.
+                    ! initial solve
+                      ! [ (t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i* wi)]*x = 0.
+                      ! [ (t(ki,ki-1)   t(ki,ki)   )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1+n ) = one
+                       work( ki+n2 ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1+n ) = -wi / t( ki, ki-1 )
+                       work( ki+n2 ) = one
+                    end if
+                    work( ki+n ) = zero
+                    work( ki-1+n2 ) = zero
+                    ! form right-hand side
+                    do k = 1, ki - 2
+                       work( k+n ) = -work( ki-1+n )*t( k, ki-1 )
+                       work( k+n2 ) = -work( ki+n2 )*t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! (t(1:ki-2,1:ki-2) - (wr+i*wi))*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_qlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr, wi,x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_qscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_qaxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_qlaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, wi, x, 2, scale,xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_qscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          work( j-1+n2 ) = x( 1, 2 )
+                          work( j+n2 ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_qaxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+n2 ), 1 )
+                                    
+                          call stdlib_qaxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_qcopy( ki, work( 1+n ), 1, vr( 1, is-1 ), 1 )
+                       call stdlib_qcopy( ki, work( 1+n2 ), 1, vr( 1, is ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_qscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>2 ) then
+                          call stdlib_qgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n ), 1, work( ki-&
+                                    1+n ),vr( 1, ki-1 ), 1 )
+                          call stdlib_qgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n2 ), 1, work( &
+                                    ki+n2 ),vr( 1, ki ), 1 )
+                       else
+                          call stdlib_qscal( n, work( ki-1+n ), vr( 1, ki-1 ), 1 )
+                          call stdlib_qscal( n, work( ki+n2 ), vr( 1, ki ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_qscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 end if
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+                 130 continue
+                 if( ip==1 )ip = 0
+                 if( ip==-1 )ip = 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==-1 )go to 250
+                 if( ki==n )go to 150
+                 if( t( ki+1, ki )==zero )go to 150
+                 ip = 1
+                 150 continue
+                 if( somev ) then
+                    if( .not.select( ki ) )go to 250
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real left eigenvector.
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = ki + 1, n
+                       work( k+n ) = -t( ki, k )
+                    end do
+                    ! solve the quasi-triangular system:
+                       ! (t(ki+1:n,ki+1:n) - wr)**t*x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_qdot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          ! solve (t(j,j)-wr)**t*x = work
+                          call stdlib_qlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_qdot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_qdot( j-ki-1, t( ki+1, j+1 ), 1,&
+                                    work( ki+1+n ), 1 )
+                          ! solve
+                            ! [t(j,j)-wr   t(j,j+1)     ]**t * x = scale*( work1 )
+                            ! [t(j+1,j)    t(j+1,j+1)-wr]                ( work2 )
+                          call stdlib_qlaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          work( j+1+n ) = x( 2, 1 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+1+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_qcopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       ii = stdlib_iqamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else
+                       if( ki<n )call stdlib_qgemv( 'N', n, n-ki, one, vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1+n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                       ii = stdlib_iqamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_qscal( n, remax, vl( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex left eigenvector.
+                     ! initial solve:
+                       ! ((t(ki,ki)    t(ki,ki+1) )**t - (wr - i* wi))*x = 0.
+                       ! ((t(ki+1,ki) t(ki+1,ki+1))                )
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki+n ) = wi / t( ki, ki+1 )
+                       work( ki+1+n2 ) = one
+                    else
+                       work( ki+n ) = one
+                       work( ki+1+n2 ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1+n ) = zero
+                    work( ki+n2 ) = zero
+                    ! form right-hand side
+                    do k = ki + 2, n
+                       work( k+n ) = -work( ki+n )*t( ki, k )
+                       work( k+n2 ) = -work( ki+1+n2 )*t( ki+1, k )
+                    end do
+                    ! solve complex quasi-triangular system:
+                    ! ( t(ki+2,n:ki+2,n) - (wr-i*wi) )*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_qdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_qdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          ! solve (t(j,j)-(wr-i*wi))*(x11+i*x12)= wk+i*wk2
+                          call stdlib_qlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_qscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+n2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_qscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_qdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_qdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_qdot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n ), 1 )
+                          work( j+1+n2 ) = work( j+1+n2 ) -stdlib_qdot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n2 ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                            ! ([t(j,j)   t(j,j+1)  ]**t-(wr-i*wi)*i)*x = scale*b
+                            ! ([t(j+1,j) t(j+1,j+1)]               )
+                          call stdlib_qlaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_qscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          work( j+1+n ) = x( 2, 1 )
+                          work( j+1+n2 ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_qcopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       call stdlib_qcopy( n-ki+1, work( ki+n2 ), 1, vl( ki, is+1 ),1 )
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else
+                       if( ki<n-1 ) then
+                          call stdlib_qgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                          call stdlib_qgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n2 ), 1,work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_qscal( n, work( ki+n ), vl( 1, ki ), 1 )
+                          call stdlib_qscal( n, work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n, remax, vl( 1, ki ), 1 )
+                       call stdlib_qscal( n, remax, vl( 1, ki+1 ), 1 )
+                    end if
+                 end if
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+                 250 continue
+                 if( ip==-1 )ip = 0
+                 if( ip==1 )ip = -1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_qtrevc
+
+     !> DTREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by DHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**T)*T = w*(y**T)
+     !> where y**T denotes the transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_qtrevc3( side, howmny, select, n, t, ldt, vl, ldvl,vr, ldvr, mm, m, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(qp), intent(in) :: t(ldt,*)
+           real(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, iv, maxwrk, nb, &
+                     ki2
+           real(qp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(qp) :: x(2,2)
+           integer(ilp) :: iscomplex(nbmax)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'DTREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           lquery = ( lwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -14
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_qlaset( 'F', n, 1+2*nb, zero, zero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_qlabad( unfl, ovfl )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first  of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+             ! iscomplex array stores ip for each column in current block.
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex right vector, uses iv-1 for real part and iv for complex part.
+              ! non-blocked version always uses iv=2;
+              ! blocked     version starts with iv=nb, goes down to 1 or 2.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 2
+              if( nb>2 ) then
+                 iv = nb
+              end if
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==-1 ) then
+                    ! previous iteration (ki+1) was second of conjugate pair,
+                    ! so this ki is first of conjugate pair; skip to end of loop
+                    ip = 1
+                    cycle loop_140
+                 else if( ki==1 ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki, ki-1 )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is second of conjugate pair
+                    ip = -1
+                 end if
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )cycle loop_140
+                    else
+                       if( .not.select( ki-1 ) )cycle loop_140
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real right eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = -t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-1,1:ki-1) - wr ]*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_qlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_qaxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_qlaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+iv*n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j-1+iv*n ) = x( 1, 1 )
+                          work( j  +iv*n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_qaxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+iv*n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_qcopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_iqamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_qscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>1 )call stdlib_qgemv( 'N', n, ki-1, one, vr, ldvr,work( 1 + iv*n ), &
+                                 1, work( ki + iv*n ),vr( 1, ki ), 1 )
+                       ii = stdlib_iqamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_qscal( n, remax, vr( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex right eigenvector.
+                    ! initial solve
+                    ! [ ( t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i*wi) ]*x = 0.
+                    ! [ ( t(ki,  ki-1) t(ki,  ki) )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1 + (iv-1)*n ) = one
+                       work( ki   + (iv  )*n ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1 + (iv-1)*n ) = -wi / t( ki, ki-1 )
+                       work( ki   + (iv  )*n ) = one
+                    end if
+                    work( ki   + (iv-1)*n ) = zero
+                    work( ki-1 + (iv  )*n ) = zero
+                    ! form right-hand side.
+                    do k = 1, ki - 2
+                       work( k+(iv-1)*n ) = -work( ki-1+(iv-1)*n )*t(k,ki-1)
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(k,ki  )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-2,1:ki-2) - (wr+i*wi) ]*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_qlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+(iv-1)*n ), n,wr, wi, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_qscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j+(iv-1)*n ) = x( 1, 1 )
+                          work( j+(iv  )*n ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_qaxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n ), 1 )
+                                    
+                          call stdlib_qaxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_qlaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+(iv-1)*n ), n, wr, wi, x, 2,scale, xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_qscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j-1+(iv-1)*n ) = x( 1, 1 )
+                          work( j  +(iv-1)*n ) = x( 2, 1 )
+                          work( j-1+(iv  )*n ) = x( 1, 2 )
+                          work( j  +(iv  )*n ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_qaxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+(iv-1)*n   ),&
+                                     1 )
+                          call stdlib_qaxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n   ), &
+                                    1 )
+                          call stdlib_qaxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+(iv  )*n ), &
+                                    1 )
+                          call stdlib_qaxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_qcopy( ki, work( 1+(iv-1)*n ), 1, vr(1,is-1), 1 )
+                       call stdlib_qcopy( ki, work( 1+(iv  )*n ), 1, vr(1,is  ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_qscal( ki, remax, vr( 1, is   ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is   ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>2 ) then
+                          call stdlib_qgemv( 'N', n, ki-2, one, vr, ldvr,work( 1    + (iv-1)*n ), &
+                                    1,work( ki-1 + (iv-1)*n ), vr(1,ki-1), 1)
+                          call stdlib_qgemv( 'N', n, ki-2, one, vr, ldvr,work( 1  + (iv)*n ), 1,&
+                                    work( ki + (iv)*n ), vr( 1, ki ), 1 )
+                       else
+                          call stdlib_qscal( n, work(ki-1+(iv-1)*n), vr(1,ki-1), 1)
+                          call stdlib_qscal( n, work(ki  +(iv  )*n), vr(1,ki  ), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_qscal( n, remax, vr( 1, ki   ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + (iv-1)*n ) = zero
+                          work( k + (iv  )*n ) = zero
+                       end do
+                       iscomplex( iv-1 ) = -ip
+                       iscomplex( iv   ) =  ip
+                       iv = iv - 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki-1 and ki)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki - 1
+                    end if
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv<=2) .or. (ki2==1) ) then
+                       call stdlib_qgemm( 'N', 'N', n, nb-iv+1, ki2+nb-iv, one,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,zero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          if( iscomplex(k)==0 ) then
+                             ! real eigenvector
+                             ii = stdlib_iqamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1 ) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_qscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_qlacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki2 ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex left vector, uses iv for real part and iv+1 for complex part.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb-1 or nb.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 1
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==1 ) then
+                    ! previous iteration (ki-1) was first of conjugate pair,
+                    ! so this ki is second of conjugate pair; skip to end of loop
+                    ip = -1
+                    cycle loop_260
+                 else if( ki==n ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki+1, ki )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is first of conjugate pair
+                    ip = 1
+                 end if
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_260
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real left eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = ki + 1, n
+                       work( k + iv*n ) = -t( ki, k )
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+1:n,ki+1:n) - wr ]**t * x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_qdot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          ! solve [ t(j,j) - wr ]**t * x = work
+                          call stdlib_qlaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+iv*n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_qdot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          work( j+1+iv*n ) = work( j+1+iv*n ) -stdlib_qdot( j-ki-1, t( ki+1, j+1 )&
+                                    , 1,work( ki+1+iv*n ), 1 )
+                          ! solve
+                          ! [ t(j,j)-wr   t(j,j+1)      ]**t * x = scale*( work1 )
+                          ! [ t(j+1,j)    t(j+1,j+1)-wr ]                ( work2 )
+                          call stdlib_qlaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_qscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j  +iv*n ) = x( 1, 1 )
+                          work( j+1+iv*n ) = x( 2, 1 )
+                          vmax = max( abs( work( j  +iv*n ) ),abs( work( j+1+iv*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_qcopy( n-ki+1, work( ki + iv*n ), 1,vl( ki, is ), 1 )
+                       ii = stdlib_iqamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n )call stdlib_qgemv( 'N', n, n-ki, one,vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1 + iv*n ), 1,work( ki   + iv*n ), vl( 1, ki ), 1 )
+                       ii = stdlib_iqamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_qscal( n, remax, vl( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex left eigenvector.
+                    ! initial solve:
+                    ! [ ( t(ki,ki)    t(ki,ki+1)  )**t - (wr - i* wi) ]*x = 0.
+                    ! [ ( t(ki+1,ki) t(ki+1,ki+1) )                   ]
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki   + (iv  )*n ) = wi / t( ki, ki+1 )
+                       work( ki+1 + (iv+1)*n ) = one
+                    else
+                       work( ki   + (iv  )*n ) = one
+                       work( ki+1 + (iv+1)*n ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1 + (iv  )*n ) = zero
+                    work( ki   + (iv+1)*n ) = zero
+                    ! form right-hand side.
+                    do k = ki + 2, n
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(ki,  k)
+                       work( k+(iv+1)*n ) = -work( ki+1+(iv+1)*n )*t(ki+1,k)
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+2:n,ki+2:n)**t - (wr-i*wi) ]*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_qscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+(iv  )*n ) = work( j+(iv)*n ) -stdlib_qdot( j-ki-2, t( ki+2, j )&
+                                    , 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_qdot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve [ t(j,j)-(wr-i*wi) ]*(x11+i*x12)= wk+i*wk2
+                          call stdlib_qlaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_qscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j+(iv  )*n ) = x( 1, 1 )
+                          work( j+(iv+1)*n ) = x( 1, 2 )
+                          vmax = max( abs( work( j+(iv  )*n ) ),abs( work( j+(iv+1)*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_qscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_qscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j  +(iv  )*n ) = work( j+(iv)*n ) -stdlib_qdot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j  +(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_qdot( j-ki-2, t( ki+2,&
+                                     j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          work( j+1+(iv  )*n ) = work( j+1+(iv)*n ) -stdlib_qdot( j-ki-2, t( ki+2,&
+                                     j+1 ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+1+(iv+1)*n ) = work( j+1+(iv+1)*n ) -stdlib_qdot( j-ki-2, t( ki+&
+                                    2, j+1 ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                          ! [ (t(j,j)   t(j,j+1)  )**t - (wr-i*wi)*i ]*x = scale*b
+                          ! [ (t(j+1,j) t(j+1,j+1))                  ]
+                          call stdlib_qlaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_qscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_qscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j  +(iv  )*n ) = x( 1, 1 )
+                          work( j  +(iv+1)*n ) = x( 1, 2 )
+                          work( j+1+(iv  )*n ) = x( 2, 1 )
+                          work( j+1+(iv+1)*n ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ),vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_qcopy( n-ki+1, work( ki + (iv  )*n ), 1,vl( ki, is   ), 1 )
+                                 
+                       call stdlib_qcopy( n-ki+1, work( ki + (iv+1)*n ), 1,vl( ki, is+1 ), 1 )
+                                 
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is   ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is   ), 1 )
+                       call stdlib_qscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is   ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n-1 ) then
+                          call stdlib_qgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv)*n ), 1,work( ki   + (iv)*n ),vl( 1, ki ), 1 )
+                          call stdlib_qgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv+1)*n ), 1,work( ki+1 + (iv+1)*n ),vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_qscal( n, work(ki+  (iv  )*n), vl(1, ki  ), 1)
+                          call stdlib_qscal( n, work(ki+1+(iv+1)*n), vl(1, ki+1), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki   ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_qscal( n, remax, vl( 1, ki   ), 1 )
+                       call stdlib_qscal( n, remax, vl( 1, ki+1 ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + (iv  )*n ) = zero
+                          work( k + (iv+1)*n ) = zero
+                       end do
+                       iscomplex( iv   ) =  ip
+                       iscomplex( iv+1 ) = -ip
+                       iv = iv + 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki and ki+1)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki + 1
+                    end if
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv>=nb-1) .or. (ki2==n) ) then
+                       call stdlib_qgemm( 'N', 'N', n, iv, n-ki2+iv, one,vl( 1, ki2-iv+1 ), ldvl,&
+                                 work( ki2-iv+1 + (1)*n ), n,zero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          if( iscomplex(k)==0) then
+                             ! real eigenvector
+                             ii = stdlib_iqamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_qscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_qlacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki2-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_qtrevc3
+
+     !> DTREXC: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that the diagonal block of T with row index IFST is
+     !> moved to row ILST.
+     !> The real Schur form T is reordered by an orthogonal similarity
+     !> transformation Z**T*T*Z, and optionally the matrix Q of Schur vectors
+     !> is updated by postmultiplying it with Z.
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_qtrexc( compq, n, t, ldt, q, ldq, ifst, ilst, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldt, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: here, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test the input arguments.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.wantq .and. .not.stdlib_lsame( compq, 'N' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of specified block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ifst>1 ) then
+              if( t( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( t( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ilst>1 ) then
+              if( t( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( t( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap block with next one below
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( t( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, nbf, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( t( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here+1, 1, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, 1, nbnext,work, info )
+                              
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, 1,nbnext, work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here+1, 1, 1,work, info )
+                                 
+                       here = here + 2
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap block with next one above
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,nbf, work, &
+                           info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,1, work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, nbnext, 1,work, info )
+                              
+                    here = here - 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here-1, 2, 1,work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_qlaexc( wantq, n, t, ldt, q, ldq, here-1, 1, 1,work, info )
+                                 
+                       here = here - 2
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_qtrexc
+
+     !> DTRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by DTRTRS or some other
+     !> means before entering this routine.  DTRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_qtrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           real(qp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_qcopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_qtrmv( uplo, trans, diag, n, a, lda, work( n+1 ), 1 )
+              call stdlib_qaxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_qlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_qlacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_qtrsv( uplo, transt, diag, n, a, lda, work( n+1 ),1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_qtrsv( uplo, trans, diag, n, a, lda, work( n+1 ),1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_qtrrfs
+
+     !> DTRSEN: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that a selected cluster of eigenvalues appears in
+     !> the leading diagonal blocks of the upper quasi-triangular matrix T,
+     !> and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_qtrsen( job, compq, select, n, t, ldt, q, ldq, wr, wi,m, s, sep, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, liwork, lwork, n
+           real(qp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(qp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, kk, ks, liwmin, lwmin, n1, n2, nn
+           real(qp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else
+              ! set m to the dimension of the specified invariant subspace,
+              ! and test lwork and liwork.
+              m = 0
+              pair = .false.
+              do k = 1, n
+                 if( pair ) then
+                    pair = .false.
+                 else
+                    if( k<n ) then
+                       if( t( k+1, k )==zero ) then
+                          if( select( k ) )m = m + 1
+                       else
+                          pair = .true.
+                          if( select( k ) .or. select( k+1 ) )m = m + 2
+                       end if
+                    else
+                       if( select( n ) )m = m + 1
+                    end if
+                 end if
+              end do
+              n1 = m
+              n2 = n - m
+              nn = n1*n2
+              if( wantsp ) then
+                 lwmin = max( 1, 2*nn )
+                 liwmin = max( 1, nn )
+              else if( stdlib_lsame( job, 'N' ) ) then
+                 lwmin = max( 1, n )
+                 liwmin = 1
+              else if( stdlib_lsame( job, 'E' ) ) then
+                 lwmin = max( 1, nn )
+                 liwmin = 1
+              end if
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -15
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -17
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_qlange( '1', n, n, t, ldt, work )
+              go to 40
+           end if
+           ! collect the selected blocks at the top-left corner of t.
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( t( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ierr = 0
+                    kk = k
+                    if( k/=ks )call stdlib_qtrexc( compq, n, t, ldt, q, ldq, kk, ks, work,ierr )
+                              
+                    if( ierr==1 .or. ierr==2 ) then
+                       ! blocks too close to swap: exit.
+                       info = 1
+                       if( wants )s = zero
+                       if( wantsp )sep = zero
+                       go to 40
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_20
+           if( wants ) then
+              ! solve sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_qlacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_qtrsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_qlange( 'F', n1, n2, work, n1, work )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_qlacn2( nn, work( nn+1 ), work, iwork, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve  t11*r - r*t22 = scale*x.
+                    call stdlib_qtrsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**t*r - r*t22**t = scale*x.
+                    call stdlib_qtrsyl( 'T', 'T', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! store the output eigenvalues in wr and wi.
+           do k = 1, n
+              wr( k ) = t( k, k )
+              wi( k ) = zero
+           end do
+           do k = 1, n - 1
+              if( t( k+1, k )/=zero ) then
+                 wi( k ) = sqrt( abs( t( k, k+1 ) ) )*sqrt( abs( t( k+1, k ) ) )
+                 wi( k+1 ) = -wi( k )
+              end if
+           end do
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_qtrsen
+
+     !> DTRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a real upper
+     !> quasi-triangular matrix T (or of any matrix Q*T*Q**T with Q
+     !> orthogonal).
+     !> T must be in Schur canonical form (as returned by DHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_qtrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm, m, &
+               work, ldwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: s(*), sep(*), work(ldwork,*)
+           real(qp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: pair, somcon, wantbh, wants, wantsp
+           integer(ilp) :: i, ierr, ifst, ilst, j, k, kase, ks, n2, nn
+           real(qp) :: bignum, cond, cs, delta, dumm, eps, est, lnrm, mu, prod, prod1, prod2, &
+                     rnrm, scale, smlnum, sn
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(qp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( t( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -13
+              else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ks = 0
+           pair = .false.
+           loop_60: do k = 1, n
+              ! determine whether t(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_60
+              else
+                 if( k<n )pair = t( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_60
+                 else
+                    if( .not.select( k ) )cycle loop_60
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( .not.pair ) then
+                    ! real eigenvalue.
+                    prod = stdlib_qdot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    rnrm = stdlib_qnrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_qnrm2( n, vl( 1, ks ), 1 )
+                    s( ks ) = abs( prod ) / ( rnrm*lnrm )
+                 else
+                    ! complex eigenvalue.
+                    prod1 = stdlib_qdot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    prod1 = prod1 + stdlib_qdot( n, vr( 1, ks+1 ), 1, vl( 1, ks+1 ),1 )
+                    prod2 = stdlib_qdot( n, vl( 1, ks ), 1, vr( 1, ks+1 ), 1 )
+                    prod2 = prod2 - stdlib_qdot( n, vl( 1, ks+1 ), 1, vr( 1, ks ),1 )
+                    rnrm = stdlib_qlapy2( stdlib_qnrm2( n, vr( 1, ks ), 1 ),stdlib_qnrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_qlapy2( stdlib_qnrm2( n, vl( 1, ks ), 1 ),stdlib_qnrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    cond = stdlib_qlapy2( prod1, prod2 ) / ( rnrm*lnrm )
+                    s( ks ) = cond
+                    s( ks+1 ) = cond
+                 end if
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the diagonal
+                 ! block beginning at t(k,k) to the (1,1) position.
+                 call stdlib_qlacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_qtrexc( 'NO Q', n, work, ldwork, dummy, 1, ifst, ilst,work( 1, n+1 ),&
+                            ierr )
+                 if( ierr==1 .or. ierr==2 ) then
+                    ! could not swap because blocks not well separated
+                    scale = one
+                    est = bignum
+                 else
+                    ! reordering successful
+                    if( work( 2, 1 )==zero ) then
+                       ! form c = t22 - lambda*i in work(2:n,2:n).
+                       do i = 2, n
+                          work( i, i ) = work( i, i ) - work( 1, 1 )
+                       end do
+                       n2 = 1
+                       nn = n - 1
+                    else
+                       ! triangularize the 2 by 2 block by unitary
+                       ! transformation u = [  cs   i*ss ]
+                                          ! [ i*ss   cs  ].
+                       ! such that the (1,1) position of work is complex
+                       ! eigenvalue lambda with positive imaginary part. (2,2)
+                       ! position of work is the complex eigenvalue lambda
+                       ! with negative imaginary  part.
+                       mu = sqrt( abs( work( 1, 2 ) ) )*sqrt( abs( work( 2, 1 ) ) )
+                       delta = stdlib_qlapy2( mu, work( 2, 1 ) )
+                       cs = mu / delta
+                       sn = -work( 2, 1 ) / delta
+                       ! form
+                       ! c**t = work(2:n,2:n) + i*[rwork(1) ..... rwork(n-1) ]
+                                                ! [   mu                     ]
+                                                ! [         ..               ]
+                                                ! [             ..           ]
+                                                ! [                  mu      ]
+                       ! where c**t is transpose of matrix c,
+                       ! and rwork is stored starting in the n+1-st column of
+                       ! work.
+                       do j = 3, n
+                          work( 2, j ) = cs*work( 2, j )
+                          work( j, j ) = work( j, j ) - work( 1, 1 )
+                       end do
+                       work( 2, 2 ) = zero
+                       work( 1, n+1 ) = two*mu
+                       do i = 2, n - 1
+                          work( i, n+1 ) = sn*work( 1, i+1 )
+                       end do
+                       n2 = 2
+                       nn = 2*( n-1 )
+                    end if
+                    ! estimate norm(inv(c**t))
+                    est = zero
+                    kase = 0
+                    50 continue
+                    call stdlib_qlacn2( nn, work( 1, n+2 ), work( 1, n+4 ), iwork,est, kase, &
+                              isave )
+                    if( kase/=0 ) then
+                       if( kase==1 ) then
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c**t*x = scale*c.
+                             call stdlib_qlaqtr( .true., .true., n-1, work( 2, 2 ),ldwork, dummy, &
+                                       dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c**t*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_qlaqtr( .true., .false., n-1, work( 2, 2 ),ldwork, work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       else
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c*x = scale*c.
+                             call stdlib_qlaqtr( .false., .true., n-1, work( 2, 2 ),ldwork, dummy,&
+                                        dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_qlaqtr( .false., .false., n-1,work( 2, 2 ), ldwork,work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       end if
+                       go to 50
+                    end if
+                 end if
+                 sep( ks ) = scale / max( est, smlnum )
+                 if( pair )sep( ks+1 ) = sep( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_60
+           return
+     end subroutine stdlib_qtrsna
+
+     !> DTRSYL: solves the real Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**T, and  A and B are both upper quasi-
+     !> triangular. A is M-by-M and B is N-by-N; the right hand side C and
+     !> the solution X are M-by-N; and scale is an output scale factor, set
+     !> <= 1 to avoid overflow in X.
+     !> A and B must be in Schur canonical form (as returned by DHSEQR), that
+     !> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks;
+     !> each 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_qtrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*), b(ldb,*)
+           real(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: ierr, j, k, k1, k2, knext, l, l1, l2, lnext
+           real(qp) :: a11, bignum, da11, db, eps, scaloc, sgn, smin, smlnum, suml, sumr, &
+                     xnorm
+           ! Local Arrays 
+           real(qp) :: dum(1), vec(2,2), x(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'T' ) .and. .not.stdlib_lsame( trana, &
+                     'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'T' ) .and. .not.stdlib_lsame( &
+                     tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=qp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_qlange( 'M', m, m, a, lda, dum ),eps*stdlib_qlange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+               ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                        ! m                         l-1
+              ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)].
+                      ! i=k+1                       j=1
+              ! start column loop (index = l)
+              ! l1 (l2) : column index of the first (first) row of x(k,l).
+              lnext = 1
+              loop_60: do l = 1, n
+                 if( l<lnext )cycle loop_60
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l).
+                 knext = m
+                 loop_50: do k = m, 1, -1
+                    if( k>knext )cycle loop_50
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_qlaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_qlaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_qlasy2( .false., .false., isgn, 2, 2,a( k1, k1 ), lda, b( l1, &
+                                 l1 ), ldb, vec,2, scaloc, x, 2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_50
+              end do loop_60
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**t *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1        t                    l-1
+                ! r(k,l) = sum [a(i,k)**t*x(i,l)] +isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                          j=1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = 1
+              loop_120: do l = 1, n
+                 if( l<lnext )cycle loop_120
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_110: do k = 1, m
+                    if( k<knext )cycle loop_110
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_qlaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_qlaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_qlasy2( .true., .false., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_110
+              end do loop_120
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**t*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! top-right corner column by column by
+                 ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                           ! k-1                            n
+                  ! r(k,l) = sum [a(i,k)**t*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                           ! i=1                          j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_180: do l = n, 1, -1
+                 if( l>lnext )cycle loop_180
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_170: do k = 1, m
+                    if( k<knext )cycle loop_170
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_qlaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_qlaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_qlasy2( .true., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, l1 &
+                                 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_170
+              end do loop_180
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-right corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                            ! m                          n
+                  ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                          ! i=k+1                      j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_240: do l = n, 1, -1
+                 if( l>lnext )cycle loop_240
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = m
+                 loop_230: do k = m, 1, -1
+                    if( k>knext )cycle loop_230
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_qlaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_qdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_qlaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_qdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_qdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_qlasy2( .false., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_qscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_230
+              end do loop_240
+           end if
+           return
+     end subroutine stdlib_qtrsyl
+
+     !> DTRTI2: computes the inverse of a real upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_qtrti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_qtrmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_qscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_qtrmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_qscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_qtrti2
+
+     !> DTRTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_qtrtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'DTRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_qtrti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_qtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, one, a, lda,&
+                               a( 1, j ), lda )
+                    call stdlib_qtrsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -one, a( j,&
+                               j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_qtrti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_qtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, one,&
+                                  a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_qtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_qtrti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_qtrtri
+
+     !> DTRTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_qtrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           call stdlib_qtrsm( 'LEFT', uplo, trans, diag, n, nrhs, one, a, lda, b,ldb )
+           return
+     end subroutine stdlib_qtrtrs
+
+     !> DTRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_qtrttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(qp), intent(in) :: a(0:lda-1,0:*)
+           real(qp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 arf( 0 ) = a( 0, 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = a( n2+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = a( j-n1, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = a( n2+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = a( k+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = a( j-k, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = a( k+1+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_qtrttf
+
+     !> DTRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_qtrttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_qtrttp
+
+     !> DTZRZF: reduces the M-by-N ( M<=N ) real upper trapezoidal matrix A
+     !> to upper triangular form by means of orthogonal transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N orthogonal matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_qtzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'DGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'DTZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'DGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'DGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_qlatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_qlarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_qlarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_qlatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_qtzrzf
+
+     !> DZSUM1: takes the sum of the absolute values of a complex
+     !> vector and returns a quad precision result.
+     !> Based on DZASUM from the Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure real(qp) function stdlib_qzsum1( n, cx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           real(qp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_qzsum1 = zero
+           stemp = zero
+           if( n<=0 )return
+           if( incx==1 )go to 20
+           ! code for increment not equal to 1
+           nincx = n*incx
+           do i = 1, nincx, incx
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_qzsum1 = stemp
+           return
+           ! code for increment equal to 1
+           20 continue
+           do i = 1, n
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_qzsum1 = stemp
+           return
+     end function stdlib_qzsum1
+
+     !> DLAG2Q: converts a SINGLE PRECISION matrix, SA, to a DOUBLE
+     !> PRECISION matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_qlag2q( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: sa(ldsa,*)
+           real(qp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_qlag2q
+
+
+
+end module stdlib_linalg_lapack_q
+#:endif
diff --git a/src/stdlib_linalg_lapack_s.fypp b/src/stdlib_linalg_lapack_s.fypp
new file mode 100644
index 000000000..51897399a
--- /dev/null
+++ b/src/stdlib_linalg_lapack_s.fypp
@@ -0,0 +1,84837 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack_s
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_sbbcsd
+     public :: stdlib_sbdsdc
+     public :: stdlib_sbdsqr
+     public :: stdlib_scsum1
+     public :: stdlib_sdisna
+     public :: stdlib_sgbbrd
+     public :: stdlib_sgbcon
+     public :: stdlib_sgbequ
+     public :: stdlib_sgbequb
+     public :: stdlib_sgbrfs
+     public :: stdlib_sgbsv
+     public :: stdlib_sgbsvx
+     public :: stdlib_sgbtf2
+     public :: stdlib_sgbtrf
+     public :: stdlib_sgbtrs
+     public :: stdlib_sgebak
+     public :: stdlib_sgebal
+     public :: stdlib_sgebd2
+     public :: stdlib_sgebrd
+     public :: stdlib_sgecon
+     public :: stdlib_sgeequ
+     public :: stdlib_sgeequb
+     public :: stdlib_sgees
+     public :: stdlib_sgeesx
+     public :: stdlib_sgeev
+     public :: stdlib_sgeevx
+     public :: stdlib_sgehd2
+     public :: stdlib_sgehrd
+     public :: stdlib_sgejsv
+     public :: stdlib_sgelq
+     public :: stdlib_sgelq2
+     public :: stdlib_sgelqf
+     public :: stdlib_sgelqt
+     public :: stdlib_sgelqt3
+     public :: stdlib_sgels
+     public :: stdlib_sgelsd
+     public :: stdlib_sgelss
+     public :: stdlib_sgelsy
+     public :: stdlib_sgemlq
+     public :: stdlib_sgemlqt
+     public :: stdlib_sgemqr
+     public :: stdlib_sgemqrt
+     public :: stdlib_sgeql2
+     public :: stdlib_sgeqlf
+     public :: stdlib_sgeqp3
+     public :: stdlib_sgeqr
+     public :: stdlib_sgeqr2
+     public :: stdlib_sgeqr2p
+     public :: stdlib_sgeqrf
+     public :: stdlib_sgeqrfp
+     public :: stdlib_sgeqrt
+     public :: stdlib_sgeqrt2
+     public :: stdlib_sgeqrt3
+     public :: stdlib_sgerfs
+     public :: stdlib_sgerq2
+     public :: stdlib_sgerqf
+     public :: stdlib_sgesc2
+     public :: stdlib_sgesdd
+     public :: stdlib_sgesv
+     public :: stdlib_sgesvd
+     public :: stdlib_sgesvdq
+     public :: stdlib_sgesvj
+     public :: stdlib_sgesvx
+     public :: stdlib_sgetc2
+     public :: stdlib_sgetf2
+     public :: stdlib_sgetrf
+     public :: stdlib_sgetrf2
+     public :: stdlib_sgetri
+     public :: stdlib_sgetrs
+     public :: stdlib_sgetsls
+     public :: stdlib_sgetsqrhrt
+     public :: stdlib_sggbak
+     public :: stdlib_sggbal
+     public :: stdlib_sgges
+     public :: stdlib_sgges3
+     public :: stdlib_sggesx
+     public :: stdlib_sggev
+     public :: stdlib_sggev3
+     public :: stdlib_sggevx
+     public :: stdlib_sggglm
+     public :: stdlib_sgghd3
+     public :: stdlib_sgghrd
+     public :: stdlib_sgglse
+     public :: stdlib_sggqrf
+     public :: stdlib_sggrqf
+     public :: stdlib_sgsvj0
+     public :: stdlib_sgsvj1
+     public :: stdlib_sgtcon
+     public :: stdlib_sgtrfs
+     public :: stdlib_sgtsv
+     public :: stdlib_sgtsvx
+     public :: stdlib_sgttrf
+     public :: stdlib_sgttrs
+     public :: stdlib_sgtts2
+     public :: stdlib_shgeqz
+     public :: stdlib_shsein
+     public :: stdlib_shseqr
+     public :: stdlib_sisnan
+     public :: stdlib_sla_gbamv
+     public :: stdlib_sla_gbrcond
+     public :: stdlib_sla_gbrpvgrw
+     public :: stdlib_sla_geamv
+     public :: stdlib_sla_gercond
+     public :: stdlib_sla_gerpvgrw
+     public :: stdlib_sla_lin_berr
+     public :: stdlib_sla_porcond
+     public :: stdlib_sla_porpvgrw
+     public :: stdlib_sla_syamv
+     public :: stdlib_sla_syrcond
+     public :: stdlib_sla_syrpvgrw
+     public :: stdlib_sla_wwaddw
+     public :: stdlib_slabad
+     public :: stdlib_slabrd
+     public :: stdlib_slacn2
+     public :: stdlib_slacon
+     public :: stdlib_slacpy
+     public :: stdlib_sladiv
+     public :: stdlib_sladiv1
+     public :: stdlib_sladiv2
+     public :: stdlib_slae2
+     public :: stdlib_slaebz
+     public :: stdlib_slaed0
+     public :: stdlib_slaed1
+     public :: stdlib_slaed2
+     public :: stdlib_slaed3
+     public :: stdlib_slaed4
+     public :: stdlib_slaed5
+     public :: stdlib_slaed6
+     public :: stdlib_slaed7
+     public :: stdlib_slaed8
+     public :: stdlib_slaed9
+     public :: stdlib_slaeda
+     public :: stdlib_slaein
+     public :: stdlib_slaev2
+     public :: stdlib_slaexc
+     public :: stdlib_slag2
+     public :: stdlib_slag2d
+     public :: stdlib_slags2
+     public :: stdlib_slagtf
+     public :: stdlib_slagtm
+     public :: stdlib_slagts
+     public :: stdlib_slagv2
+     public :: stdlib_slahqr
+     public :: stdlib_slahr2
+     public :: stdlib_slaic1
+     public :: stdlib_slaisnan
+     public :: stdlib_slaln2
+     public :: stdlib_slals0
+     public :: stdlib_slalsa
+     public :: stdlib_slalsd
+     public :: stdlib_slamch
+     public :: stdlib_slamc3
+     public :: stdlib_slamrg
+     public :: stdlib_slamswlq
+     public :: stdlib_slamtsqr
+     public :: stdlib_slaneg
+     public :: stdlib_slangb
+     public :: stdlib_slange
+     public :: stdlib_slangt
+     public :: stdlib_slanhs
+     public :: stdlib_slansb
+     public :: stdlib_slansf
+     public :: stdlib_slansp
+     public :: stdlib_slanst
+     public :: stdlib_slansy
+     public :: stdlib_slantb
+     public :: stdlib_slantp
+     public :: stdlib_slantr
+     public :: stdlib_slanv2
+     public :: stdlib_slaorhr_col_getrfnp
+     public :: stdlib_slaorhr_col_getrfnp2
+     public :: stdlib_slapll
+     public :: stdlib_slapmr
+     public :: stdlib_slapmt
+     public :: stdlib_slapy2
+     public :: stdlib_slapy3
+     public :: stdlib_slaqgb
+     public :: stdlib_slaqge
+     public :: stdlib_slaqp2
+     public :: stdlib_slaqps
+     public :: stdlib_slaqr0
+     public :: stdlib_slaqr1
+     public :: stdlib_slaqr2
+     public :: stdlib_slaqr3
+     public :: stdlib_slaqr4
+     public :: stdlib_slaqr5
+     public :: stdlib_slaqsb
+     public :: stdlib_slaqsp
+     public :: stdlib_slaqsy
+     public :: stdlib_slaqtr
+     public :: stdlib_slaqz0
+     public :: stdlib_slaqz1
+     public :: stdlib_slaqz2
+     public :: stdlib_slaqz3
+     public :: stdlib_slaqz4
+     public :: stdlib_slar1v
+     public :: stdlib_slar2v
+     public :: stdlib_slarf
+     public :: stdlib_slarfb
+     public :: stdlib_slarfb_gett
+     public :: stdlib_slarfg
+     public :: stdlib_slarfgp
+     public :: stdlib_slarft
+     public :: stdlib_slarfx
+     public :: stdlib_slarfy
+     public :: stdlib_slargv
+     public :: stdlib_slarnv
+     public :: stdlib_slarra
+     public :: stdlib_slarrb
+     public :: stdlib_slarrc
+     public :: stdlib_slarrd
+     public :: stdlib_slarre
+     public :: stdlib_slarrf
+     public :: stdlib_slarrj
+     public :: stdlib_slarrk
+     public :: stdlib_slarrr
+     public :: stdlib_slarrv
+     public :: stdlib_slartg
+     public :: stdlib_slartgp
+     public :: stdlib_slartgs
+     public :: stdlib_slartv
+     public :: stdlib_slaruv
+     public :: stdlib_slarz
+     public :: stdlib_slarzb
+     public :: stdlib_slarzt
+     public :: stdlib_slas2
+     public :: stdlib_slascl
+     public :: stdlib_slasd0
+     public :: stdlib_slasd1
+     public :: stdlib_slasd2
+     public :: stdlib_slasd3
+     public :: stdlib_slasd4
+     public :: stdlib_slasd5
+     public :: stdlib_slasd6
+     public :: stdlib_slasd7
+     public :: stdlib_slasd8
+     public :: stdlib_slasda
+     public :: stdlib_slasdq
+     public :: stdlib_slasdt
+     public :: stdlib_slaset
+     public :: stdlib_slasq1
+     public :: stdlib_slasq2
+     public :: stdlib_slasq3
+     public :: stdlib_slasq4
+     public :: stdlib_slasq5
+     public :: stdlib_slasq6
+     public :: stdlib_slasr
+     public :: stdlib_slasrt
+     public :: stdlib_slassq
+     public :: stdlib_slasv2
+     public :: stdlib_slaswlq
+     public :: stdlib_slaswp
+     public :: stdlib_slasy2
+     public :: stdlib_slasyf
+     public :: stdlib_slasyf_aa
+     public :: stdlib_slasyf_rk
+     public :: stdlib_slasyf_rook
+     public :: stdlib_slatbs
+     public :: stdlib_slatdf
+     public :: stdlib_slatps
+     public :: stdlib_slatrd
+     public :: stdlib_slatrs
+     public :: stdlib_slatrz
+     public :: stdlib_slatsqr
+     public :: stdlib_slauu2
+     public :: stdlib_slauum
+     public :: stdlib_sopgtr
+     public :: stdlib_sopmtr
+     public :: stdlib_sorbdb
+     public :: stdlib_sorbdb1
+     public :: stdlib_sorbdb2
+     public :: stdlib_sorbdb3
+     public :: stdlib_sorbdb4
+     public :: stdlib_sorbdb5
+     public :: stdlib_sorbdb6
+     public :: stdlib_sorcsd
+     public :: stdlib_sorcsd2by1
+     public :: stdlib_sorg2l
+     public :: stdlib_sorg2r
+     public :: stdlib_sorgbr
+     public :: stdlib_sorghr
+     public :: stdlib_sorgl2
+     public :: stdlib_sorglq
+     public :: stdlib_sorgql
+     public :: stdlib_sorgqr
+     public :: stdlib_sorgr2
+     public :: stdlib_sorgrq
+     public :: stdlib_sorgtr
+     public :: stdlib_sorgtsqr
+     public :: stdlib_sorgtsqr_row
+     public :: stdlib_sorhr_col
+     public :: stdlib_sorm22
+     public :: stdlib_sorm2l
+     public :: stdlib_sorm2r
+     public :: stdlib_sormbr
+     public :: stdlib_sormhr
+     public :: stdlib_sorml2
+     public :: stdlib_sormlq
+     public :: stdlib_sormql
+     public :: stdlib_sormqr
+     public :: stdlib_sormr2
+     public :: stdlib_sormr3
+     public :: stdlib_sormrq
+     public :: stdlib_sormrz
+     public :: stdlib_sormtr
+     public :: stdlib_spbcon
+     public :: stdlib_spbequ
+     public :: stdlib_spbrfs
+     public :: stdlib_spbstf
+     public :: stdlib_spbsv
+     public :: stdlib_spbsvx
+     public :: stdlib_spbtf2
+     public :: stdlib_spbtrf
+     public :: stdlib_spbtrs
+     public :: stdlib_spftrf
+     public :: stdlib_spftri
+     public :: stdlib_spftrs
+     public :: stdlib_spocon
+     public :: stdlib_spoequ
+     public :: stdlib_spoequb
+     public :: stdlib_sporfs
+     public :: stdlib_sposv
+     public :: stdlib_sposvx
+     public :: stdlib_spotf2
+     public :: stdlib_spotrf
+     public :: stdlib_spotrf2
+     public :: stdlib_spotri
+     public :: stdlib_spotrs
+     public :: stdlib_sppcon
+     public :: stdlib_sppequ
+     public :: stdlib_spprfs
+     public :: stdlib_sppsv
+     public :: stdlib_sppsvx
+     public :: stdlib_spptrf
+     public :: stdlib_spptri
+     public :: stdlib_spptrs
+     public :: stdlib_spstf2
+     public :: stdlib_spstrf
+     public :: stdlib_sptcon
+     public :: stdlib_spteqr
+     public :: stdlib_sptrfs
+     public :: stdlib_sptsv
+     public :: stdlib_sptsvx
+     public :: stdlib_spttrf
+     public :: stdlib_spttrs
+     public :: stdlib_sptts2
+     public :: stdlib_srscl
+     public :: stdlib_ssb2st_kernels
+     public :: stdlib_ssbev
+     public :: stdlib_ssbevd
+     public :: stdlib_ssbevx
+     public :: stdlib_ssbgst
+     public :: stdlib_ssbgv
+     public :: stdlib_ssbgvd
+     public :: stdlib_ssbgvx
+     public :: stdlib_ssbtrd
+     public :: stdlib_ssfrk
+     public :: stdlib_sspcon
+     public :: stdlib_sspev
+     public :: stdlib_sspevd
+     public :: stdlib_sspevx
+     public :: stdlib_sspgst
+     public :: stdlib_sspgv
+     public :: stdlib_sspgvd
+     public :: stdlib_sspgvx
+     public :: stdlib_ssprfs
+     public :: stdlib_sspsv
+     public :: stdlib_sspsvx
+     public :: stdlib_ssptrd
+     public :: stdlib_ssptrf
+     public :: stdlib_ssptri
+     public :: stdlib_ssptrs
+     public :: stdlib_sstebz
+     public :: stdlib_sstedc
+     public :: stdlib_sstegr
+     public :: stdlib_sstein
+     public :: stdlib_sstemr
+     public :: stdlib_ssteqr
+     public :: stdlib_ssterf
+     public :: stdlib_sstev
+     public :: stdlib_sstevd
+     public :: stdlib_sstevr
+     public :: stdlib_sstevx
+     public :: stdlib_ssycon
+     public :: stdlib_ssycon_rook
+     public :: stdlib_ssyconv
+     public :: stdlib_ssyconvf
+     public :: stdlib_ssyconvf_rook
+     public :: stdlib_ssyequb
+     public :: stdlib_ssyev
+     public :: stdlib_ssyevd
+     public :: stdlib_ssyevr
+     public :: stdlib_ssyevx
+     public :: stdlib_ssygs2
+     public :: stdlib_ssygst
+     public :: stdlib_ssygv
+     public :: stdlib_ssygvd
+     public :: stdlib_ssygvx
+     public :: stdlib_ssyrfs
+     public :: stdlib_ssysv
+     public :: stdlib_ssysv_aa
+     public :: stdlib_ssysv_rk
+     public :: stdlib_ssysv_rook
+     public :: stdlib_ssysvx
+     public :: stdlib_ssyswapr
+     public :: stdlib_ssytd2
+     public :: stdlib_ssytf2
+     public :: stdlib_ssytf2_rk
+     public :: stdlib_ssytf2_rook
+     public :: stdlib_ssytrd
+     public :: stdlib_ssytrd_sb2st
+     public :: stdlib_ssytrd_sy2sb
+     public :: stdlib_ssytrf
+     public :: stdlib_ssytrf_aa
+     public :: stdlib_ssytrf_rk
+     public :: stdlib_ssytrf_rook
+     public :: stdlib_ssytri
+     public :: stdlib_ssytri_rook
+     public :: stdlib_ssytrs
+     public :: stdlib_ssytrs2
+     public :: stdlib_ssytrs_3
+     public :: stdlib_ssytrs_aa
+     public :: stdlib_ssytrs_rook
+     public :: stdlib_stbcon
+     public :: stdlib_stbrfs
+     public :: stdlib_stbtrs
+     public :: stdlib_stfsm
+     public :: stdlib_stftri
+     public :: stdlib_stfttp
+     public :: stdlib_stfttr
+     public :: stdlib_stgevc
+     public :: stdlib_stgex2
+     public :: stdlib_stgexc
+     public :: stdlib_stgsen
+     public :: stdlib_stgsja
+     public :: stdlib_stgsna
+     public :: stdlib_stgsy2
+     public :: stdlib_stgsyl
+     public :: stdlib_stpcon
+     public :: stdlib_stplqt
+     public :: stdlib_stplqt2
+     public :: stdlib_stpmlqt
+     public :: stdlib_stpmqrt
+     public :: stdlib_stpqrt
+     public :: stdlib_stpqrt2
+     public :: stdlib_stprfb
+     public :: stdlib_stprfs
+     public :: stdlib_stptri
+     public :: stdlib_stptrs
+     public :: stdlib_stpttf
+     public :: stdlib_stpttr
+     public :: stdlib_strcon
+     public :: stdlib_strevc
+     public :: stdlib_strevc3
+     public :: stdlib_strexc
+     public :: stdlib_strrfs
+     public :: stdlib_strsen
+     public :: stdlib_strsna
+     public :: stdlib_strsyl
+     public :: stdlib_strti2
+     public :: stdlib_strtri
+     public :: stdlib_strtrs
+     public :: stdlib_strttf
+     public :: stdlib_strttp
+     public :: stdlib_stzrzf
+
+     ! 32-bit real constants 
+     real(sp),    parameter, private ::     negone = -1.00_sp
+     real(sp),    parameter, private ::       zero = 0.00_sp
+     real(sp),    parameter, private ::       half = 0.50_sp
+     real(sp),    parameter, private ::        one = 1.00_sp
+     real(sp),    parameter, private ::        two = 2.00_sp
+     real(sp),    parameter, private ::      three = 3.00_sp
+     real(sp),    parameter, private ::       four = 4.00_sp
+     real(sp),    parameter, private ::      eight = 8.00_sp
+     real(sp),    parameter, private ::        ten = 10.00_sp
+
+     ! 32-bit complex constants 
+     complex(sp), parameter, private :: czero   = ( 0.0_sp,0.0_sp)
+     complex(sp), parameter, private :: chalf   = ( 0.5_sp,0.0_sp)
+     complex(sp), parameter, private :: cone    = ( 1.0_sp,0.0_sp)
+     complex(sp), parameter, private :: cnegone = (-1.0_sp,0.0_sp)
+
+     ! 32-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(sp),    parameter, private :: rradix = real(radix(zero),sp) 
+     real(sp),    parameter, private :: ulp    = epsilon(zero) 
+     real(sp),    parameter, private :: eps    = ulp*half 
+     real(sp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(sp),    parameter, private :: safmax = one/safmin 
+     real(sp),    parameter, private :: smlnum = safmin/ulp 
+     real(sp),    parameter, private :: bignum = safmax*ulp 
+     real(sp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(sp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 32-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(sp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(sp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(sp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(sp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> SCSUM1: takes the sum of the absolute values of a complex
+     !> vector and returns a single precision result.
+     !> Based on SCASUM from the Level 1 BLAS.
+     !> The change is to use the 'genuine' absolute value.
+
+     pure real(sp)             function stdlib_scsum1( n, cx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(sp), intent(in) :: cx(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, nincx
+           real(sp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           stdlib_scsum1 = zero
+           stemp = zero
+           if( n<=0 )return
+           if( incx==1 )go to 20
+           ! code for increment not equal to 1
+           nincx = n*incx
+           do i = 1, nincx, incx
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_scsum1 = stemp
+           return
+           ! code for increment equal to 1
+           20 continue
+           do i = 1, n
+              ! next line modified.
+              stemp = stemp + abs( cx( i ) )
+           end do
+           stdlib_scsum1 = stemp
+           return
+     end function stdlib_scsum1
+
+     !> SGBTF2: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_sgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to zero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = zero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to zero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = zero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_isamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=zero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_sswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_sscal( km, one / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_sger( km, ju-j, -one, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is zero, set info to the index of the pivot
+                 ! unless a zero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_sgbtf2
+
+     !> SGBTRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by SGBTRF.
+
+     pure subroutine stdlib_sgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-one modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_sswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_sger( lm, nrhs, -one, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, 1 )&
+                              , ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_stbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else
+              ! solve a**t*x = b.
+              do i = 1, nrhs
+                 ! solve u**t*x = b, overwriting b with x.
+                 call stdlib_stbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t*x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_sgemv( 'TRANSPOSE', lm, nrhs, -one, b( j+1, 1 ),ldb, ab( kd+1, j )&
+                              , 1, one, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_sswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sgbtrs
+
+     !> SGEBAK: forms the right or left eigenvectors of a real general matrix
+     !> by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by SGEBAL.
+
+     pure subroutine stdlib_sgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: v(ldv,*)
+           real(sp), intent(in) :: scale(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(sp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_sscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_sscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_sgebak
+
+     !> SGGBAK: forms the right or left eigenvectors of a real generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> SGGBAL.
+
+     pure subroutine stdlib_sggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: lscale(*), rscale(*)
+           real(sp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_sscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_sscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = rscale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = rscale( i )
+                    if( k==i )cycle loop_60
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = lscale( i )
+                    if( k==i )cycle loop_80
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = lscale( i )
+                    if( k==i )cycle loop_100
+                    call stdlib_sswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_sggbak
+
+     !> SGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an n by n tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T*X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_sgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           if( nrhs==1 ) then
+              loop_10: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end do loop_10
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       b( i+1, 1 ) = b( i+1, 1 ) - fact*b( i, 1 )
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    temp = b( i, 1 )
+                    b( i, 1 ) = b( i+1, 1 )
+                    b( i+1, 1 ) = temp - fact*b( i+1, 1 )
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           else
+              loop_40: do i = 1, n - 2
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    ! no row interchange required
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                    dl( i ) = zero
+                 else
+                    ! interchange rows i and i+1
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    dl( i ) = du( i+1 )
+                    du( i+1 ) = -fact*dl( i )
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end do loop_40
+              if( n>1 ) then
+                 i = n - 1
+                 if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                    if( d( i )/=zero ) then
+                       fact = dl( i ) / d( i )
+                       d( i+1 ) = d( i+1 ) - fact*du( i )
+                       do j = 1, nrhs
+                          b( i+1, j ) = b( i+1, j ) - fact*b( i, j )
+                       end do
+                    else
+                       info = i
+                       return
+                    end if
+                 else
+                    fact = d( i ) / dl( i )
+                    d( i ) = dl( i )
+                    temp = d( i+1 )
+                    d( i+1 ) = du( i ) - fact*temp
+                    du( i ) = temp
+                    do j = 1, nrhs
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - fact*b( i+1, j )
+                    end do
+                 end if
+              end if
+              if( d( n )==zero ) then
+                 info = n
+                 return
+              end if
+           end if
+           ! back solve with the matrix u from the factorization.
+           if( nrhs<=2 ) then
+              j = 1
+              70 continue
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do i = n - 2, 1, -1
+                 b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                           
+              end do
+              if( j<nrhs ) then
+                 j = j + 1
+                 go to 70
+              end if
+           else
+              do j = 1, nrhs
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-dl( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_sgtsv
+
+     !> SGTTRF: computes an LU factorization of a real tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_sgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: d(*), dl(*), du(*)
+           real(sp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: fact, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( abs( d( i ) )>=abs( dl( i ) ) ) then
+                 if( d( i )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( d( i )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_sgttrf
+
+     !> SGTTS2: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by SGTTRF.
+
+     pure subroutine stdlib_sgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ip, j
+           real(sp) :: temp
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    ip = ipiv( i )
+                    temp = b( i+1-ip+i, j ) - dl( i )*b( ip, j )
+                    b( i, j ) = b( ip, j )
+                    b( i+1, j ) = temp
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                    ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 ! solve u**t*x = b.
+                 j = 1
+                 70 continue
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t*x = b.
+                 do i = n - 1, 1, -1
+                    ip = ipiv( i )
+                    temp = b( i, j ) - dl( i )*b( i+1, j )
+                    b( i, j ) = b( ip, j )
+                    b( ip, j ) = temp
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**t*x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_sgtts2
+
+     !> SLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(sp) function stdlib_sla_gbrpvgrw( n, kl, ku, ncols, ab, ldab, afb,ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( abs( ab( kd+i-j, j)), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( abs( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= 0.0_sp ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_sla_gbrpvgrw = rpvgrw
+     end function stdlib_sla_gbrpvgrw
+
+     !> SLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(sp) function stdlib_sla_gerpvgrw( n, ncols, a, lda, af, ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: amax, umax, rpvgrw
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( abs( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( abs( af( i, j ) ), umax )
+              end do
+              if ( umax /= 0.0_sp ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_sla_gerpvgrw = rpvgrw
+     end function stdlib_sla_gerpvgrw
+
+     !> SLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_sla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: x(*), y(*)
+           real(sp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_sla_wwaddw
+
+     !> SLABAD: takes as input the values computed by SLAMCH for underflow and
+     !> overflow, and returns the square root of each of these values if the
+     !> log of LARGE is sufficiently large.  This subroutine is intended to
+     !> identify machines with a large exponent range, such as the Crays, and
+     !> redefine the underflow and overflow limits to be the square roots of
+     !> the values computed by SLAMCH.  This subroutine is needed because
+     !> SLAMCH does not compensate for poor arithmetic in the upper half of
+     !> the exponent range, as is found on a Cray.
+
+     pure subroutine stdlib_slabad( small, large )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(inout) :: large, small
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: log10,sqrt
+           ! Executable Statements 
+           ! if it looks like we're on a cray, take the square root of
+           ! small and large to avoid overflow and underflow problems.
+           if( log10( large )>2000. ) then
+              small = sqrt( small )
+              large = sqrt( large )
+           end if
+           return
+     end subroutine stdlib_slabad
+
+     !> SLACN2: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_slacn2( n, v, x, isgn, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(sp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           integer(ilp), intent(inout) :: isave(3)
+           real(sp), intent(out) :: v(*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(sp) :: altsgn, estold, temp, xs
+           ! Intrinsic Functions 
+           intrinsic :: abs,nint,real
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=sp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_sasum( n, x, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_isamax( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( isave( 2 ) ) = one
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_scopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_sasum( n, v, 1 )
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 xs = one
+              else
+                 xs = -one
+              end if
+              if( nint( xs,KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              if( x(i)>=zero ) then
+                 x(i) = one
+              else
+                 x(i) = -one
+              end if
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_isamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=sp) / real( n-1,KIND=sp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_sasum( n, x, 1 ) / real( 3*n,KIND=sp) )
+           if( temp>est ) then
+              call stdlib_scopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_slacn2
+
+     !> SLACON: estimates the 1-norm of a square, real matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_slacon( n, v, x, isgn, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(sp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isgn(*)
+           real(sp), intent(out) :: v(*)
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(sp) :: altsgn, estold, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,nint,real,sign
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = one / real( n,KIND=sp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 110, 140 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 150
+           end if
+           est = stdlib_sasum( n, x, 1 )
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by transpose(a)*x.
+           40 continue
+           j = stdlib_isamax( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = zero
+           end do
+           x( j ) = one
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_scopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_sasum( n, v, 1 )
+           do i = 1, n
+              if( nint( sign( one, x( i ) ),KIND=ilp)/=isgn( i ) )go to 90
+           end do
+           ! repeated sign vector detected, hence algorithm has converged.
+           go to 120
+           90 continue
+           ! test for cycling.
+           if( est<=estold )go to 120
+           do i = 1, n
+              x( i ) = sign( one, x( i ) )
+              isgn( i ) = nint( x( i ),KIND=ilp)
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by transpose(a)*x.
+           110 continue
+           jlast = j
+           j = stdlib_isamax( n, x, 1 )
+           if( ( x( jlast )/=abs( x( j ) ) ) .and. ( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           120 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = altsgn*( one+real( i-1,KIND=sp) / real( n-1,KIND=sp) )
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           140 continue
+           temp = two*( stdlib_sasum( n, x, 1 ) / real( 3*n,KIND=sp) )
+           if( temp>est ) then
+              call stdlib_scopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           150 continue
+           kase = 0
+           return
+     end subroutine stdlib_slacon
+
+     !> SLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_slacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_slacpy
+
+
+     pure real(sp) function stdlib_sladiv2( a, b, c, d, r, t )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: a, b, c, d, r, t
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: br
+           ! Executable Statements 
+           if( r/=zero ) then
+              br = b * r
+              if( br/=zero ) then
+                 stdlib_sladiv2 = (a + br) * t
+              else
+                 stdlib_sladiv2 = a * t + (b * t) * r
+              end if
+           else
+              stdlib_sladiv2 = (a + d * (b / c)) * t
+           end if
+           return
+     end function stdlib_sladiv2
+
+     !> SLAE2:  computes the eigenvalues of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, and RT2
+     !> is the eigenvalue of smaller absolute value.
+
+     pure subroutine stdlib_slae2( a, b, c, rt1, rt2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: a, b, c
+           real(sp), intent(out) :: rt1, rt2
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: ab, acmn, acmx, adf, df, rt, sm, tb
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+           end if
+           return
+     end subroutine stdlib_slae2
+
+     !> SLAEBZ: contains the iteration loops which compute and use the
+     !> function N(w), which is the count of eigenvalues of a symmetric
+     !> tridiagonal matrix T less than or equal to its argument  w.  It
+     !> performs a choice of two types of loops:
+     !> IJOB=1, followed by
+     !> IJOB=2: It takes as input a list of intervals and returns a list of
+     !> sufficiently small intervals whose union contains the same
+     !> eigenvalues as the union of the original intervals.
+     !> The input intervals are (AB(j,1),AB(j,2)], j=1,...,MINP.
+     !> The output interval (AB(j,1),AB(j,2)] will contain
+     !> eigenvalues NAB(j,1)+1,...,NAB(j,2), where 1 <= j <= MOUT.
+     !> IJOB=3: It performs a binary search in each input interval
+     !> (AB(j,1),AB(j,2)] for a point  w(j)  such that
+     !> N(w(j))=NVAL(j), and uses  C(j)  as the starting point of
+     !> the search.  If such a w(j) is found, then on output
+     !> AB(j,1)=AB(j,2)=w.  If no such w(j) is found, then on output
+     !> (AB(j,1),AB(j,2)] will be a small interval containing the
+     !> point where N(w) jumps through NVAL(j), unless that point
+     !> lies outside the initial interval.
+     !> Note that the intervals are in all cases half-open intervals,
+     !> i.e., of the form  (a,b] , which includes  b  but not  a .
+     !> To avoid underflow, the matrix should be scaled so that its largest
+     !> element is no greater than  overflow**(1/2) * underflow**(1/4)
+     !> in absolute value.  To assure the most accurate computation
+     !> of small eigenvalues, the matrix should be scaled to be
+     !> not much smaller than that, either.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966
+     !> Note: the arguments are, in general, *not* checked for unreasonable
+     !> values.
+
+     pure subroutine stdlib_slaebz( ijob, nitmax, n, mmax, minp, nbmin, abstol,reltol, pivmin, d, &
+               e, e2, nval, ab, c, mout,nab, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, minp, mmax, n, nbmin, nitmax
+           integer(ilp), intent(out) :: info, mout
+           real(sp), intent(in) :: abstol, pivmin, reltol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(inout) :: nab(mmax,*), nval(*)
+           real(sp), intent(inout) :: ab(mmax,*), c(*)
+           real(sp), intent(in) :: d(*), e(*), e2(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itmp1, itmp2, j, ji, jit, jp, kf, kfnew, kl, klnew
+           real(sp) :: tmp1, tmp2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! check for errors
+           info = 0
+           if( ijob<1 .or. ijob>3 ) then
+              info = -1
+              return
+           end if
+           ! initialize nab
+           if( ijob==1 ) then
+              ! compute the number of eigenvalues in the initial intervals.
+              mout = 0
+              do ji = 1, minp
+                 do jp = 1, 2
+                    tmp1 = d( 1 ) - ab( ji, jp )
+                    if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                    nab( ji, jp ) = 0
+                    if( tmp1<=zero )nab( ji, jp ) = 1
+                    do j = 2, n
+                       tmp1 = d( j ) - e2( j-1 ) / tmp1 - ab( ji, jp )
+                       if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+                       if( tmp1<=zero )nab( ji, jp ) = nab( ji, jp ) + 1
+                    end do
+                 end do
+                 mout = mout + nab( ji, 2 ) - nab( ji, 1 )
+              end do
+              return
+           end if
+           ! initialize for loop
+           ! kf and kl have the following meaning:
+              ! intervals 1,...,kf-1 have converged.
+              ! intervals kf,...,kl  still need to be refined.
+           kf = 1
+           kl = minp
+           ! if ijob=2, initialize c.
+           ! if ijob=3, use the user-supplied starting point.
+           if( ijob==2 ) then
+              do ji = 1, minp
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+           end if
+           ! iteration loop
+           loop_130: do jit = 1, nitmax
+              ! loop over intervals
+              if( kl-kf+1>=nbmin .and. nbmin>0 ) then
+                 ! begin of parallel version of the loop
+                 do ji = kf, kl
+                    ! compute n(c), the number of eigenvalues less than c
+                    work( ji ) = d( 1 ) - c( ji )
+                    iwork( ji ) = 0
+                    if( work( ji )<=pivmin ) then
+                       iwork( ji ) = 1
+                       work( ji ) = min( work( ji ), -pivmin )
+                    end if
+                    do j = 2, n
+                       work( ji ) = d( j ) - e2( j-1 ) / work( ji ) - c( ji )
+                       if( work( ji )<=pivmin ) then
+                          iwork( ji ) = iwork( ji ) + 1
+                          work( ji ) = min( work( ji ), -pivmin )
+                       end if
+                    end do
+                 end do
+                 if( ijob<=2 ) then
+                    ! ijob=2: choose all intervals containing eigenvalues.
+                    klnew = kl
+                    loop_70: do ji = kf, kl
+                       ! insure that n(w) is monotone
+                       iwork( ji ) = min( nab( ji, 2 ),max( nab( ji, 1 ), iwork( ji ) ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( iwork( ji )==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = c( ji )
+                       else if( iwork( ji )==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = c( ji )
+                       else
+                          klnew = klnew + 1
+                          if( klnew<=mmax ) then
+                             ! eigenvalue in both intervals -- add upper to
+                             ! queue.
+                             ab( klnew, 2 ) = ab( ji, 2 )
+                             nab( klnew, 2 ) = nab( ji, 2 )
+                             ab( klnew, 1 ) = c( ji )
+                             nab( klnew, 1 ) = iwork( ji )
+                             ab( ji, 2 ) = c( ji )
+                             nab( ji, 2 ) = iwork( ji )
+                          else
+                             info = mmax + 1
+                          end if
+                       end if
+                    end do loop_70
+                    if( info/=0 )return
+                    kl = klnew
+                 else
+                    ! ijob=3: binary search.  keep only the interval containing
+                            ! w   s.t. n(w) = nval
+                    do ji = kf, kl
+                       if( iwork( ji )<=nval( ji ) ) then
+                          ab( ji, 1 ) = c( ji )
+                          nab( ji, 1 ) = iwork( ji )
+                       end if
+                       if( iwork( ji )>=nval( ji ) ) then
+                          ab( ji, 2 ) = c( ji )
+                          nab( ji, 2 ) = iwork( ji )
+                       end if
+                    end do
+                 end if
+              else
+                 ! end of parallel version of the loop
+                 ! begin of serial version of the loop
+                 klnew = kl
+                 loop_100: do ji = kf, kl
+                    ! compute n(w), the number of eigenvalues less than w
+                    tmp1 = c( ji )
+                    tmp2 = d( 1 ) - tmp1
+                    itmp1 = 0
+                    if( tmp2<=pivmin ) then
+                       itmp1 = 1
+                       tmp2 = min( tmp2, -pivmin )
+                    end if
+                    do j = 2, n
+                       tmp2 = d( j ) - e2( j-1 ) / tmp2 - tmp1
+                       if( tmp2<=pivmin ) then
+                          itmp1 = itmp1 + 1
+                          tmp2 = min( tmp2, -pivmin )
+                       end if
+                    end do
+                    if( ijob<=2 ) then
+                       ! ijob=2: choose all intervals containing eigenvalues.
+                       ! insure that n(w) is monotone
+                       itmp1 = min( nab( ji, 2 ),max( nab( ji, 1 ), itmp1 ) )
+                       ! update the queue -- add intervals if both halves
+                       ! contain eigenvalues.
+                       if( itmp1==nab( ji, 2 ) ) then
+                          ! no eigenvalue in the upper interval:
+                          ! just use the lower interval.
+                          ab( ji, 2 ) = tmp1
+                       else if( itmp1==nab( ji, 1 ) ) then
+                          ! no eigenvalue in the lower interval:
+                          ! just use the upper interval.
+                          ab( ji, 1 ) = tmp1
+                       else if( klnew<mmax ) then
+                          ! eigenvalue in both intervals -- add upper to queue.
+                          klnew = klnew + 1
+                          ab( klnew, 2 ) = ab( ji, 2 )
+                          nab( klnew, 2 ) = nab( ji, 2 )
+                          ab( klnew, 1 ) = tmp1
+                          nab( klnew, 1 ) = itmp1
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       else
+                          info = mmax + 1
+                          return
+                       end if
+                    else
+                       ! ijob=3: binary search.  keep only the interval
+                               ! containing  w  s.t. n(w) = nval
+                       if( itmp1<=nval( ji ) ) then
+                          ab( ji, 1 ) = tmp1
+                          nab( ji, 1 ) = itmp1
+                       end if
+                       if( itmp1>=nval( ji ) ) then
+                          ab( ji, 2 ) = tmp1
+                          nab( ji, 2 ) = itmp1
+                       end if
+                    end if
+                 end do loop_100
+                 kl = klnew
+              end if
+              ! check for convergence
+              kfnew = kf
+              loop_110: do ji = kf, kl
+                 tmp1 = abs( ab( ji, 2 )-ab( ji, 1 ) )
+                 tmp2 = max( abs( ab( ji, 2 ) ), abs( ab( ji, 1 ) ) )
+                 if( tmp1<max( abstol, pivmin, reltol*tmp2 ) .or.nab( ji, 1 )>=nab( ji, 2 ) ) &
+                           then
+                    ! converged -- swap with position kfnew,
+                                 ! then increment kfnew
+                    if( ji>kfnew ) then
+                       tmp1 = ab( ji, 1 )
+                       tmp2 = ab( ji, 2 )
+                       itmp1 = nab( ji, 1 )
+                       itmp2 = nab( ji, 2 )
+                       ab( ji, 1 ) = ab( kfnew, 1 )
+                       ab( ji, 2 ) = ab( kfnew, 2 )
+                       nab( ji, 1 ) = nab( kfnew, 1 )
+                       nab( ji, 2 ) = nab( kfnew, 2 )
+                       ab( kfnew, 1 ) = tmp1
+                       ab( kfnew, 2 ) = tmp2
+                       nab( kfnew, 1 ) = itmp1
+                       nab( kfnew, 2 ) = itmp2
+                       if( ijob==3 ) then
+                          itmp1 = nval( ji )
+                          nval( ji ) = nval( kfnew )
+                          nval( kfnew ) = itmp1
+                       end if
+                    end if
+                    kfnew = kfnew + 1
+                 end if
+              end do loop_110
+              kf = kfnew
+              ! choose midpoints
+              do ji = kf, kl
+                 c( ji ) = half*( ab( ji, 1 )+ab( ji, 2 ) )
+              end do
+              ! if no more intervals to refine, quit.
+              if( kf>kl )go to 140
+           end do loop_130
+           ! converged
+           140 continue
+           info = max( kl+1-kf, 0 )
+           mout = kl
+           return
+     end subroutine stdlib_slaebz
+
+     !> This subroutine computes the I-th eigenvalue of a symmetric rank-one
+     !> modification of a 2-by-2 diagonal matrix
+     !> diag( D )  +  RHO * Z * transpose(Z) .
+     !> The diagonal elements in the array D are assumed to satisfy
+     !> D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_slaed5( i, d, z, delta, rho, dlam )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(sp), intent(out) :: dlam
+           real(sp), intent(in) :: rho
+           ! Array Arguments 
+           real(sp), intent(in) :: d(2), z(2)
+           real(sp), intent(out) :: delta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: b, c, del, tau, temp, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           if( i==1 ) then
+              w = one + two*rho*( z( 2 )*z( 2 )-z( 1 )*z( 1 ) ) / del
+              if( w>zero ) then
+                 b = del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*del
+                 ! b > zero, always
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 dlam = d( 1 ) + tau
+                 delta( 1 ) = -z( 1 ) / tau
+                 delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*del
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 dlam = d( 2 ) + tau
+                 delta( 1 ) = -z( 1 ) / ( del+tau )
+                 delta( 2 ) = -z( 2 ) / tau
+              end if
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           else
+           ! now i=2
+              b = -del + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*del
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              dlam = d( 2 ) + tau
+              delta( 1 ) = -z( 1 ) / ( del+tau )
+              delta( 2 ) = -z( 2 ) / tau
+              temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              delta( 1 ) = delta( 1 ) / temp
+              delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_slaed5
+
+     !> SLAEDA: computes the Z vector corresponding to the merge step in the
+     !> CURLVLth step of the merge process with TLVLS steps for the CURPBMth
+     !> problem.
+
+     pure subroutine stdlib_slaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum,&
+                q, qptr, z, ztemp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, n, tlvls
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+           real(sp), intent(in) :: givnum(2,*), q(*)
+           real(sp), intent(out) :: z(*), ztemp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bsiz1, bsiz2, curr, i, k, mid, psiz1, psiz2, ptr, zptr1
+           ! Intrinsic Functions 
+           intrinsic :: int,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAEDA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine location of first number in second half.
+           mid = n / 2 + 1
+           ! gather last/first rows of appropriate eigenblocks into center of z
+           ptr = 1
+           ! determine location of lowest level subproblem in the full storage
+           ! scheme
+           curr = ptr + curpbm*2**curlvl + 2**( curlvl-1 ) - 1
+           ! determine size of these matrices.  we add half to the value of
+           ! the sqrt in case the machine underestimates one of these square
+           ! roots.
+           bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=sp) ),KIND=ilp)
+           bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=sp) ),KIND=ilp)
+                     
+           do k = 1, mid - bsiz1 - 1
+              z( k ) = zero
+           end do
+           call stdlib_scopy( bsiz1, q( qptr( curr )+bsiz1-1 ), bsiz1,z( mid-bsiz1 ), 1 )
+           call stdlib_scopy( bsiz2, q( qptr( curr+1 ) ), bsiz2, z( mid ), 1 )
+           do k = mid + bsiz2, n
+              z( k ) = zero
+           end do
+           ! loop through remaining levels 1 -> curlvl applying the givens
+           ! rotations and permutation and then multiplying the center matrices
+           ! against the current z.
+           ptr = 2**tlvls + 1
+           loop_70: do k = 1, curlvl - 1
+              curr = ptr + curpbm*2**( curlvl-k ) + 2**( curlvl-k-1 ) - 1
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              zptr1 = mid - psiz1
+             ! apply givens at curr and curr+1
+              do i = givptr( curr ), givptr( curr+1 ) - 1
+                 call stdlib_srot( 1, z( zptr1+givcol( 1, i )-1 ), 1,z( zptr1+givcol( 2, i )-1 ), &
+                           1, givnum( 1, i ),givnum( 2, i ) )
+              end do
+              do i = givptr( curr+1 ), givptr( curr+2 ) - 1
+                 call stdlib_srot( 1, z( mid-1+givcol( 1, i ) ), 1,z( mid-1+givcol( 2, i ) ), 1, &
+                           givnum( 1, i ),givnum( 2, i ) )
+              end do
+              psiz1 = prmptr( curr+1 ) - prmptr( curr )
+              psiz2 = prmptr( curr+2 ) - prmptr( curr+1 )
+              do i = 0, psiz1 - 1
+                 ztemp( i+1 ) = z( zptr1+perm( prmptr( curr )+i )-1 )
+              end do
+              do i = 0, psiz2 - 1
+                 ztemp( psiz1+i+1 ) = z( mid+perm( prmptr( curr+1 )+i )-1 )
+              end do
+              ! multiply blocks at curr and curr+1
+              ! determine size of these matrices.  we add half to the value of
+              ! the sqrt in case the machine underestimates one of these
+              ! square roots.
+              bsiz1 = int( half+sqrt( real( qptr( curr+1 )-qptr( curr ),KIND=sp) ),KIND=ilp)
+                        
+              bsiz2 = int( half+sqrt( real( qptr( curr+2 )-qptr( curr+1 ),KIND=sp) ),KIND=ilp)
+                        
+              if( bsiz1>0 ) then
+                 call stdlib_sgemv( 'T', bsiz1, bsiz1, one, q( qptr( curr ) ),bsiz1, ztemp( 1 ), &
+                           1, zero, z( zptr1 ), 1 )
+              end if
+              call stdlib_scopy( psiz1-bsiz1, ztemp( bsiz1+1 ), 1, z( zptr1+bsiz1 ),1 )
+              if( bsiz2>0 ) then
+                 call stdlib_sgemv( 'T', bsiz2, bsiz2, one, q( qptr( curr+1 ) ),bsiz2, ztemp( &
+                           psiz1+1 ), 1, zero, z( mid ), 1 )
+              end if
+              call stdlib_scopy( psiz2-bsiz2, ztemp( psiz1+bsiz2+1 ), 1,z( mid+bsiz2 ), 1 )
+                        
+              ptr = ptr + 2**( tlvls-k )
+           end do loop_70
+           return
+     end subroutine stdlib_slaeda
+
+     !> SLAEV2: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> [  A   B  ]
+     !> [  B   C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  SN1 ] [  A   B  ] [ CS1 -SN1 ]  =  [ RT1  0  ]
+     !> [-SN1  CS1 ] [  B   C  ] [ SN1  CS1 ]     [  0  RT2 ].
+
+     pure subroutine stdlib_slaev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: a, b, c
+           real(sp), intent(out) :: cs1, rt1, rt2, sn1
+       ! =====================================================================
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: sgn1, sgn2
+           real(sp) :: ab, acmn, acmx, acs, adf, cs, ct, df, rt, sm, tb, tn
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ! compute the eigenvalues
+           sm = a + c
+           df = a - c
+           adf = abs( df )
+           tb = b + b
+           ab = abs( tb )
+           if( abs( a )>abs( c ) ) then
+              acmx = a
+              acmn = c
+           else
+              acmx = c
+              acmn = a
+           end if
+           if( adf>ab ) then
+              rt = adf*sqrt( one+( ab / adf )**2 )
+           else if( adf<ab ) then
+              rt = ab*sqrt( one+( adf / ab )**2 )
+           else
+              ! includes case ab=adf=0
+              rt = ab*sqrt( two )
+           end if
+           if( sm<zero ) then
+              rt1 = half*( sm-rt )
+              sgn1 = -1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else if( sm>zero ) then
+              rt1 = half*( sm+rt )
+              sgn1 = 1
+              ! order of execution important.
+              ! to get fully accurate smaller eigenvalue,
+              ! next line needs to be executed in higher precision.
+              rt2 = ( acmx / rt1 )*acmn - ( b / rt1 )*b
+           else
+              ! includes case rt1 = rt2 = 0
+              rt1 = half*rt
+              rt2 = -half*rt
+              sgn1 = 1
+           end if
+           ! compute the eigenvector
+           if( df>=zero ) then
+              cs = df + rt
+              sgn2 = 1
+           else
+              cs = df - rt
+              sgn2 = -1
+           end if
+           acs = abs( cs )
+           if( acs>ab ) then
+              ct = -tb / cs
+              sn1 = one / sqrt( one+ct*ct )
+              cs1 = ct*sn1
+           else
+              if( ab==zero ) then
+                 cs1 = one
+                 sn1 = zero
+              else
+                 tn = -cs / tb
+                 cs1 = one / sqrt( one+tn*tn )
+                 sn1 = tn*cs1
+              end if
+           end if
+           if( sgn1==sgn2 ) then
+              tn = cs1
+              cs1 = -sn1
+              sn1 = tn
+           end if
+           return
+     end subroutine stdlib_slaev2
+
+     !> SLAG2: computes the eigenvalues of a 2 x 2 generalized eigenvalue
+     !> problem  A - w B, with scaling as necessary to avoid over-/underflow.
+     !> The scaling factor "s" results in a modified eigenvalue equation
+     !> s A - w B
+     !> where  s  is a non-negative scaling factor chosen so that  w,  w B,
+     !> and  s A  do not overflow and, if possible, do not underflow, either.
+
+     pure subroutine stdlib_slag2( a, lda, b, ldb, safmin, scale1, scale2, wr1,wr2, wi )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(sp), intent(in) :: safmin
+           real(sp), intent(out) :: scale1, scale2, wi, wr1, wr2
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: fuzzy1 = one+1.0e-5_sp
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: a11, a12, a21, a22, abi22, anorm, as11, as12, as22, ascale, b11, b12, b22, &
+           binv11, binv22, bmin, bnorm, bscale, bsize, c1, c2, c3, c4, c5, diff, discr, pp, qq, r,&
+            rtmax, rtmin, s1, s2, safmax, shift, ss, sum, wabs, wbig, wdet, wscale, wsize, &
+                      wsmall
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           rtmin = sqrt( safmin )
+           rtmax = one / rtmin
+           safmax = one / safmin
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a11 = ascale*a( 1, 1 )
+           a21 = ascale*a( 2, 1 )
+           a12 = ascale*a( 1, 2 )
+           a22 = ascale*a( 2, 2 )
+           ! perturb b if necessary to insure non-singularity
+           b11 = b( 1, 1 )
+           b12 = b( 1, 2 )
+           b22 = b( 2, 2 )
+           bmin = rtmin*max( abs( b11 ), abs( b12 ), abs( b22 ), rtmin )
+           if( abs( b11 )<bmin )b11 = sign( bmin, b11 )
+           if( abs( b22 )<bmin )b22 = sign( bmin, b22 )
+           ! scale b
+           bnorm = max( abs( b11 ), abs( b12 )+abs( b22 ), safmin )
+           bsize = max( abs( b11 ), abs( b22 ) )
+           bscale = one / bsize
+           b11 = b11*bscale
+           b12 = b12*bscale
+           b22 = b22*bscale
+           ! compute larger eigenvalue by method described by c. van loan
+           ! ( as is a shifted by -shift*b )
+           binv11 = one / b11
+           binv22 = one / b22
+           s1 = a11*binv11
+           s2 = a22*binv22
+           if( abs( s1 )<=abs( s2 ) ) then
+              as12 = a12 - s1*b12
+              as22 = a22 - s1*b22
+              ss = a21*( binv11*binv22 )
+              abi22 = as22*binv22 - ss*b12
+              pp = half*abi22
+              shift = s1
+           else
+              as12 = a12 - s2*b12
+              as11 = a11 - s2*b11
+              ss = a21*( binv11*binv22 )
+              abi22 = -ss*b12
+              pp = half*( as11*binv11+abi22 )
+              shift = s2
+           end if
+           qq = ss*as12
+           if( abs( pp*rtmin )>=one ) then
+              discr = ( rtmin*pp )**2 + qq*safmin
+              r = sqrt( abs( discr ) )*rtmax
+           else
+              if( pp**2+abs( qq )<=safmin ) then
+                 discr = ( rtmax*pp )**2 + qq*safmax
+                 r = sqrt( abs( discr ) )*rtmin
+              else
+                 discr = pp**2 + qq
+                 r = sqrt( abs( discr ) )
+              end if
+           end if
+           ! note: the test of r in the following if is to cover the case when
+                 ! discr is small and negative and is flushed to zero during
+                 ! the calculation of r.  on machines which have a consistent
+                 ! flush-to-zero threshold and handle numbers above that
+                 ! threshold correctly, it would not be necessary.
+           if( discr>=zero .or. r==zero ) then
+              sum = pp + sign( r, pp )
+              diff = pp - sign( r, pp )
+              wbig = shift + sum
+              ! compute smaller eigenvalue
+              wsmall = shift + diff
+              if( half*abs( wbig )>max( abs( wsmall ), safmin ) ) then
+                 wdet = ( a11*a22-a12*a21 )*( binv11*binv22 )
+                 wsmall = wdet / wbig
+              end if
+              ! choose (real) eigenvalue closest to 2,2 element of a*b**(-1)
+              ! for wr1.
+              if( pp>abi22 ) then
+                 wr1 = min( wbig, wsmall )
+                 wr2 = max( wbig, wsmall )
+              else
+                 wr1 = max( wbig, wsmall )
+                 wr2 = min( wbig, wsmall )
+              end if
+              wi = zero
+           else
+              ! complex eigenvalues
+              wr1 = shift + pp
+              wr2 = wr1
+              wi = r
+           end if
+           ! further scaling to avoid underflow and overflow in computing
+           ! scale1 and overflow in computing w*b.
+           ! this scale factor (wscale) is bounded from above using c1 and c2,
+           ! and from below using c3 and c4.
+              ! c1 implements the condition  s a  must never overflow.
+              ! c2 implements the condition  w b  must never overflow.
+              ! c3, with c2,
+                 ! implement the condition that s a - w b must never overflow.
+              ! c4 implements the condition  s    should not underflow.
+              ! c5 implements the condition  max(s,|w|) should be at least 2.
+           c1 = bsize*( safmin*max( one, ascale ) )
+           c2 = safmin*max( one, bnorm )
+           c3 = bsize*safmin
+           if( ascale<=one .and. bsize<=one ) then
+              c4 = min( one, ( ascale / safmin )*bsize )
+           else
+              c4 = one
+           end if
+           if( ascale<=one .or. bsize<=one ) then
+              c5 = min( one, ascale*bsize )
+           else
+              c5 = one
+           end if
+           ! scale first eigenvalue
+           wabs = abs( wr1 ) + abs( wi )
+           wsize = max( safmin, c1, fuzzy1*( wabs*c2+c3 ),min( c4, half*max( wabs, c5 ) ) )
+                     
+           if( wsize/=one ) then
+              wscale = one / wsize
+              if( wsize>one ) then
+                 scale1 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+              else
+                 scale1 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+              end if
+              wr1 = wr1*wscale
+              if( wi/=zero ) then
+                 wi = wi*wscale
+                 wr2 = wr1
+                 scale2 = scale1
+              end if
+           else
+              scale1 = ascale*bsize
+              scale2 = scale1
+           end if
+           ! scale second eigenvalue (if real)
+           if( wi==zero ) then
+              wsize = max( safmin, c1, fuzzy1*( abs( wr2 )*c2+c3 ),min( c4, half*max( abs( wr2 ), &
+                        c5 ) ) )
+              if( wsize/=one ) then
+                 wscale = one / wsize
+                 if( wsize>one ) then
+                    scale2 = ( max( ascale, bsize )*wscale )*min( ascale, bsize )
+                 else
+                    scale2 = ( min( ascale, bsize )*wscale )*max( ascale, bsize )
+                 end if
+                 wr2 = wr2*wscale
+              else
+                 scale2 = ascale*bsize
+              end if
+           end if
+           return
+     end subroutine stdlib_slag2
+
+     !> SLAG2D: converts a SINGLE PRECISION matrix, SA, to a DOUBLE
+     !> PRECISION matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_slag2d( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: sa(ldsa,*)
+           real(dp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_slag2d
+
+     !> SLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_slagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b + a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else
+                 ! compute b := b - a**t*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_slagtm
+
+     !> This routine is not for general use.  It exists solely to avoid
+     !> over-optimization in SISNAN.
+     !> SLAISNAN: checks for NaNs by comparing its two arguments for
+     !> inequality.  NaN is the only floating-point value where NaN != NaN
+     !> returns .TRUE.  To check for NaNs, pass the same variable as both
+     !> arguments.
+     !> A compiler must assume that the two arguments are
+     !> not the same variable, and the test will not be optimized away.
+     !> Interprocedural or whole-program optimization may delete this
+     !> test.  The ISNAN functions will be replaced by the correct
+     !> Fortran 03 intrinsic once the intrinsic is widely available.
+
+     pure logical(lk) function stdlib_slaisnan( sin1, sin2 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sin1, sin2
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_slaisnan = (sin1/=sin2)
+           return
+     end function stdlib_slaisnan
+
+     !> SLAMCH: determines single precision machine parameters.
+
+     pure real(sp)             function stdlib_slamch( cmach )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: cmach
+       ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: rnd, eps, sfmin, small, rmach
+           ! Intrinsic Functions 
+           intrinsic :: digits,epsilon,huge,maxexponent,minexponent,radix,tiny
+           ! Executable Statements 
+           ! assume rounding, not chopping. always.
+           rnd = one
+           if( one==rnd ) then
+              eps = epsilon(zero) * 0.5
+           else
+              eps = epsilon(zero)
+           end if
+           if( stdlib_lsame( cmach, 'E' ) ) then
+              rmach = eps
+           else if( stdlib_lsame( cmach, 'S' ) ) then
+              sfmin = tiny(zero)
+              small = one / huge(zero)
+              if( small>=sfmin ) then
+                 ! use small plus a bit, to avoid the possibility of rounding
+                 ! causing overflow when computing  1/sfmin.
+                 sfmin = small*( one+eps )
+              end if
+              rmach = sfmin
+           else if( stdlib_lsame( cmach, 'B' ) ) then
+              rmach = radix(zero)
+           else if( stdlib_lsame( cmach, 'P' ) ) then
+              rmach = eps * radix(zero)
+           else if( stdlib_lsame( cmach, 'N' ) ) then
+              rmach = digits(zero)
+           else if( stdlib_lsame( cmach, 'R' ) ) then
+              rmach = rnd
+           else if( stdlib_lsame( cmach, 'M' ) ) then
+              rmach = minexponent(zero)
+           else if( stdlib_lsame( cmach, 'U' ) ) then
+              rmach = tiny(zero)
+           else if( stdlib_lsame( cmach, 'L' ) ) then
+              rmach = maxexponent(zero)
+           else if( stdlib_lsame( cmach, 'O' ) ) then
+              rmach = huge(zero)
+           else
+              rmach = zero
+           end if
+           stdlib_slamch = rmach
+           return
+     end function stdlib_slamch
+
+
+     pure real(sp)             function stdlib_slamc3( a, b )
+        ! -- lapack auxiliary routine --
+           ! univ. of tennessee, univ. of california berkeley and nag ltd..
+           ! Scalar Arguments 
+           real(sp), intent(in) :: a, b
+       ! =====================================================================
+           ! Executable Statements 
+           stdlib_slamc3 = a + b
+           return
+     end function stdlib_slamc3
+
+     !> SLAMRG: will create a permutation list which will merge the elements
+     !> of A (which is composed of two independently sorted sets) into a
+     !> single set which is sorted in ascending order.
+
+     pure subroutine stdlib_slamrg( n1, n2, a, strd1, strd2, index )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n1, n2, strd1, strd2
+           ! Array Arguments 
+           integer(ilp), intent(out) :: index(*)
+           real(sp), intent(in) :: a(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ind1, ind2, n1sv, n2sv
+           ! Executable Statements 
+           n1sv = n1
+           n2sv = n2
+           if( strd1>0 ) then
+              ind1 = 1
+           else
+              ind1 = n1
+           end if
+           if( strd2>0 ) then
+              ind2 = 1 + n1
+           else
+              ind2 = n1 + n2
+           end if
+           i = 1
+           ! while ( (n1sv > 0)
+           10 continue
+           if( n1sv>0 .and. n2sv>0 ) then
+              if( a( ind1 )<=a( ind2 ) ) then
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + strd1
+                 n1sv = n1sv - 1
+              else
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + strd2
+                 n2sv = n2sv - 1
+              end if
+              go to 10
+           end if
+           ! end while
+           if( n1sv==0 ) then
+              do n1sv = 1, n2sv
+                 index( i ) = ind2
+                 i = i + 1
+                 ind2 = ind2 + strd2
+              end do
+           else
+           ! n2sv == 0
+              do n2sv = 1, n1sv
+                 index( i ) = ind1
+                 i = i + 1
+                 ind1 = ind1 + strd1
+              end do
+           end if
+           return
+     end subroutine stdlib_slamrg
+
+     !> SLAORHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine SORHR_COL. In SORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> SLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine SLAORHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, SLAORHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without SLAORHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_slaorhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAORHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = -sign( one, a( 1, 1 ) )
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_slamch('S')
+              ! construct the subdiagonal elements of l
+              if( abs( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_sscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_slaorhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_strsm( 'R', 'U', 'N', 'N', m-n1, n1, one, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_strsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_sgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_slaorhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_slaorhr_col_getrfnp2
+
+     !> SLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_slapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           real(sp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_slapmr
+
+     !> SLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_slapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, in
+           real(sp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 100
+                 k( i ) = -k( i )
+                 j = k( i )
+                 80 continue
+                 if( j==i )go to 100
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 80
+                 100 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_slapmt
+
+     !> SLAPY3: returns sqrt(x**2+y**2+z**2), taking care not to cause
+     !> unnecessary overflow and unnecessary underflow.
+
+     pure real(sp)             function stdlib_slapy3( x, y, z )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: x, y, z
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: w, xabs, yabs, zabs, hugeval
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           hugeval = stdlib_slamch( 'OVERFLOW' )
+           xabs = abs( x )
+           yabs = abs( y )
+           zabs = abs( z )
+           w = max( xabs, yabs, zabs )
+           if( w==zero .or. w>hugeval ) then
+           ! w can be zero for max(0,nan,0)
+           ! adding all three entries together will make sure
+           ! nan will not disappear.
+              stdlib_slapy3 =  xabs + yabs + zabs
+           else
+              stdlib_slapy3 = w*sqrt( ( xabs / w )**2+( yabs / w )**2+( zabs / w )**2 )
+           end if
+           return
+     end function stdlib_slapy3
+
+     !> SLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_slaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_slaqgb
+
+     !> SLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_slaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: c(*), r(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_slaqge
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, SLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - (sr1 + i*si1)*I)*(H - (sr2 + i*si2)*I)
+     !> scaling to avoid overflows and most underflows. It
+     !> is assumed that either
+     !> 1) sr1 = sr2 and si1 = -si2
+     !> or
+     !> 2) si1 = si2 = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_slaqr1( n, h, ldh, sr1, si1, sr2, si2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: si1, si2, sr1, sr2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           real(sp), intent(in) :: h(ldh,*)
+           real(sp), intent(out) :: v(*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(sp) :: h21s, h31s, s
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) - si1*( &
+                           si2 / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 )
+              end if
+           else
+              s = abs( h( 1, 1 )-sr2 ) + abs( si2 ) + abs( h( 2, 1 ) ) +abs( h( 3, 1 ) )
+              if( s==zero ) then
+                 v( 1 ) = zero
+                 v( 2 ) = zero
+                 v( 3 ) = zero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-sr1 )*( ( h( 1, 1 )-sr2 ) / s ) -si1*( si2 / s ) + h( 1, 2 )&
+                           *h21s + h( 1, 3 )*h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-sr1-sr2 ) +h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-sr1-sr2 ) +h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_slaqr1
+
+     !> SLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_slaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_slaqsb
+
+     !> SLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_slaqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_slaqsp
+
+     !> SLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_slaqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: thresh = 0.1e+0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_slaqsy
+
+     !> SLAR2V: applies a vector of real plane rotations from both sides to
+     !> a sequence of 2-by-2 real symmetric matrices, defined by the elements
+     !> of the vectors x, y and z. For i = 1,2,...,n
+     !> ( x(i)  z(i) ) := (  c(i)  s(i) ) ( x(i)  z(i) ) ( c(i) -s(i) )
+     !> ( z(i)  y(i) )    ( -s(i)  c(i) ) ( z(i)  y(i) ) ( s(i)  c(i) )
+
+     pure subroutine stdlib_slar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*), s(*)
+           real(sp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(sp) :: ci, si, t1, t2, t3, t4, t5, t6, xi, yi, zi
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( ix )
+              zi = z( ix )
+              ci = c( ic )
+              si = s( ic )
+              t1 = si*zi
+              t2 = ci*zi
+              t3 = t2 - si*xi
+              t4 = t2 + si*yi
+              t5 = ci*xi + t1
+              t6 = ci*yi - t1
+              x( ix ) = ci*t5 + si*t4
+              y( ix ) = ci*t6 - si*t3
+              z( ix ) = ci*t4 - si*t5
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_slar2v
+
+     !> SLARF: applies a real elementary reflector H to a real m by n matrix
+     !> C, from either the left or the right. H is represented in the form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+
+     pure subroutine stdlib_slarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           real(sp), intent(in) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(in) :: v(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=zero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-zero row in v.
+              do while( lastv>0 .and. v( i )==zero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-zero column in c(1:lastv,:).
+                 lastc = stdlib_ilaslc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-zero row in c(:,1:lastv).
+                 lastc = stdlib_ilaslr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_sp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**t * v(1:lastv,1)
+                 call stdlib_sgemv( 'TRANSPOSE', lastv, lastc, one, c, ldc, v, incv,zero, work, 1 &
+                           )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**t
+                 call stdlib_sger( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_sgemv( 'NO TRANSPOSE', lastc, lastv, one, c, ldc,v, incv, zero, work,&
+                            1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**t
+                 call stdlib_sger( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_slarf
+
+     !> SLARFB: applies a real block reflector H or its transpose H**T to a
+     !> real m by n matrix C, from either the left or the right.
+
+     pure subroutine stdlib_slarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_scopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2
+                       call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c( k+1, 1 ),&
+                                  ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v( k+1, 1 )&
+                                 , ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_scopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( k+1, 1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v  =  (c1**t * v1 + c2**t * v2)  (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_scopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( m-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1
+                       call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', n, k, m-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-k, n, k,-one, v, ldv, &
+                                 work, ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v( m-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h'  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_scopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( n-k+&
+                              1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,one, c, ldc, &
+                                 v, ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**t
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v( n-k+1, &
+                              1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c1**t
+                    do j = 1, k
+                       call stdlib_scopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', n, k,one, v, ldv, &
+                              work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**t * v2**t
+                       call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c( k+1, 1 ), &
+                                 ldc, v( 1, k+1 ), ldv, one,work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c2 := c2 - v2**t * w**t
+                       call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v( 1, k+1 ), &
+                                 ldv, work, ldwork, one,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_scopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'UNIT', m, k,one, v, ldv, &
+                              work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c( 1, k+1 ),&
+                                  ldc, v( 1, k+1 ), ldv,one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_strmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v( 1, k+1 ), ldv, one,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, one, v, ldv,&
+                               work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**t * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**t * v**t  =  (c1**t * v1**t + c2**t * v2**t) (stored in work)
+                    ! w := c2**t
+                    do j = 1, k
+                       call stdlib_scopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', n, k,one, v( 1, m-k+&
+                              1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**t * v1**t
+                       call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, m-k, one,c, ldc, v, ldv,&
+                                  one, work, ldwork )
+                    end if
+                    ! w := w * t**t  or  w * t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**t * w**t
+                    if( m>k ) then
+                       ! c1 := c1 - v1**t * w**t
+                       call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', m-k, n, k, -one,v, ldv, work, &
+                                 ldwork, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, one, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**t
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) - work( i, j )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**t  where  c = ( c1  c2 )
+                    ! w := c * v**t  =  (c1*v1**t + c2*v2**t)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_scopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'UNIT', m, k,one, v( 1, n-k+&
+                              1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**t
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, n-k,one, c, ldc, v, &
+                                 ldv, one, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**t
+                    call stdlib_strmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,one, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-one, work, &
+                                 ldwork, v, ldv, one, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, one, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_slarfb
+
+     !> SLARFB_GETT: applies a real Householder block reflector H from the
+     !> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_slarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(in) :: t(ldt,*)
+           real(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_scopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**t) * w2 = (a1**t) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_strmm( 'L', 'L', 'T', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**t) * b2 = w2 + (b1**t) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_sgemm( 'T', 'N', k, n-k, m, one, b, ldb,b( 1, k+1 ), ldb, one, work, &
+                           ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_strmm( 'L', 'U', 'N', 'N', k, n-k, one, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_sgemm( 'N', 'N', m, n-k, k, -one, b, ldb,work, ldwork, one, b( 1, k+&
+                           1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_strmm( 'L', 'L', 'N', 'U', k, n-k, one, a, lda,work, ldwork )
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_scopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = zero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**t) * w1 = (a1**t) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_strmm( 'L', 'L', 'T', 'U', k, k, one, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_strmm( 'L', 'U', 'N', 'N', k, k, one, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_strmm( 'R', 'U', 'N', 'N', m, k, -one, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_strmm( 'L', 'L', 'N', 'U', k, k, one, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_slarfb_gett
+
+     !> SLARFT: forms the triangular factor T of a real block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+
+     pure subroutine stdlib_slarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(sp), intent(out) :: t(ldt,*)
+           real(sp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( i, prevlastv )
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( i , j )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**t * v(i:j,i)
+                       call stdlib_sgemv( 'TRANSPOSE', j-i, i-1, -tau( i ),v( i+1, 1 ), ldv, v( i+&
+                                 1, i ), 1, one,t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=zero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**t
+                       call stdlib_sgemv( 'NO TRANSPOSE', i-1, j-i, -tau( i ),v( 1, i+1 ), ldv, v(&
+                                  i, i+1 ), ldv,one, t( 1, i ), 1 )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_strmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                 end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==zero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = zero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( n-k+i , j )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**t * v(j:n-k+i,i)
+                          call stdlib_sgemv( 'TRANSPOSE', n-k+i-j, k-i, -tau( i ),v( j, i+1 ), &
+                                    ldv, v( j, i ), 1, one,t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=zero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**t
+                          call stdlib_sgemv( 'NO TRANSPOSE', k-i, n-k+i-j,-tau( i ), v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,one, t( i+1, i ), 1 )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_strmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_slarft
+
+     !> SLARFX: applies a real elementary reflector H to a real m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_slarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           real(sp), intent(in) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(in) :: v(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           real(sp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, v6, &
+                     v7, v8, v9
+           ! Executable Statements 
+           if( tau==zero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_slarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_slarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = one - tau*v( 1 )*v( 1 )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*v1
+              v2 = v( 2 )
+              t2 = tau*v2
+              v3 = v( 3 )
+              t3 = tau*v3
+              v4 = v( 4 )
+              t4 = tau*v4
+              v5 = v( 5 )
+              t5 = tau*v5
+              v6 = v( 6 )
+              t6 = tau*v6
+              v7 = v( 7 )
+              t7 = tau*v7
+              v8 = v( 8 )
+              t8 = tau*v8
+              v9 = v( 9 )
+              t9 = tau*v9
+              v10 = v( 10 )
+              t10 = tau*v10
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+       410 return
+     end subroutine stdlib_slarfx
+
+     !> SLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n symmetric matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_slarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           real(sp), intent(in) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(in) :: v(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: alpha
+           ! Executable Statements 
+           if( tau==zero )return
+           ! form  w:= c * v
+           call stdlib_ssymv( uplo, n, one, c, ldc, v, incv, zero, work, 1 )
+           alpha = -half*tau*stdlib_sdot( n, work, 1, v, incv )
+           call stdlib_saxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_ssyr2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_slarfy
+
+     !> SLARGV: generates a vector of real plane rotations, determined by
+     !> elements of the real vectors x and y. For i = 1,2,...,n
+     !> (  c(i)  s(i) ) ( x(i) ) = ( a(i) )
+     !> ( -s(i)  c(i) ) ( y(i) ) = (   0  )
+
+     pure subroutine stdlib_slargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(out) :: c(*)
+           real(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(sp) :: f, g, t, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_10: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              if( g==zero ) then
+                 c( ic ) = one
+              else if( f==zero ) then
+                 c( ic ) = zero
+                 y( iy ) = one
+                 x( ix ) = g
+              else if( abs( f )>abs( g ) ) then
+                 t = g / f
+                 tt = sqrt( one+t*t )
+                 c( ic ) = one / tt
+                 y( iy ) = t*c( ic )
+                 x( ix ) = f*tt
+              else
+                 t = f / g
+                 tt = sqrt( one+t*t )
+                 y( iy ) = one / tt
+                 c( ic ) = t*y( iy )
+                 x( ix ) = g*tt
+              end if
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_10
+           return
+     end subroutine stdlib_slargv
+
+     !> Compute the splitting points with threshold SPLTOL.
+     !> SLARRA: sets any "small" off-diagonal elements to zero.
+
+     pure subroutine stdlib_slarra( n, d, e, e2, spltol, tnrm,nsplit, isplit, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, nsplit
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: spltol, tnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isplit(*)
+           real(sp), intent(in) :: d(*)
+           real(sp), intent(inout) :: e(*), e2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: eabs, tmp1
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           if(spltol<zero) then
+              ! criterion based on absolute off-diagonal value
+              tmp1 = abs(spltol)* tnrm
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= tmp1) then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           else
+              ! criterion that guarantees relative accuracy
+              do i = 1, n-1
+                 eabs = abs( e(i) )
+                 if( eabs <= spltol * sqrt(abs(d(i)))*sqrt(abs(d(i+1))) )then
+                    e(i) = zero
+                    e2(i) = zero
+                    isplit( nsplit ) = i
+                    nsplit = nsplit + 1
+                 end if
+              end do
+           endif
+           isplit( nsplit ) = n
+           return
+     end subroutine stdlib_slarra
+
+     !> Find the number of eigenvalues of the symmetric tridiagonal matrix T
+     !> that are in the interval (VL,VU] if JOBT = 'T', and of L D L^T
+     !> if JOBT = 'L'.
+
+     pure subroutine stdlib_slarrc( jobt, n, vl, vu, d, e, pivmin,eigcnt, lcnt, rcnt, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobt
+           integer(ilp), intent(out) :: eigcnt, info, lcnt, rcnt
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: pivmin, vl, vu
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: matt
+           real(sp) :: lpivot, rpivot, sl, su, tmp, tmp2
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           lcnt = 0
+           rcnt = 0
+           eigcnt = 0
+           matt = stdlib_lsame( jobt, 'T' )
+           if (matt) then
+              ! sturm sequence count on t
+              lpivot = d( 1 ) - vl
+              rpivot = d( 1 ) - vu
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+              do i = 1, n-1
+                 tmp = e(i)**2
+                 lpivot = ( d( i+1 )-vl ) - tmp/lpivot
+                 rpivot = ( d( i+1 )-vu ) - tmp/rpivot
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+              end do
+           else
+              ! sturm sequence count on l d l^t
+              sl = -vl
+              su = -vu
+              do i = 1, n - 1
+                 lpivot = d( i ) + sl
+                 rpivot = d( i ) + su
+                 if( lpivot<=zero ) then
+                    lcnt = lcnt + 1
+                 endif
+                 if( rpivot<=zero ) then
+                    rcnt = rcnt + 1
+                 endif
+                 tmp = e(i) * d(i) * e(i)
+                 tmp2 = tmp / lpivot
+                 if( tmp2==zero ) then
+                    sl =  tmp - vl
+                 else
+                    sl = sl*tmp2 - vl
+                 end if
+                 tmp2 = tmp / rpivot
+                 if( tmp2==zero ) then
+                    su =  tmp - vu
+                 else
+                    su = su*tmp2 - vu
+                 end if
+              end do
+              lpivot = d( n ) + sl
+              rpivot = d( n ) + su
+              if( lpivot<=zero ) then
+                 lcnt = lcnt + 1
+              endif
+              if( rpivot<=zero ) then
+                 rcnt = rcnt + 1
+              endif
+           endif
+           eigcnt = rcnt - lcnt
+           return
+     end subroutine stdlib_slarrc
+
+     !> SLARRD: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from SSTEMR.
+     !> The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_slarrd( range, order, n, vl, vu, il, iu, gers,reltol, d, e, e2, &
+               pivmin, nsplit, isplit,m, w, werr, wl, wu, iblock, indexw,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n, nsplit
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: pivmin, reltol, vl, vu
+           real(sp), intent(out) :: wl, wu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), indexw(*), iwork(*)
+           integer(ilp), intent(in) :: isplit(*)
+           real(sp), intent(in) :: d(*), e(*), e2(*), gers(*)
+           real(sp), intent(out) :: w(*), werr(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: fudge = two
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: valrng = 2
+           integer(ilp), parameter :: indrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: i, ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iout, &
+                     irange, itmax, itmp1, itmp2, iw, iwoff, j, jblk, jdisc, je, jee, nb, nwl, nwu
+           real(sp) :: atoli, eps, gl, gu, rtoli, tmp1, tmp2, tnorm, uflow, wkill, wlu, &
+                     wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           else
+              irange = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( .not.(stdlib_lsame(order,'B').or.stdlib_lsame(order,'E')) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==valrng ) then
+              if( vl>=vu )info = -5
+           else if( irange==indrng .and.( il<1 .or. il>max( 1, n ) ) ) then
+              info = -6
+           else if( irange==indrng .and.( iu<min( n, il ) .or. iu>n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 ) return
+           ! simplification:
+           if( irange==indrng .and. il==1 .and. iu==n ) irange = 1
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           uflow = stdlib_slamch( 'U' )
+           ! special case when n=1
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+              endif
+              return
+           end if
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           nb = stdlib_ilaenv( 1, 'SSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 ) nb = 0
+           ! find global spectral radius
+           gl = d(1)
+           gu = d(1)
+           do i = 1,n
+              gl =  min( gl, gers( 2*i - 1))
+              gu = max( gu, gers(2*i) )
+           end do
+           ! compute global gerschgorin bounds and spectral diameter
+           tnorm = max( abs( gl ), abs( gu ) )
+           gl = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           gu = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           ! [jan/28/2009] remove the line below since spdiam variable not use
+           ! spdiam = gu - gl
+           ! input arguments for stdlib_slaebz:
+           ! the relative tolerance.  an interval (a,b] lies within
+           ! "relative tolerance" if  b-a < reltol*max(|a|,|b|),
+           rtoli = reltol
+           ! set the absolute tolerance for interval convergence to zero to force
+           ! interval convergence based on relative size of the interval.
+           ! this is dangerous because intervals might not converge when reltol is
+           ! small. but at least a very small number should be selected so that for
+           ! strongly graded matrices, the code can get relatively accurate
+           ! eigenvalues.
+           atoli = fudge*two*uflow + fudge*two*pivmin
+           if( irange==indrng ) then
+              ! range='i': compute an interval containing eigenvalues
+              ! il through iu. the initial interval [gl,gu] from the global
+              ! gerschgorin bounds gl and gu is refined by stdlib_slaebz.
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_slaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin,d, e, e2, iwork( 5 )&
+                        , work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iinfo /= 0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! on exit, output intervals may not be ordered by ascending negcount
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              ! on exit, the interval [wl, wlu] contains a value with negcount nwl,
+              ! and [wul, wu] contains a value with negcount nwu.
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           elseif( irange==valrng ) then
+              wl = vl
+              wu = vu
+           elseif( irange==allrng ) then
+              wl = gl
+              wu = gu
+           endif
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jblk = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jblk )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! 1x1 block
+                 if( wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==allrng .or.( wl<d( ibegin )-pivmin.and. wu>= d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                 end if
+              ! disabled 2x2 case because of a failure on the following matrix
+              ! range = 'i', il = iu = 4
+                ! original tridiagonal, d = [
+                 ! -0.150102010615740e+00_sp
+                 ! -0.849897989384260e+00_sp
+                 ! -0.128208148052635e-15_sp
+                  ! 0.128257718286320e-15_sp
+                ! ];
+                ! e = [
+                 ! -0.357171383266986e+00_sp
+                 ! -0.180411241501588e-15_sp
+                 ! -0.175152352710251e-15_sp
+                ! ];
+               ! else if( in==2 ) then
+      ! *           2x2 block
+                  ! disc = sqrt( (half*(d(ibegin)-d(iend)))**2 + e(ibegin)**2 )
+                  ! tmp1 = half*(d(ibegin)+d(iend))
+                  ! l1 = tmp1 - disc
+                  ! if( wl>= l1-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l1-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l1-pivmin .and. wu>=
+           ! $          l1-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l1
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 1
+                  ! endif
+                  ! l2 = tmp1 + disc
+                  ! if( wl>= l2-pivmin )
+           ! $         nwl = nwl + 1
+                  ! if( wu>= l2-pivmin )
+           ! $         nwu = nwu + 1
+                  ! if( irange==allrng .or. ( wl<l2-pivmin .and. wu>=
+           ! $          l2-pivmin ) ) then
+                     ! m = m + 1
+                     ! w( m ) = l2
+      ! *              the uncertainty of eigenvalues of a 2x2 matrix is very small
+                     ! werr( m ) = eps * abs( w( m ) ) * two
+                     ! iblock( m ) = jblk
+                     ! indexw( m ) = 2
+                  ! endif
+              else
+                 ! general case - block of size in >= 2
+                 ! compute local gerschgorin interval and use it as the initial
+                 ! interval for stdlib_slaebz
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend
+                    gl =  min( gl, gers( 2*j - 1))
+                    gu = max( gu, gers(2*j) )
+                 end do
+                 ! [jan/28/2009]
+                 ! change spdiam by tnorm in lines 2 and 3 thereafter
+                 ! line 1: remove computation of spdiam (not useful anymore)
+                 ! spdiam = gu - gl
+                 ! gl = gl - fudge*spdiam*eps*in - fudge*pivmin
+                 ! gu = gu + fudge*spdiam*eps*in + fudge*pivmin
+                 gl = gl - fudge*tnorm*eps*in - fudge*pivmin
+                 gu = gu + fudge*tnorm*eps*in + fudge*pivmin
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       ! the local block contains none of the wanted eigenvalues
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    ! refine search interval if possible, only range (wl,wu] matters
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! find negcount of initial interval boundaries gl and gu
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_slaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 ),&
+                            iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_slaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), e2( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( m+&
+                            1 ), iblock( m+1 ), iinfo )
+                 if( iinfo /= 0 ) then
+                    info = iinfo
+                    return
+                 end if
+                 ! copy eigenvalues into w and iblock
+                 ! use -jblk for block number for unconverged eigenvalues.
+                 ! loop over the number of output intervals from stdlib_slaebz
+                 do j = 1, iout
+                    ! eigenvalue approximation is middle point of interval
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! semi length of error interval
+                    tmp2 = half*abs( work( j+n )-work( j+in+n ) )
+                    if( j>iout-iinfo ) then
+                       ! flag non-convergence.
+                       ncnvrg = .true.
+                       ib = -jblk
+                    else
+                       ib = jblk
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       werr( je ) = tmp2
+                       indexw( je ) = je - iwoff
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==indrng ) then
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 ) then
+                 im = 0
+                 do je = 1, m
+                    ! remove some of the smallest eigenvalues from the left so that
+                    ! at the end idiscl =0. move all eigenvalues up to the left.
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscu>0 ) then
+                 ! remove some of the largest eigenvalues from the right so that
+                 ! at the end idiscu =0. move all eigenvalues up to the left.
+                 im=m+1
+                 do je = m, 1, -1
+                    if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im - 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 jee = 0
+                 do je = im, m
+                    jee = jee + 1
+                    w( jee ) = w( je )
+                    werr( jee ) = werr( je )
+                    indexw( jee ) = indexw( je )
+                    iblock( jee ) = iblock( je )
+                 end do
+                 m = m-im+1
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic. (if n(w) is
+                 ! monotone non-decreasing, this should never happen.)
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by marking the corresponding iblock = 0
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>=wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 ! now erase all eigenvalues with iblock set to zero
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       werr( im ) = werr( je )
+                       indexw( im ) = indexw( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           if(( irange==allrng .and. m/=n ).or.( irange==indrng .and. m/=iu-il+1 ) ) then
+              toofew = .true.
+           end if
+           ! if order='b', do nothing the eigenvalues are already sorted by
+              ! block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( stdlib_lsame(order,'E') .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    tmp2 = werr( ie )
+                    itmp1 = iblock( ie )
+                    itmp2 = indexw( ie )
+                    w( ie ) = w( je )
+                    werr( ie ) = werr( je )
+                    iblock( ie ) = iblock( je )
+                    indexw( ie ) = indexw( je )
+                    w( je ) = tmp1
+                    werr( je ) = tmp2
+                    iblock( je ) = itmp1
+                    indexw( je ) = itmp2
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_slarrd
+
+     !> Given the initial eigenvalue approximations of T, SLARRJ:
+     !> does  bisection to refine the eigenvalues of T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses in WERR. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_slarrj( n, d, e2, ifirst, ilast,rtol, offset, w, werr, work, iwork,&
+               pivmin, spdiam, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: pivmin, rtol, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: d(*), e2(*)
+           real(sp), intent(inout) :: w(*), werr(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: cnt, i, i1, i2, ii, iter, j, k, next, nint, olnint, p, prev, &
+                     savi1
+           real(sp) :: dplus, fac, left, mid, right, s, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           i2 = ilast
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           loop_75: do i = i1, i2
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              mid = w(ii)
+              right = w( ii ) + werr( ii )
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              ! the following test prevents the test of converged intervals
+              if( width<rtol*tmp ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<i2)) i1 = i + 1
+                 if((prev>=i1).and.(i<=i2)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 ! make sure that [left,right] contains the desired eigenvalue
+                 ! do while( cnt(left)>i-1 )
+                 fac = one
+                 20 continue
+                 cnt = 0
+                 s = left
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt>i-1 ) then
+                    left = left - werr( ii )*fac
+                    fac = two*fac
+                    go to 20
+                 end if
+                 ! do while( cnt(right)<i )
+                 fac = one
+                 50 continue
+                 cnt = 0
+                 s = right
+                 dplus = d( 1 ) - s
+                 if( dplus<zero ) cnt = cnt + 1
+                 do j = 2, n
+                    dplus = d( j ) - s - e2( j-1 )/dplus
+                    if( dplus<zero ) cnt = cnt + 1
+                 end do
+                 if( cnt<i ) then
+                    right = right + werr( ii )*fac
+                    fac = two*fac
+                    go to 50
+                 end if
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = cnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           savi1 = i1
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do p = 1, olnint
+              k = 2*i
+              ii = i - offset
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              if( ( width<rtol*tmp ) .or.(iter==maxitr) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              cnt = 0
+              s = mid
+              dplus = d( 1 ) - s
+              if( dplus<zero ) cnt = cnt + 1
+              do j = 2, n
+                 dplus = d( j ) - s - e2( j-1 )/dplus
+                 if( dplus<zero ) cnt = cnt + 1
+              end do
+              if( cnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = savi1, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           return
+     end subroutine stdlib_slarrj
+
+     !> SLARRK: computes one eigenvalue of a symmetric tridiagonal
+     !> matrix T to suitable accuracy. This is an auxiliary code to be
+     !> called from SSTEMR.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_slarrk( n, iw, gl, gu,d, e2, pivmin, reltol, w, werr, info)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: iw, n
+           real(sp), intent(in) :: pivmin, reltol, gl, gu
+           real(sp), intent(out) :: w, werr
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), e2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: fudge = two
+           
+           ! Local Scalars 
+           integer(ilp) :: i, it, itmax, negcnt
+           real(sp) :: atoli, eps, left, mid, right, rtoli, tmp1, tmp2, tnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           tnorm = max( abs( gl ), abs( gu ) )
+           rtoli = reltol
+           atoli = fudge*two*pivmin
+           itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           info = -1
+           left = gl - fudge*tnorm*eps*n - fudge*two*pivmin
+           right = gu + fudge*tnorm*eps*n + fudge*two*pivmin
+           it = 0
+           10 continue
+           ! check if interval converged or maximum number of iterations reached
+           tmp1 = abs( right - left )
+           tmp2 = max( abs(right), abs(left) )
+           if( tmp1<max( atoli, pivmin, rtoli*tmp2 ) ) then
+              info = 0
+              goto 30
+           endif
+           if(it>itmax)goto 30
+           ! count number of negative pivots for mid-point
+           it = it + 1
+           mid = half * (left + right)
+           negcnt = 0
+           tmp1 = d( 1 ) - mid
+           if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+           if( tmp1<=zero )negcnt = negcnt + 1
+           do i = 2, n
+              tmp1 = d( i ) - e2( i-1 ) / tmp1 - mid
+              if( abs( tmp1 )<pivmin )tmp1 = -pivmin
+              if( tmp1<=zero )negcnt = negcnt + 1
+           end do
+           if(negcnt>=iw) then
+              right = mid
+           else
+              left = mid
+           endif
+           goto 10
+           30 continue
+           ! converged or maximum number of iterations reached
+           w = half * (left + right)
+           werr = half * abs( right - left )
+           return
+     end subroutine stdlib_slarrk
+
+     !> Perform tests to decide whether the symmetric tridiagonal matrix T
+     !> warrants expensive computations which guarantee high relative accuracy
+     !> in the eigenvalues.
+
+     pure subroutine stdlib_slarrr( n, d, e, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           real(sp), intent(inout) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: relcond = 0.999_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           logical(lk) :: yesrel
+           real(sp) :: eps, safmin, smlnum, rmin, tmp, tmp2, offdig, offdig2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              info = 0
+              return
+           end if
+           ! as a default, do not go for relative-accuracy preserving computations.
+           info = 1
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           rmin = sqrt( smlnum )
+           ! tests for relative accuracy
+           ! test for scaled diagonal dominance
+           ! scale the diagonal entries to one and check whether the sum of the
+           ! off-diagonals is less than one
+           ! the sdd relative error bounds have a 1/(1- 2*x) factor in them,
+           ! x = max(offdig + offdig2), so when x is close to 1/2, no relative
+           ! accuracy is promised.  in the notation of the code fragment below,
+           ! 1/(1 - (offdig + offdig2)) is the condition number.
+           ! we don't think it is worth going into "sdd mode" unless the relative
+           ! condition number is reasonable, not 1/macheps.
+           ! the threshold should be compatible with other thresholds used in the
+           ! code. we set  offdig + offdig2 <= .999_sp =: relcond, it corresponds
+           ! to losing at most 3 decimal digits: 1 / (1 - (offdig + offdig2)) <= 1000
+           ! instead of the current offdig + offdig2 < 1
+           yesrel = .true.
+           offdig = zero
+           tmp = sqrt(abs(d(1)))
+           if (tmp<rmin) yesrel = .false.
+           if(.not.yesrel) goto 11
+           do i = 2, n
+              tmp2 = sqrt(abs(d(i)))
+              if (tmp2<rmin) yesrel = .false.
+              if(.not.yesrel) goto 11
+              offdig2 = abs(e(i-1))/(tmp*tmp2)
+              if(offdig+offdig2>=relcond) yesrel = .false.
+              if(.not.yesrel) goto 11
+              tmp = tmp2
+              offdig = offdig2
+           end do
+           11 continue
+           if( yesrel ) then
+              info = 0
+              return
+           else
+           endif
+           ! *** more to be implemented ***
+           ! test if the lower bidiagonal matrix l from t = l d l^t
+           ! (zero shift facto) is well conditioned
+           ! test if the upper bidiagonal matrix u from t = u d u^t
+           ! (zero shift facto) is well conditioned.
+           ! in this case, the matrix needs to be flipped and, at the end
+           ! of the eigenvector computation, the flip needs to be applied
+           ! to the computed eigenvectors (and the support)
+           return
+     end subroutine stdlib_slarrr
+
+     !> !
+     !>
+     !> SLARTG: generates a plane rotation so that
+     !> [  C  S  ]  .  [ F ]  =  [ R ]
+     !> [ -S  C  ]     [ G ]     [ 0 ]
+     !> where C**2 + S**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> R = sign(F) * sqrt(F**2 + G**2)
+     !> C = F / R
+     !> S = G / R
+     !> Hence C >= 0. The algorithm used to compute these quantities
+     !> incorporates scaling to avoid overflow or underflow in computing the
+     !> square root of the sum of squares.
+     !> This version is discontinuous in R at F = 0 but it returns the same
+     !> C and S as SLARTG for complex inputs (F,0) and (G,0).
+     !> This is a more accurate version of the BLAS1 routine SROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0 and (G .ne. 0), then C=0 and S=sign(1,G) without doing any
+     !> floating point operations (saves work in SBDSQR when
+     !> there are zeros on the diagonal).
+     !> If F exceeds G in magnitude, C will be positive.
+     !> Below, wp=>sp stands for single precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_slartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(sp), intent(out) :: c, r, s
+        real(sp), intent(in) :: f, g
+        ! Local Scalars 
+        real(sp) :: d, f1, fs, g1, gs, p, u, uu
+        ! Intrinsic Functions 
+        intrinsic :: abs,sign,sqrt
+        ! Executable Statements 
+        f1 = abs( f )
+        g1 = abs( g )
+        if( g == zero ) then
+           c = one
+           s = zero
+           r = f
+        else if( f == zero ) then
+           c = zero
+           s = sign( one, g )
+           r = g1
+     else if( f1 > rtmin .and. f1 < rtmax .and.            g1 > rtmin .and. g1 < rtmax ) &
+               then
+           d = sqrt( f*f + g*g )
+           p = one / d
+           c = f1*p
+           s = g*sign( p, f )
+           r = sign( d, f )
+        else
+           u = min( safmax, max( safmin, f1, g1 ) )
+           uu = one / u
+           fs = f*uu
+           gs = g*uu
+           d = sqrt( fs*fs + gs*gs )
+           p = one / d
+           c = abs( fs )*p
+           s = gs*sign( p, f )
+           r = sign( d, f )*u
+        end if
+        return
+     end subroutine stdlib_slartg
+
+     !> SLARTGP: generates a plane rotation so that
+     !> [  CS  SN  ]  .  [ F ]  =  [ R ]   where CS**2 + SN**2 = 1.
+     !> [ -SN  CS  ]     [ G ]     [ 0 ]
+     !> This is a slower, more accurate version of the Level 1 BLAS routine SROTG,
+     !> with the following other differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then CS=(+/-)1 and SN=0.
+     !> If F=0 and (G .ne. 0), then CS=0 and SN=(+/-)1.
+     !> The sign is chosen so that R >= 0.
+
+     pure subroutine stdlib_slartgp( f, g, cs, sn, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(out) :: cs, r, sn
+           real(sp), intent(in) :: f, g
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i
+           real(sp) :: eps, f1, g1, safmin, safmn2, safmx2, scale
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,sign,sqrt
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Executable Statements 
+           ! if( first ) then
+              safmin = stdlib_slamch( 'S' )
+              eps = stdlib_slamch( 'E' )
+              safmn2 = stdlib_slamch( 'B' )**int( log( safmin / eps ) /log( stdlib_slamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+              ! first = .false.
+           ! end if
+           if( g==zero ) then
+              cs = sign( one, f )
+              sn = zero
+              r = abs( f )
+           else if( f==zero ) then
+              cs = zero
+              sn = sign( one, g )
+              r = abs( g )
+           else
+              f1 = f
+              g1 = g
+              scale = max( abs( f1 ), abs( g1 ) )
+              if( scale>=safmx2 ) then
+                 count = 0
+                 10 continue
+                 count = count + 1
+                 f1 = f1*safmn2
+                 g1 = g1*safmn2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale>=safmx2 .and. count < 20)go to 10
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmx2
+                 end do
+              else if( scale<=safmn2 ) then
+                 count = 0
+                 30 continue
+                 count = count + 1
+                 f1 = f1*safmx2
+                 g1 = g1*safmx2
+                 scale = max( abs( f1 ), abs( g1 ) )
+                 if( scale<=safmn2 )go to 30
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+                 do i = 1, count
+                    r = r*safmn2
+                 end do
+              else
+                 r = sqrt( f1**2+g1**2 )
+                 cs = f1 / r
+                 sn = g1 / r
+              end if
+              if( r<zero ) then
+                 cs = -cs
+                 sn = -sn
+                 r = -r
+              end if
+           end if
+           return
+     end subroutine stdlib_slartgp
+
+     !> SLARTGS: generates a plane rotation designed to introduce a bulge in
+     !> Golub-Reinsch-style implicit QR iteration for the bidiagonal SVD
+     !> problem. X and Y are the top-row entries, and SIGMA is the shift.
+     !> The computed CS and SN define a plane rotation satisfying
+     !> [  CS  SN  ]  .  [ X^2 - SIGMA ]  =  [ R ],
+     !> [ -SN  CS  ]     [    X * Y    ]     [ 0 ]
+     !> with R nonnegative.  If X^2 - SIGMA and X * Y are 0, then the
+     !> rotation is by PI/2.
+
+     pure subroutine stdlib_slartgs( x, y, sigma, cs, sn )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(out) :: cs, sn
+           real(sp), intent(in) :: sigma, x, y
+        ! ===================================================================
+           
+           ! Local Scalars 
+           real(sp) :: r, s, thresh, w, z
+           thresh = stdlib_slamch('E')
+           ! compute the first column of b**t*b - sigma^2*i, up to a scale
+           ! factor.
+           if( (sigma == zero .and. abs(x) < thresh) .or.(abs(x) == sigma .and. y == zero) ) &
+                     then
+              z = zero
+              w = zero
+           else if( sigma == zero ) then
+              if( x >= zero ) then
+                 z = x
+                 w = y
+              else
+                 z = -x
+                 w = -y
+              end if
+           else if( abs(x) < thresh ) then
+              z = -sigma*sigma
+              w = zero
+           else
+              if( x >= zero ) then
+                 s = one
+              else
+                 s = negone
+              end if
+              z = s * (abs(x)-sigma) * (s+sigma/x)
+              w = s * y
+           end if
+           ! generate the rotation.
+           ! call stdlib_slartgp( z, w, cs, sn, r ) might seem more natural;
+           ! reordering the arguments ensures that if z = 0 then the rotation
+           ! is by pi/2.
+           call stdlib_slartgp( w, z, sn, cs, r )
+           return
+           ! end stdlib_slartgs
+     end subroutine stdlib_slartgs
+
+     !> SLARTV: applies a vector of real plane rotations to elements of the
+     !> real vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (  c(i)  s(i) ) ( x(i) )
+     !> ( y(i) )    ( -s(i)  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_slartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(sp), intent(in) :: c(*), s(*)
+           real(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           real(sp) :: xi, yi
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - s( ic )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_slartv
+
+     !> SLARUV: returns a vector of n random real numbers from a uniform (0,1)
+     !> distribution (n <= 128).
+     !> This is an auxiliary routine called by SLARNV and CLARNV.
+
+     pure subroutine stdlib_slaruv( iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(sp), intent(out) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           integer(ilp), parameter :: ipw2 = 4096
+           real(sp), parameter :: r = one/ipw2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, i3, i4, it1, it2, it3, it4, j
+           ! Local Arrays 
+           integer(ilp) :: mm(lv,4)
+           ! Intrinsic Functions 
+           intrinsic :: min,mod,real
+           ! Data Statements 
+           mm(1,1:4)=[494,322,2508,2549]
+           mm(2,1:4)=[2637,789,3754,1145]
+           mm(3,1:4)=[255,1440,1766,2253]
+           mm(4,1:4)=[2008,752,3572,305]
+           mm(5,1:4)=[1253,2859,2893,3301]
+           mm(6,1:4)=[3344,123,307,1065]
+           mm(7,1:4)=[4084,1848,1297,3133]
+           mm(8,1:4)=[1739,643,3966,2913]
+           mm(9,1:4)=[3143,2405,758,3285]
+           mm(10,1:4)=[3468,2638,2598,1241]
+           mm(11,1:4)=[688,2344,3406,1197]
+           mm(12,1:4)=[1657,46,2922,3729]
+           mm(13,1:4)=[1238,3814,1038,2501]
+           mm(14,1:4)=[3166,913,2934,1673]
+           mm(15,1:4)=[1292,3649,2091,541]
+           mm(16,1:4)=[3422,339,2451,2753]
+           mm(17,1:4)=[1270,3808,1580,949]
+           mm(18,1:4)=[2016,822,1958,2361]
+           mm(19,1:4)=[154,2832,2055,1165]
+           mm(20,1:4)=[2862,3078,1507,4081]
+           mm(21,1:4)=[697,3633,1078,2725]
+           mm(22,1:4)=[1706,2970,3273,3305]
+           mm(23,1:4)=[491,637,17,3069]
+           mm(24,1:4)=[931,2249,854,3617]
+           mm(25,1:4)=[1444,2081,2916,3733]
+           mm(26,1:4)=[444,4019,3971,409]
+           mm(27,1:4)=[3577,1478,2889,2157]
+           mm(28,1:4)=[3944,242,3831,1361]
+           mm(29,1:4)=[2184,481,2621,3973]
+           mm(30,1:4)=[1661,2075,1541,1865]
+           mm(31,1:4)=[3482,4058,893,2525]
+           mm(32,1:4)=[657,622,736,1409]
+           mm(33,1:4)=[3023,3376,3992,3445]
+           mm(34,1:4)=[3618,812,787,3577]
+           mm(35,1:4)=[1267,234,2125,77]
+           mm(36,1:4)=[1828,641,2364,3761]
+           mm(37,1:4)=[164,4005,2460,2149]
+           mm(38,1:4)=[3798,1122,257,1449]
+           mm(39,1:4)=[3087,3135,1574,3005]
+           mm(40,1:4)=[2400,2640,3912,225]
+           mm(41,1:4)=[2870,2302,1216,85]
+           mm(42,1:4)=[3876,40,3248,3673]
+           mm(43,1:4)=[1905,1832,3401,3117]
+           mm(44,1:4)=[1593,2247,2124,3089]
+           mm(45,1:4)=[1797,2034,2762,1349]
+           mm(46,1:4)=[1234,2637,149,2057]
+           mm(47,1:4)=[3460,1287,2245,413]
+           mm(48,1:4)=[328,1691,166,65]
+           mm(49,1:4)=[2861,496,466,1845]
+           mm(50,1:4)=[1950,1597,4018,697]
+           mm(51,1:4)=[617,2394,1399,3085]
+           mm(52,1:4)=[2070,2584,190,3441]
+           mm(53,1:4)=[3331,1843,2879,1573]
+           mm(54,1:4)=[769,336,153,3689]
+           mm(55,1:4)=[1558,1472,2320,2941]
+           mm(56,1:4)=[2412,2407,18,929]
+           mm(57,1:4)=[2800,433,712,533]
+           mm(58,1:4)=[189,2096,2159,2841]
+           mm(59,1:4)=[287,1761,2318,4077]
+           mm(60,1:4)=[2045,2810,2091,721]
+           mm(61,1:4)=[1227,566,3443,2821]
+           mm(62,1:4)=[2838,442,1510,2249]
+           mm(63,1:4)=[209,41,449,2397]
+           mm(64,1:4)=[2770,1238,1956,2817]
+           mm(65,1:4)=[3654,1086,2201,245]
+           mm(66,1:4)=[3993,603,3137,1913]
+           mm(67,1:4)=[192,840,3399,1997]
+           mm(68,1:4)=[2253,3168,1321,3121]
+           mm(69,1:4)=[3491,1499,2271,997]
+           mm(70,1:4)=[2889,1084,3667,1833]
+           mm(71,1:4)=[2857,3438,2703,2877]
+           mm(72,1:4)=[2094,2408,629,1633]
+           mm(73,1:4)=[1818,1589,2365,981]
+           mm(74,1:4)=[688,2391,2431,2009]
+           mm(75,1:4)=[1407,288,1113,941]
+           mm(76,1:4)=[634,26,3922,2449]
+           mm(77,1:4)=[3231,512,2554,197]
+           mm(78,1:4)=[815,1456,184,2441]
+           mm(79,1:4)=[3524,171,2099,285]
+           mm(80,1:4)=[1914,1677,3228,1473]
+           mm(81,1:4)=[516,2657,4012,2741]
+           mm(82,1:4)=[164,2270,1921,3129]
+           mm(83,1:4)=[303,2587,3452,909]
+           mm(84,1:4)=[2144,2961,3901,2801]
+           mm(85,1:4)=[3480,1970,572,421]
+           mm(86,1:4)=[119,1817,3309,4073]
+           mm(87,1:4)=[3357,676,3171,2813]
+           mm(88,1:4)=[837,1410,817,2337]
+           mm(89,1:4)=[2826,3723,3039,1429]
+           mm(90,1:4)=[2332,2803,1696,1177]
+           mm(91,1:4)=[2089,3185,1256,1901]
+           mm(92,1:4)=[3780,184,3715,81]
+           mm(93,1:4)=[1700,663,2077,1669]
+           mm(94,1:4)=[3712,499,3019,2633]
+           mm(95,1:4)=[150,3784,1497,2269]
+           mm(96,1:4)=[2000,1631,1101,129]
+           mm(97,1:4)=[3375,1925,717,1141]
+           mm(98,1:4)=[1621,3912,51,249]
+           mm(99,1:4)=[3090,1398,981,3917]
+           mm(100,1:4)=[3765,1349,1978,2481]
+           mm(101,1:4)=[1149,1441,1813,3941]
+           mm(102,1:4)=[3146,2224,3881,2217]
+           mm(103,1:4)=[33,2411,76,2749]
+           mm(104,1:4)=[3082,1907,3846,3041]
+           mm(105,1:4)=[2741,3192,3694,1877]
+           mm(106,1:4)=[359,2786,1682,345]
+           mm(107,1:4)=[3316,382,124,2861]
+           mm(108,1:4)=[1749,37,1660,1809]
+           mm(109,1:4)=[185,759,3997,3141]
+           mm(110,1:4)=[2784,2948,479,2825]
+           mm(111,1:4)=[2202,1862,1141,157]
+           mm(112,1:4)=[2199,3802,886,2881]
+           mm(113,1:4)=[1364,2423,3514,3637]
+           mm(114,1:4)=[1244,2051,1301,1465]
+           mm(115,1:4)=[2020,2295,3604,2829]
+           mm(116,1:4)=[3160,1332,1888,2161]
+           mm(117,1:4)=[2785,1832,1836,3365]
+           mm(118,1:4)=[2772,2405,1990,361]
+           mm(119,1:4)=[1217,3638,2058,2685]
+           mm(120,1:4)=[1822,3661,692,3745]
+           mm(121,1:4)=[1245,327,1194,2325]
+           mm(122,1:4)=[2252,3660,20,3609]
+           mm(123,1:4)=[3904,716,3285,3821]
+           mm(124,1:4)=[2774,1842,2046,3537]
+           mm(125,1:4)=[997,3987,2107,517]
+           mm(126,1:4)=[2573,1368,3508,3017]
+           mm(127,1:4)=[1148,1848,3525,2141]
+           mm(128,1:4)=[545,2366,3801,1537]
+           ! Executable Statements 
+           i1 = iseed( 1 )
+           i2 = iseed( 2 )
+           i3 = iseed( 3 )
+           i4 = iseed( 4 )
+           loop_10: do i = 1, min( n, lv )
+           20 continue
+              ! multiply the seed by i-th power of the multiplier modulo 2**48
+              it4 = i4*mm( i, 4 )
+              it3 = it4 / ipw2
+              it4 = it4 - ipw2*it3
+              it3 = it3 + i3*mm( i, 4 ) + i4*mm( i, 3 )
+              it2 = it3 / ipw2
+              it3 = it3 - ipw2*it2
+              it2 = it2 + i2*mm( i, 4 ) + i3*mm( i, 3 ) + i4*mm( i, 2 )
+              it1 = it2 / ipw2
+              it2 = it2 - ipw2*it1
+              it1 = it1 + i1*mm( i, 4 ) + i2*mm( i, 3 ) + i3*mm( i, 2 ) +i4*mm( i, 1 )
+              it1 = mod( it1, ipw2 )
+              ! convert 48-bit integer to a realnumber in the interval (0,1,KIND=sp)
+              x( i ) = r*( real( it1,KIND=sp)+r*( real( it2,KIND=sp)+r*( real( it3,KIND=sp)+&
+                        r*real( it4,KIND=sp) ) ) )
+              if (x( i )==1.0_sp) then
+                 ! if a real number has n bits of precision, and the first
+                 ! n bits of the 48-bit integer above happen to be all 1 (which
+                 ! will occur about once every 2**n calls), then x( i ) will
+                 ! be rounded to exactly one. in ieee single precision arithmetic,
+                 ! this will happen relatively often since n = 24.
+                 ! since x( i ) is not supposed to return exactly 0.0_sp or 1.0_sp,
+                 ! the statistically correct thing to do in this situation is
+                 ! simply to iterate again.
+                 ! n.b. the case x( i ) = 0.0_sp should not be possible.
+                 i1 = i1 + 2
+                 i2 = i2 + 2
+                 i3 = i3 + 2
+                 i4 = i4 + 2
+                 goto 20
+              end if
+           end do loop_10
+           ! return final value of seed
+           iseed( 1 ) = it1
+           iseed( 2 ) = it2
+           iseed( 3 ) = it3
+           iseed( 4 ) = it4
+           return
+     end subroutine stdlib_slaruv
+
+     !> SLARZ: applies a real elementary reflector H to a real M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**T
+     !> where tau is a real scalar and v is a real vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> H is a product of k elementary reflectors as returned by STZRZF.
+
+     pure subroutine stdlib_slarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           real(sp), intent(in) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(in) :: v(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=zero ) then
+                 ! w( 1:n ) = c( 1, 1:n )
+                 call stdlib_scopy( n, c, ldc, work, 1 )
+                 ! w( 1:n ) = w( 1:n ) + c( m-l+1:m, 1:n )**t * v( 1:l )
+                 call stdlib_sgemv( 'TRANSPOSE', l, n, one, c( m-l+1, 1 ), ldc, v,incv, one, work,&
+                            1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_saxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**t
+                 call stdlib_sger( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=zero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_scopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_sgemv( 'NO TRANSPOSE', m, l, one, c( 1, n-l+1 ), ldc,v, incv, one, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_saxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**t
+                 call stdlib_sger( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_slarz
+
+     !> SLARZB: applies a real block reflector H or its transpose H**T to
+     !> a real distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_slarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           real(sp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**t * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**t
+              do j = 1, k
+                 call stdlib_scopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**t * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', n, k, l, one,c( m-l+1, 1 ), &
+                        ldc, v, ldv, one, work, ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_strmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**t
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**t * w( 1:n, 1:k )**t
+              if( l>0 )call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -one, v, ldv,work, &
+                        ldwork, one, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**t
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_scopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, one,c( 1, n-l+1 ),&
+                         ldc, v, ldv, one, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * t  or  w( 1:m, 1:k ) * t**t
+              call stdlib_strmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, one, t,ldt, work, &
+                        ldwork )
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * v( 1:k, 1:l )
+              if( l>0 )call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -one,work, &
+                        ldwork, v, ldv, one, c( 1, n-l+1 ), ldc )
+           end if
+           return
+     end subroutine stdlib_slarzb
+
+     !> SLARZT: forms the triangular factor T of a real block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**T
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**T * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_slarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           real(sp), intent(out) :: t(ldt,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==zero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = zero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**t
+                    call stdlib_sgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, zero,t( i+1, i ), 1 )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_strmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_slarzt
+
+     !> SLAS2:  computes the singular values of the 2-by-2 matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, SSMIN is the smaller singular value and SSMAX is the
+     !> larger singular value.
+
+     pure subroutine stdlib_slas2( f, g, h, ssmin, ssmax )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: f, g, h
+           real(sp), intent(out) :: ssmax, ssmin
+        ! ====================================================================
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: as, at, au, c, fa, fhmn, fhmx, ga, ha
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           fa = abs( f )
+           ga = abs( g )
+           ha = abs( h )
+           fhmn = min( fa, ha )
+           fhmx = max( fa, ha )
+           if( fhmn==zero ) then
+              ssmin = zero
+              if( fhmx==zero ) then
+                 ssmax = ga
+              else
+                 ssmax = max( fhmx, ga )*sqrt( one+( min( fhmx, ga ) / max( fhmx, ga ) )**2 )
+                           
+              end if
+           else
+              if( ga<fhmx ) then
+                 as = one + fhmn / fhmx
+                 at = ( fhmx-fhmn ) / fhmx
+                 au = ( ga / fhmx )**2
+                 c = two / ( sqrt( as*as+au )+sqrt( at*at+au ) )
+                 ssmin = fhmn*c
+                 ssmax = fhmx / c
+              else
+                 au = fhmx / ga
+                 if( au==zero ) then
+                    ! avoid possible harmful underflow if exponent range
+                    ! asymmetric (true ssmin may not underflow even if
+                    ! au underflows)
+                    ssmin = ( fhmn*fhmx ) / ga
+                    ssmax = ga
+                 else
+                    as = one + fhmn / fhmx
+                    at = ( fhmx-fhmn ) / fhmx
+                    c = one / ( sqrt( one+( as*au )**2 )+sqrt( one+( at*au )**2 ) )
+                    ssmin = ( fhmn*c )*au
+                    ssmin = ssmin + ssmin
+                    ssmax = ga / ( c+c )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_slas2
+
+     !> This subroutine computes the square root of the I-th eigenvalue
+     !> of a positive symmetric rank-one modification of a 2-by-2 diagonal
+     !> matrix
+     !> diag( D ) * diag( D ) +  RHO * Z * transpose(Z) .
+     !> The diagonal entries in the array D are assumed to satisfy
+     !> 0 <= D(i) < D(j)  for  i < j .
+     !> We also assume RHO > 0 and that the Euclidean norm of the vector
+     !> Z is one.
+
+     pure subroutine stdlib_slasd5( i, d, z, delta, rho, dsigma, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i
+           real(sp), intent(out) :: dsigma
+           real(sp), intent(in) :: rho
+           ! Array Arguments 
+           real(sp), intent(in) :: d(2), z(2)
+           real(sp), intent(out) :: delta(2), work(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: b, c, del, delsq, tau, w
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           del = d( 2 ) - d( 1 )
+           delsq = del*( d( 2 )+d( 1 ) )
+           if( i==1 ) then
+              w = one + four*rho*( z( 2 )*z( 2 ) / ( d( 1 )+three*d( 2 ) )-z( 1 )*z( 1 ) / ( &
+                        three*d( 1 )+d( 2 ) ) ) / del
+              if( w>zero ) then
+                 b = delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 1 )*z( 1 )*delsq
+                 ! b > zero, always
+                 ! the following tau is dsigma * dsigma - d( 1 ) * d( 1 )
+                 tau = two*c / ( b+sqrt( abs( b*b-four*c ) ) )
+                 ! the following tau is dsigma - d( 1 )
+                 tau = tau / ( d( 1 )+sqrt( d( 1 )*d( 1 )+tau ) )
+                 dsigma = d( 1 ) + tau
+                 delta( 1 ) = -tau
+                 delta( 2 ) = del - tau
+                 work( 1 ) = two*d( 1 ) + tau
+                 work( 2 ) = ( d( 1 )+tau ) + d( 2 )
+                 ! delta( 1 ) = -z( 1 ) / tau
+                 ! delta( 2 ) = z( 2 ) / ( del-tau )
+              else
+                 b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+                 c = rho*z( 2 )*z( 2 )*delsq
+                 ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+                 if( b>zero ) then
+                    tau = -two*c / ( b+sqrt( b*b+four*c ) )
+                 else
+                    tau = ( b-sqrt( b*b+four*c ) ) / two
+                 end if
+                 ! the following tau is dsigma - d( 2 )
+                 tau = tau / ( d( 2 )+sqrt( abs( d( 2 )*d( 2 )+tau ) ) )
+                 dsigma = d( 2 ) + tau
+                 delta( 1 ) = -( del+tau )
+                 delta( 2 ) = -tau
+                 work( 1 ) = d( 1 ) + tau + d( 2 )
+                 work( 2 ) = two*d( 2 ) + tau
+                 ! delta( 1 ) = -z( 1 ) / ( del+tau )
+                 ! delta( 2 ) = -z( 2 ) / tau
+              end if
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           else
+              ! now i=2
+              b = -delsq + rho*( z( 1 )*z( 1 )+z( 2 )*z( 2 ) )
+              c = rho*z( 2 )*z( 2 )*delsq
+              ! the following tau is dsigma * dsigma - d( 2 ) * d( 2 )
+              if( b>zero ) then
+                 tau = ( b+sqrt( b*b+four*c ) ) / two
+              else
+                 tau = two*c / ( -b+sqrt( b*b+four*c ) )
+              end if
+              ! the following tau is dsigma - d( 2 )
+              tau = tau / ( d( 2 )+sqrt( d( 2 )*d( 2 )+tau ) )
+              dsigma = d( 2 ) + tau
+              delta( 1 ) = -( del+tau )
+              delta( 2 ) = -tau
+              work( 1 ) = d( 1 ) + tau + d( 2 )
+              work( 2 ) = two*d( 2 ) + tau
+              ! delta( 1 ) = -z( 1 ) / ( del+tau )
+              ! delta( 2 ) = -z( 2 ) / tau
+              ! temp = sqrt( delta( 1 )*delta( 1 )+delta( 2 )*delta( 2 ) )
+              ! delta( 1 ) = delta( 1 ) / temp
+              ! delta( 2 ) = delta( 2 ) / temp
+           end if
+           return
+     end subroutine stdlib_slasd5
+
+     !> SLASDT: creates a tree of subproblems for bidiagonal divide and
+     !> conquer.
+
+     pure subroutine stdlib_slasdt( n, lvl, nd, inode, ndiml, ndimr, msub )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: lvl, nd
+           integer(ilp), intent(in) :: msub, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: inode(*), ndiml(*), ndimr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, ir, llst, maxn, ncrnt, nlvl
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: int,log,max,real
+           ! Executable Statements 
+           ! find the number of levels on the tree.
+           maxn = max( 1, n )
+           temp = log( real( maxn,KIND=sp) / real( msub+1,KIND=sp) ) / log( two )
+           lvl = int( temp,KIND=ilp) + 1
+           i = n / 2
+           inode( 1 ) = i + 1
+           ndiml( 1 ) = i
+           ndimr( 1 ) = n - i - 1
+           il = 0
+           ir = 1
+           llst = 1
+           do nlvl = 1, lvl - 1
+              ! constructing the tree at (nlvl+1)-st level. the number of
+              ! nodes created on this level is llst * 2.
+              do i = 0, llst - 1
+                 il = il + 2
+                 ir = ir + 2
+                 ncrnt = llst + i
+                 ndiml( il ) = ndiml( ncrnt ) / 2
+                 ndimr( il ) = ndiml( ncrnt ) - ndiml( il ) - 1
+                 inode( il ) = inode( ncrnt ) - ndimr( il ) - 1
+                 ndiml( ir ) = ndimr( ncrnt ) / 2
+                 ndimr( ir ) = ndimr( ncrnt ) - ndiml( ir ) - 1
+                 inode( ir ) = inode( ncrnt ) + ndiml( ir ) + 1
+              end do
+              llst = llst*2
+           end do
+           nd = llst*2 - 1
+           return
+     end subroutine stdlib_slasdt
+
+     !> SLASET: initializes an m-by-n matrix A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_slaset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           real(sp), intent(out) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the strictly upper triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the strictly lower triangular or trapezoidal part of the
+              ! array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           else
+              ! set the leading m-by-n submatrix to alpha.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+           end if
+           ! set the first min(m,n) diagonal elements to beta.
+           do i = 1, min( m, n )
+              a( i, i ) = beta
+           end do
+           return
+     end subroutine stdlib_slaset
+
+     !> SLASQ4: computes an approximation TAU to the smallest eigenvalue
+     !> using values of d from the previous transform.
+
+     pure subroutine stdlib_slasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn,dn1, dn2, tau, &
+               ttype, g )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, n0in, pp
+           integer(ilp), intent(out) :: ttype
+           real(sp), intent(in) :: dmin, dmin1, dmin2, dn, dn1, dn2
+           real(sp), intent(inout) :: g
+           real(sp), intent(out) :: tau
+           ! Array Arguments 
+           real(sp), intent(in) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: cnst1 = 0.5630_sp
+           real(sp), parameter :: cnst2 = 1.010_sp
+           real(sp), parameter :: cnst3 = 1.050_sp
+           real(sp), parameter :: qurtr = 0.250_sp
+           real(sp), parameter :: third = 0.3330_sp
+           real(sp), parameter :: hundrd = 100.0_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i4, nn, np
+           real(sp) :: a2, b1, b2, gam, gap1, gap2, s
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! a negative dmin forces the shift to take that absolute value
+           ! ttype records the type of shift.
+           if( dmin<=zero ) then
+              tau = -dmin
+              ttype = -1
+              return
+           end if
+           nn = 4*n0 + pp
+           if( n0in==n0 ) then
+              ! no eigenvalues deflated.
+              if( dmin==dn .or. dmin==dn1 ) then
+                 b1 = sqrt( z( nn-3 ) )*sqrt( z( nn-5 ) )
+                 b2 = sqrt( z( nn-7 ) )*sqrt( z( nn-9 ) )
+                 a2 = z( nn-7 ) + z( nn-5 )
+                 ! cases 2 and 3.
+                 if( dmin==dn .and. dmin1==dn1 ) then
+                    gap2 = dmin2 - a2 - dmin2*qurtr
+                    if( gap2>zero .and. gap2>b2 ) then
+                       gap1 = a2 - dn - ( b2 / gap2 )*b2
+                    else
+                       gap1 = a2 - dn - ( b1+b2 )
+                    end if
+                    if( gap1>zero .and. gap1>b1 ) then
+                       s = max( dn-( b1 / gap1 )*b1, half*dmin )
+                       ttype = -2
+                    else
+                       s = zero
+                       if( dn>b1 )s = dn - b1
+                       if( a2>( b1+b2 ) )s = min( s, a2-( b1+b2 ) )
+                       s = max( s, third*dmin )
+                       ttype = -3
+                    end if
+                 else
+                    ! case 4.
+                    ttype = -4
+                    s = qurtr*dmin
+                    if( dmin==dn ) then
+                       gam = dn
+                       a2 = zero
+                       if( z( nn-5 ) > z( nn-7 ) )return
+                       b2 = z( nn-5 ) / z( nn-7 )
+                       np = nn - 9
+                    else
+                       np = nn - 2*pp
+                       gam = dn1
+                       if( z( np-4 ) > z( np-2 ) )return
+                       a2 = z( np-4 ) / z( np-2 )
+                       if( z( nn-9 ) > z( nn-11 ) )return
+                       b2 = z( nn-9 ) / z( nn-11 )
+                       np = nn - 13
+                    end if
+                    ! approximate contribution to norm squared from i < nn-1.
+                    a2 = a2 + b2
+                    do i4 = np, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 20
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 20
+                    end do
+                    20 continue
+                    a2 = cnst3*a2
+                    ! rayleigh quotient residual bound.
+                    if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+                 end if
+              else if( dmin==dn2 ) then
+                 ! case 5.
+                 ttype = -5
+                 s = qurtr*dmin
+                 ! compute contribution to norm squared from i > nn-2.
+                 np = nn - 2*pp
+                 b1 = z( np-2 )
+                 b2 = z( np-6 )
+                 gam = dn2
+                 if( z( np-8 )>b2 .or. z( np-4 )>b1 )return
+                 a2 = ( z( np-8 ) / b2 )*( one+z( np-4 ) / b1 )
+                 ! approximate contribution to norm squared from i < nn-2.
+                 if( n0-i0>2 ) then
+                    b2 = z( nn-13 ) / z( nn-15 )
+                    a2 = a2 + b2
+                    do i4 = nn - 17, 4*i0 - 1 + pp, -4
+                       if( b2==zero )go to 40
+                       b1 = b2
+                       if( z( i4 ) > z( i4-2 ) )return
+                       b2 = b2*( z( i4 ) / z( i4-2 ) )
+                       a2 = a2 + b2
+                       if( hundrd*max( b2, b1 )<a2 .or. cnst1<a2 )go to 40
+                    end do
+                    40 continue
+                    a2 = cnst3*a2
+                 end if
+                 if( a2<cnst1 )s = gam*( one-sqrt( a2 ) ) / ( one+a2 )
+              else
+                 ! case 6, no information to guide us.
+                 if( ttype==-6 ) then
+                    g = g + third*( one-g )
+                 else if( ttype==-18 ) then
+                    g = qurtr*third
+                 else
+                    g = qurtr
+                 end if
+                 s = g*dmin
+                 ttype = -6
+              end if
+           else if( n0in==( n0+1 ) ) then
+              ! one eigenvalue just deflated. use dmin1, dn1 for dmin and dn.
+              if( dmin1==dn1 .and. dmin2==dn2 ) then
+                 ! cases 7 and 8.
+                 ttype = -7
+                 s = third*dmin1
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 60
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    a2 = b1
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*max( b1, a2 )<b2 )go to 60
+                 end do
+                 60 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin1 / ( one+b2**2 )
+                 gap2 = half*dmin2 - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                    ttype = -8
+                 end if
+              else
+                 ! case 9.
+                 s = qurtr*dmin1
+                 if( dmin1==dn1 )s = half*dmin1
+                 ttype = -9
+              end if
+           else if( n0in==( n0+2 ) ) then
+              ! two eigenvalues deflated. use dmin2, dn2 for dmin and dn.
+              ! cases 10 and 11.
+              if( dmin2==dn2 .and. two*z( nn-5 )<z( nn-7 ) ) then
+                 ttype = -10
+                 s = third*dmin2
+                 if( z( nn-5 )>z( nn-7 ) )return
+                 b1 = z( nn-5 ) / z( nn-7 )
+                 b2 = b1
+                 if( b2==zero )go to 80
+                 do i4 = 4*n0 - 9 + pp, 4*i0 - 1 + pp, -4
+                    if( z( i4 )>z( i4-2 ) )return
+                    b1 = b1*( z( i4 ) / z( i4-2 ) )
+                    b2 = b2 + b1
+                    if( hundrd*b1<b2 )go to 80
+                 end do
+                 80 continue
+                 b2 = sqrt( cnst3*b2 )
+                 a2 = dmin2 / ( one+b2**2 )
+                 gap2 = z( nn-7 ) + z( nn-9 ) -sqrt( z( nn-11 ) )*sqrt( z( nn-9 ) ) - a2
+                 if( gap2>zero .and. gap2>b2*a2 ) then
+                    s = max( s, a2*( one-cnst2*a2*( b2 / gap2 )*b2 ) )
+                 else
+                    s = max( s, a2*( one-cnst2*b2 ) )
+                 end if
+              else
+                 s = qurtr*dmin2
+                 ttype = -11
+              end if
+           else if( n0in>( n0+2 ) ) then
+              ! case 12, more than two eigenvalues deflated. no information.
+              s = zero
+              ttype = -12
+           end if
+           tau = s
+           return
+     end subroutine stdlib_slasq4
+
+     !> SLASQ5: computes one dqds transform in ping-pong form, one
+     !> version for IEEE machines another for non IEEE machines.
+
+     pure subroutine stdlib_slasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2,dn, dnm1, dnm2, &
+               ieee, eps )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(sp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           real(sp), intent(inout) :: tau
+           real(sp), intent(in) :: sigma, eps
+           ! Array Arguments 
+           real(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(sp) :: d, emin, temp, dthresh
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           dthresh = eps*(sigma+tau)
+           if( tau<dthresh*half ) tau = zero
+           if( tau/=zero ) then
+              j4 = 4*i0 + pp - 3
+              emin = z( j4+4 )
+              d = z( j4 ) - tau
+              dmin = d
+              dmin1 = -z( j4 )
+              if( ieee ) then
+           ! code for ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       temp = z( j4+1 ) / z( j4-2 )
+                       d = d*temp - tau
+                       dmin = min( dmin, d )
+                       z( j4 ) = z( j4-1 )*temp
+                       emin = min( z( j4 ), emin )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       temp = z( j4+2 ) / z( j4-3 )
+                       d = d*temp - tau
+                       dmin = min( dmin, d )
+                       z( j4-1 ) = z( j4 )*temp
+                       emin = min( z( j4-1 ), emin )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dn )
+              else
+           ! code for non ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                          d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                       end if
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4 ) )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                          d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                       end if
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4-1 ) )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 if( dnm2<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 if( dnm1<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dn )
+              end if
+           else
+           ! this is the version that sets d's to zero if they are small enough
+              j4 = 4*i0 + pp - 3
+              emin = z( j4+4 )
+              d = z( j4 ) - tau
+              dmin = d
+              dmin1 = -z( j4 )
+              if( ieee ) then
+           ! code for ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       temp = z( j4+1 ) / z( j4-2 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4 ) = z( j4-1 )*temp
+                       emin = min( z( j4 ), emin )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       temp = z( j4+2 ) / z( j4-3 )
+                       d = d*temp - tau
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       z( j4-1 ) = z( j4 )*temp
+                       emin = min( z( j4-1 ), emin )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                 dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 dmin = min( dmin, dn )
+              else
+           ! code for non ieee arithmetic.
+                 if( pp==0 ) then
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-2 ) = d + z( j4-1 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                          d = z( j4+1 )*( d / z( j4-2 ) ) - tau
+                       end if
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4 ) )
+                    end do
+                 else
+                    do j4 = 4*i0, 4*( n0-3 ), 4
+                       z( j4-3 ) = d + z( j4 )
+                       if( d<zero ) then
+                          return
+                       else
+                          z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                          d = z( j4+2 )*( d / z( j4-3 ) ) - tau
+                       end if
+                       if( d<dthresh ) d = zero
+                       dmin = min( dmin, d )
+                       emin = min( emin, z( j4-1 ) )
+                    end do
+                 end if
+           ! unroll last two steps.
+                 dnm2 = d
+                 dmin2 = dmin
+                 j4 = 4*( n0-2 ) - pp
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm2 + z( j4p2 )
+                 if( dnm2<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dnm1 )
+                 dmin1 = dmin
+                 j4 = j4 + 4
+                 j4p2 = j4 + 2*pp - 1
+                 z( j4-2 ) = dnm1 + z( j4p2 )
+                 if( dnm1<zero ) then
+                    return
+                 else
+                    z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+                    dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) ) - tau
+                 end if
+                 dmin = min( dmin, dn )
+              end if
+           end if
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_slasq5
+
+     !> SLASQ6: computes one dqd (shift equal to zero) transform in
+     !> ping-pong form, with protection against underflow and overflow.
+
+     pure subroutine stdlib_slasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn,dnm1, dnm2 )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i0, n0, pp
+           real(sp), intent(out) :: dmin, dmin1, dmin2, dn, dnm1, dnm2
+           ! Array Arguments 
+           real(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j4, j4p2
+           real(sp) :: d, emin, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( ( n0-i0-1 )<=0 )return
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           j4 = 4*i0 + pp - 3
+           emin = z( j4+4 )
+           d = z( j4 )
+           dmin = d
+           if( pp==0 ) then
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-2 ) = d + z( j4-1 )
+                 if( z( j4-2 )==zero ) then
+                    z( j4 ) = zero
+                    d = z( j4+1 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+1 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4+1 ) ) &
+                           then
+                    temp = z( j4+1 ) / z( j4-2 )
+                    z( j4 ) = z( j4-1 )*temp
+                    d = d*temp
+                 else
+                    z( j4 ) = z( j4+1 )*( z( j4-1 ) / z( j4-2 ) )
+                    d = z( j4+1 )*( d / z( j4-2 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4 ) )
+              end do
+           else
+              do j4 = 4*i0, 4*( n0-3 ), 4
+                 z( j4-3 ) = d + z( j4 )
+                 if( z( j4-3 )==zero ) then
+                    z( j4-1 ) = zero
+                    d = z( j4+2 )
+                    dmin = d
+                    emin = zero
+                 else if( safmin*z( j4+2 )<z( j4-3 ) .and.safmin*z( j4-3 )<z( j4+2 ) ) &
+                           then
+                    temp = z( j4+2 ) / z( j4-3 )
+                    z( j4-1 ) = z( j4 )*temp
+                    d = d*temp
+                 else
+                    z( j4-1 ) = z( j4+2 )*( z( j4 ) / z( j4-3 ) )
+                    d = z( j4+2 )*( d / z( j4-3 ) )
+                 end if
+                 dmin = min( dmin, d )
+                 emin = min( emin, z( j4-1 ) )
+              end do
+           end if
+           ! unroll last two steps.
+           dnm2 = d
+           dmin2 = dmin
+           j4 = 4*( n0-2 ) - pp
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm2 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dnm1 = z( j4p2+2 )
+              dmin = dnm1
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dnm1 = dnm2*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dnm1 = z( j4p2+2 )*( dnm2 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dnm1 )
+           dmin1 = dmin
+           j4 = j4 + 4
+           j4p2 = j4 + 2*pp - 1
+           z( j4-2 ) = dnm1 + z( j4p2 )
+           if( z( j4-2 )==zero ) then
+              z( j4 ) = zero
+              dn = z( j4p2+2 )
+              dmin = dn
+              emin = zero
+           else if( safmin*z( j4p2+2 )<z( j4-2 ) .and.safmin*z( j4-2 )<z( j4p2+2 ) ) then
+              temp = z( j4p2+2 ) / z( j4-2 )
+              z( j4 ) = z( j4p2 )*temp
+              dn = dnm1*temp
+           else
+              z( j4 ) = z( j4p2+2 )*( z( j4p2 ) / z( j4-2 ) )
+              dn = z( j4p2+2 )*( dnm1 / z( j4-2 ) )
+           end if
+           dmin = min( dmin, dn )
+           z( j4+2 ) = dn
+           z( 4*n0-pp ) = emin
+           return
+     end subroutine stdlib_slasq6
+
+     !> SLASR: applies a sequence of plane rotations to a real matrix A,
+     !> from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_slasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: c(*), s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(sp) :: ctemp, stemp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_slasr
+
+     !> Sort the numbers in D in increasing order (if ID = 'I') or
+     !> in decreasing order (if ID = 'D' ).
+     !> Use Quick Sort, reverting to Insertion sort on arrays of
+     !> size <= 20. Dimension of STACK limits N to about 2**32.
+
+     pure subroutine stdlib_slasrt( id, n, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: id
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: select = 20
+           
+           ! Local Scalars 
+           integer(ilp) :: dir, endd, i, j, start, stkpnt
+           real(sp) :: d1, d2, d3, dmnmx, tmp
+           ! Local Arrays 
+           integer(ilp) :: stack(2,32)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           dir = -1
+           if( stdlib_lsame( id, 'D' ) ) then
+              dir = 0
+           else if( stdlib_lsame( id, 'I' ) ) then
+              dir = 1
+           end if
+           if( dir==-1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           stkpnt = 1
+           stack( 1, 1 ) = 1
+           stack( 2, 1 ) = n
+           10 continue
+           start = stack( 1, stkpnt )
+           endd = stack( 2, stkpnt )
+           stkpnt = stkpnt - 1
+           if( endd-start<=select .and. endd-start>0 ) then
+              ! do insertion sort on d( start:endd )
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 loop_30: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )>d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_30
+                       end if
+                    end do
+                 end do loop_30
+              else
+                 ! sort into increasing order
+                 loop_50: do i = start + 1, endd
+                    do j = i, start + 1, -1
+                       if( d( j )<d( j-1 ) ) then
+                          dmnmx = d( j )
+                          d( j ) = d( j-1 )
+                          d( j-1 ) = dmnmx
+                       else
+                          cycle loop_50
+                       end if
+                    end do
+                 end do loop_50
+              end if
+           else if( endd-start>select ) then
+              ! partition d( start:endd ) and stack parts, largest one first
+              ! choose partition entry as median of 3
+              d1 = d( start )
+              d2 = d( endd )
+              i = ( start+endd ) / 2
+              d3 = d( i )
+              if( d1<d2 ) then
+                 if( d3<d1 ) then
+                    dmnmx = d1
+                 else if( d3<d2 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d2
+                 end if
+              else
+                 if( d3<d2 ) then
+                    dmnmx = d2
+                 else if( d3<d1 ) then
+                    dmnmx = d3
+                 else
+                    dmnmx = d1
+                 end if
+              end if
+              if( dir==0 ) then
+                 ! sort into decreasing order
+                 i = start - 1
+                 j = endd + 1
+                 60 continue
+                 70 continue
+                 j = j - 1
+                 if( d( j )<dmnmx )go to 70
+                 80 continue
+                 i = i + 1
+                 if( d( i )>dmnmx )go to 80
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 60
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              else
+                 ! sort into increasing order
+                 i = start - 1
+                 j = endd + 1
+                 90 continue
+                 100 continue
+                 j = j - 1
+                 if( d( j )>dmnmx )go to 100
+                 110 continue
+                 i = i + 1
+                 if( d( i )<dmnmx )go to 110
+                 if( i<j ) then
+                    tmp = d( i )
+                    d( i ) = d( j )
+                    d( j ) = tmp
+                    go to 90
+                 end if
+                 if( j-start>endd-j-1 ) then
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                 else
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = j + 1
+                    stack( 2, stkpnt ) = endd
+                    stkpnt = stkpnt + 1
+                    stack( 1, stkpnt ) = start
+                    stack( 2, stkpnt ) = j
+                 end if
+              end if
+           end if
+           if( stkpnt>0 )go to 10
+           return
+     end subroutine stdlib_slasrt
+
+     !> !
+     !>
+     !> SLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_slassq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(sp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        real(sp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(sp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(x(ix))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_slassq
+
+     !> SLASV2: computes the singular value decomposition of a 2-by-2
+     !> triangular matrix
+     !> [  F   G  ]
+     !> [  0   H  ].
+     !> On return, abs(SSMAX) is the larger singular value, abs(SSMIN) is the
+     !> smaller singular value, and (CSL,SNL) and (CSR,SNR) are the left and
+     !> right singular vectors for abs(SSMAX), giving the decomposition
+     !> [ CSL  SNL ] [  F   G  ] [ CSR -SNR ]  =  [ SSMAX   0   ]
+     !> [-SNL  CSL ] [  0   H  ] [ SNR  CSR ]     [  0    SSMIN ].
+
+     pure subroutine stdlib_slasv2( f, g, h, ssmin, ssmax, snr, csr, snl, csl )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(out) :: csl, csr, snl, snr, ssmax, ssmin
+           real(sp), intent(in) :: f, g, h
+       ! =====================================================================
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: gasmal, swap
+           integer(ilp) :: pmax
+           real(sp) :: a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m, mm, r, s, slt, srt, t, temp, &
+                     tsign, tt
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ft = f
+           fa = abs( ft )
+           ht = h
+           ha = abs( h )
+           ! pmax points to the maximum absolute element of matrix
+             ! pmax = 1 if f largest in absolute values
+             ! pmax = 2 if g largest in absolute values
+             ! pmax = 3 if h largest in absolute values
+           pmax = 1
+           swap = ( ha>fa )
+           if( swap ) then
+              pmax = 3
+              temp = ft
+              ft = ht
+              ht = temp
+              temp = fa
+              fa = ha
+              ha = temp
+              ! now fa .ge. ha
+           end if
+           gt = g
+           ga = abs( gt )
+           if( ga==zero ) then
+              ! diagonal matrix
+              ssmin = ha
+              ssmax = fa
+              clt = one
+              crt = one
+              slt = zero
+              srt = zero
+           else
+              gasmal = .true.
+              if( ga>fa ) then
+                 pmax = 2
+                 if( ( fa / ga )<stdlib_slamch( 'EPS' ) ) then
+                    ! case of very large ga
+                    gasmal = .false.
+                    ssmax = ga
+                    if( ha>one ) then
+                       ssmin = fa / ( ga / ha )
+                    else
+                       ssmin = ( fa / ga )*ha
+                    end if
+                    clt = one
+                    slt = ht / gt
+                    srt = one
+                    crt = ft / gt
+                 end if
+              end if
+              if( gasmal ) then
+                 ! normal case
+                 d = fa - ha
+                 if( d==fa ) then
+                    ! copes with infinite f or h
+                    l = one
+                 else
+                    l = d / fa
+                 end if
+                 ! note that 0 .le. l .le. 1
+                 m = gt / ft
+                 ! note that abs(m) .le. 1/macheps
+                 t = two - l
+                 ! note that t .ge. 1
+                 mm = m*m
+                 tt = t*t
+                 s = sqrt( tt+mm )
+                 ! note that 1 .le. s .le. 1 + 1/macheps
+                 if( l==zero ) then
+                    r = abs( m )
+                 else
+                    r = sqrt( l*l+mm )
+                 end if
+                 ! note that 0 .le. r .le. 1 + 1/macheps
+                 a = half*( s+r )
+                 ! note that 1 .le. a .le. 1 + abs(m)
+                 ssmin = ha / a
+                 ssmax = fa*a
+                 if( mm==zero ) then
+                    ! note that m is very tiny
+                    if( l==zero ) then
+                       t = sign( two, ft )*sign( one, gt )
+                    else
+                       t = gt / sign( d, ft ) + m / t
+                    end if
+                 else
+                    t = ( m / ( s+t )+m / ( r+l ) )*( one+a )
+                 end if
+                 l = sqrt( t*t+four )
+                 crt = two / l
+                 srt = t / l
+                 clt = ( crt+srt*m ) / a
+                 slt = ( ht / ft )*srt / a
+              end if
+           end if
+           if( swap ) then
+              csl = srt
+              snl = crt
+              csr = slt
+              snr = clt
+           else
+              csl = clt
+              snl = slt
+              csr = crt
+              snr = srt
+           end if
+           ! correct signs of ssmax and ssmin
+           if( pmax==1 )tsign = sign( one, csr )*sign( one, csl )*sign( one, f )
+           if( pmax==2 )tsign = sign( one, snr )*sign( one, csl )*sign( one, g )
+           if( pmax==3 )tsign = sign( one, snr )*sign( one, snl )*sign( one, h )
+           ssmax = sign( ssmax, tsign )
+           ssmin = sign( ssmin, tsign*sign( one, f )*sign( one, h ) )
+           return
+     end subroutine stdlib_slasv2
+
+     !> SLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_slaswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           real(sp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_slaswp
+
+     !> SLASY2: solves for the N1 by N2 matrix X, 1 <= N1,N2 <= 2, in
+     !> op(TL)*X + ISGN*X*op(TR) = SCALE*B,
+     !> where TL is N1 by N1, TR is N2 by N2, B is N1 by N2, and ISGN = 1 or
+     !> -1.  op(T) = T or T**T, where T**T denotes the transpose of T.
+
+     pure subroutine stdlib_slasy2( ltranl, ltranr, isgn, n1, n2, tl, ldtl, tr,ldtr, b, ldb, &
+               scale, x, ldx, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltranl, ltranr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, ldb, ldtl, ldtr, ldx, n1, n2
+           real(sp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(sp), intent(in) :: b(ldb,*), tl(ldtl,*), tr(ldtr,*)
+           real(sp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bswap, xswap
+           integer(ilp) :: i, ip, ipiv, ipsv, j, jp, jpsv, k
+           real(sp) :: bet, eps, gam, l21, sgn, smin, smlnum, tau1, temp, u11, u12, u22, &
+                     xmax
+           ! Local Arrays 
+           logical(lk) :: bswpiv(4), xswpiv(4)
+           integer(ilp) :: jpiv(4), locl21(4), locu12(4), locu22(4)
+           real(sp) :: btmp(4), t16(4,4), tmp(4), x2(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Data Statements 
+           locu12 = [3,4,1,2]
+           locl21 = [2,1,4,3]
+           locu22 = [4,3,2,1]
+           xswpiv = [.false.,.false.,.true.,.true.]
+           bswpiv = [.false.,.true.,.false.,.true.]
+           ! Executable Statements 
+           ! do not check the input parameters for errors
+           info = 0
+           ! quick return if possible
+           if( n1==0 .or. n2==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           sgn = isgn
+           k = n1 + n1 + n2 - 2
+           go to ( 10, 20, 30, 50 )k
+           ! 1 by 1: tl11*x + sgn*x*tr11 = b11
+           10 continue
+           tau1 = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           bet = abs( tau1 )
+           if( bet<=smlnum ) then
+              tau1 = smlnum
+              bet = smlnum
+              info = 1
+           end if
+           scale = one
+           gam = abs( b( 1, 1 ) )
+           if( smlnum*gam>bet )scale = one / gam
+           x( 1, 1 ) = ( b( 1, 1 )*scale ) / tau1
+           xnorm = abs( x( 1, 1 ) )
+           return
+           ! 1 by 2:
+           ! tl11*[x11 x12] + isgn*[x11 x12]*op[tr11 tr12]  = [b11 b12]
+                                             ! [tr21 tr22]
+                                             20 continue
+           smin = max( eps*max( abs( tl( 1, 1 ) ), abs( tr( 1, 1 ) ),abs( tr( 1, 2 ) ), abs( tr( &
+                     2, 1 ) ), abs( tr( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           if( ltranr ) then
+              tmp( 2 ) = sgn*tr( 2, 1 )
+              tmp( 3 ) = sgn*tr( 1, 2 )
+           else
+              tmp( 2 ) = sgn*tr( 1, 2 )
+              tmp( 3 ) = sgn*tr( 2, 1 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 1, 2 )
+           go to 40
+           ! 2 by 1:
+                ! op[tl11 tl12]*[x11] + isgn* [x11]*tr11  = [b11]
+                  ! [tl21 tl22] [x21]         [x21]         [b21]
+                  30 continue
+           smin = max( eps*max( abs( tr( 1, 1 ) ), abs( tl( 1, 1 ) ),abs( tl( 1, 2 ) ), abs( tl( &
+                     2, 1 ) ), abs( tl( 2, 2 ) ) ),smlnum )
+           tmp( 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           tmp( 4 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           if( ltranl ) then
+              tmp( 2 ) = tl( 1, 2 )
+              tmp( 3 ) = tl( 2, 1 )
+           else
+              tmp( 2 ) = tl( 2, 1 )
+              tmp( 3 ) = tl( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           40 continue
+           ! solve 2 by 2 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           ipiv = stdlib_isamax( 4, tmp, 1 )
+           u11 = tmp( ipiv )
+           if( abs( u11 )<=smin ) then
+              info = 1
+              u11 = smin
+           end if
+           u12 = tmp( locu12( ipiv ) )
+           l21 = tmp( locl21( ipiv ) ) / u11
+           u22 = tmp( locu22( ipiv ) ) - u12*l21
+           xswap = xswpiv( ipiv )
+           bswap = bswpiv( ipiv )
+           if( abs( u22 )<=smin ) then
+              info = 1
+              u22 = smin
+           end if
+           if( bswap ) then
+              temp = btmp( 2 )
+              btmp( 2 ) = btmp( 1 ) - l21*temp
+              btmp( 1 ) = temp
+           else
+              btmp( 2 ) = btmp( 2 ) - l21*btmp( 1 )
+           end if
+           scale = one
+           if( ( two*smlnum )*abs( btmp( 2 ) )>abs( u22 ) .or.( two*smlnum )*abs( btmp( 1 ) )>abs(&
+                      u11 ) ) then
+              scale = half / max( abs( btmp( 1 ) ), abs( btmp( 2 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+           end if
+           x2( 2 ) = btmp( 2 ) / u22
+           x2( 1 ) = btmp( 1 ) / u11 - ( u12 / u11 )*x2( 2 )
+           if( xswap ) then
+              temp = x2( 2 )
+              x2( 2 ) = x2( 1 )
+              x2( 1 ) = temp
+           end if
+           x( 1, 1 ) = x2( 1 )
+           if( n1==1 ) then
+              x( 1, 2 ) = x2( 2 )
+              xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+           else
+              x( 2, 1 ) = x2( 2 )
+              xnorm = max( abs( x( 1, 1 ) ), abs( x( 2, 1 ) ) )
+           end if
+           return
+           ! 2 by 2:
+           ! op[tl11 tl12]*[x11 x12] +isgn* [x11 x12]*op[tr11 tr12] = [b11 b12]
+             ! [tl21 tl22] [x21 x22]        [x21 x22]   [tr21 tr22]   [b21 b22]
+           ! solve equivalent 4 by 4 system using complete pivoting.
+           ! set pivots less than smin to smin.
+           50 continue
+           smin = max( abs( tr( 1, 1 ) ), abs( tr( 1, 2 ) ),abs( tr( 2, 1 ) ), abs( tr( 2, 2 ) ) )
+                     
+           smin = max( smin, abs( tl( 1, 1 ) ), abs( tl( 1, 2 ) ),abs( tl( 2, 1 ) ), abs( tl( 2, &
+                     2 ) ) )
+           smin = max( eps*smin, smlnum )
+           btmp( 1 ) = zero
+           call stdlib_scopy( 16, btmp, 0, t16, 1 )
+           t16( 1, 1 ) = tl( 1, 1 ) + sgn*tr( 1, 1 )
+           t16( 2, 2 ) = tl( 2, 2 ) + sgn*tr( 1, 1 )
+           t16( 3, 3 ) = tl( 1, 1 ) + sgn*tr( 2, 2 )
+           t16( 4, 4 ) = tl( 2, 2 ) + sgn*tr( 2, 2 )
+           if( ltranl ) then
+              t16( 1, 2 ) = tl( 2, 1 )
+              t16( 2, 1 ) = tl( 1, 2 )
+              t16( 3, 4 ) = tl( 2, 1 )
+              t16( 4, 3 ) = tl( 1, 2 )
+           else
+              t16( 1, 2 ) = tl( 1, 2 )
+              t16( 2, 1 ) = tl( 2, 1 )
+              t16( 3, 4 ) = tl( 1, 2 )
+              t16( 4, 3 ) = tl( 2, 1 )
+           end if
+           if( ltranr ) then
+              t16( 1, 3 ) = sgn*tr( 1, 2 )
+              t16( 2, 4 ) = sgn*tr( 1, 2 )
+              t16( 3, 1 ) = sgn*tr( 2, 1 )
+              t16( 4, 2 ) = sgn*tr( 2, 1 )
+           else
+              t16( 1, 3 ) = sgn*tr( 2, 1 )
+              t16( 2, 4 ) = sgn*tr( 2, 1 )
+              t16( 3, 1 ) = sgn*tr( 1, 2 )
+              t16( 4, 2 ) = sgn*tr( 1, 2 )
+           end if
+           btmp( 1 ) = b( 1, 1 )
+           btmp( 2 ) = b( 2, 1 )
+           btmp( 3 ) = b( 1, 2 )
+           btmp( 4 ) = b( 2, 2 )
+           ! perform elimination
+           loop_100: do i = 1, 3
+              xmax = zero
+              do ip = i, 4
+                 do jp = i, 4
+                    if( abs( t16( ip, jp ) )>=xmax ) then
+                       xmax = abs( t16( ip, jp ) )
+                       ipsv = ip
+                       jpsv = jp
+                    end if
+                 end do
+              end do
+              if( ipsv/=i ) then
+                 call stdlib_sswap( 4, t16( ipsv, 1 ), 4, t16( i, 1 ), 4 )
+                 temp = btmp( i )
+                 btmp( i ) = btmp( ipsv )
+                 btmp( ipsv ) = temp
+              end if
+              if( jpsv/=i )call stdlib_sswap( 4, t16( 1, jpsv ), 1, t16( 1, i ), 1 )
+              jpiv( i ) = jpsv
+              if( abs( t16( i, i ) )<smin ) then
+                 info = 1
+                 t16( i, i ) = smin
+              end if
+              do j = i + 1, 4
+                 t16( j, i ) = t16( j, i ) / t16( i, i )
+                 btmp( j ) = btmp( j ) - t16( j, i )*btmp( i )
+                 do k = i + 1, 4
+                    t16( j, k ) = t16( j, k ) - t16( j, i )*t16( i, k )
+                 end do
+              end do
+           end do loop_100
+           if( abs( t16( 4, 4 ) )<smin ) then
+              info = 1
+              t16( 4, 4 ) = smin
+           end if
+           scale = one
+           if( ( eight*smlnum )*abs( btmp( 1 ) )>abs( t16( 1, 1 ) ) .or.( eight*smlnum )*abs( &
+           btmp( 2 ) )>abs( t16( 2, 2 ) ) .or.( eight*smlnum )*abs( btmp( 3 ) )>abs( t16( 3, 3 ) )&
+                      .or.( eight*smlnum )*abs( btmp( 4 ) )>abs( t16( 4, 4 ) ) ) then
+              scale = ( one / eight ) / max( abs( btmp( 1 ) ),abs( btmp( 2 ) ), abs( btmp( 3 ) ), &
+                        abs( btmp( 4 ) ) )
+              btmp( 1 ) = btmp( 1 )*scale
+              btmp( 2 ) = btmp( 2 )*scale
+              btmp( 3 ) = btmp( 3 )*scale
+              btmp( 4 ) = btmp( 4 )*scale
+           end if
+           do i = 1, 4
+              k = 5 - i
+              temp = one / t16( k, k )
+              tmp( k ) = btmp( k )*temp
+              do j = k + 1, 4
+                 tmp( k ) = tmp( k ) - ( temp*t16( k, j ) )*tmp( j )
+              end do
+           end do
+           do i = 1, 3
+              if( jpiv( 4-i )/=4-i ) then
+                 temp = tmp( 4-i )
+                 tmp( 4-i ) = tmp( jpiv( 4-i ) )
+                 tmp( jpiv( 4-i ) ) = temp
+              end if
+           end do
+           x( 1, 1 ) = tmp( 1 )
+           x( 2, 1 ) = tmp( 2 )
+           x( 1, 2 ) = tmp( 3 )
+           x( 2, 2 ) = tmp( 4 )
+           xnorm = max( abs( tmp( 1 ) )+abs( tmp( 3 ) ),abs( tmp( 2 ) )+abs( tmp( 4 ) ) )
+           return
+     end subroutine stdlib_slasy2
+
+     !> SLASYF: computes a partial factorization of a real symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> SLASYF is an auxiliary routine called by SSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_slasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(sp) :: absakk, alpha, colmax, d11, d21, d22, r1, rowmax, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_scopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ), lda,w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_scopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_scopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( &
+                              imax, kw+1 ), ldw, one,w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_isamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = abs( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_isamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_scopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_scopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_scopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_sswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_sswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_scopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = one / a( k, k )
+                    call stdlib_sscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k, -one,a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw, one,a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_sswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_scopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ), lda,w( k, 1 ), ldw, &
+                        one, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_scopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_scopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, one, w( k, k+1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_isamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = abs( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_isamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, abs( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_scopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_scopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_scopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_sswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_sswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_scopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = one / a( k, k )
+                       call stdlib_sscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_sswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_slasyf
+
+     !> SLASYF_RK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> SLASYF_RK is an auxiliary routine called by SSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_slasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, stemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_scopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_scopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_scopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_scopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_isamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_isamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = abs( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_scopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_scopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_scopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_scopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_sswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_sswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_scopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_scopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_sswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_sswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_scopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_sscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = zero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, one, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_scopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_scopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_scopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_scopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_isamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_isamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = abs( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_scopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_scopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_scopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_scopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_sswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_sswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_scopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_scopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_sswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_sswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_scopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_sscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = zero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ),ldw, one, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_slasyf_rk
+
+     !> SLASYF_ROOK: computes a partial factorization of a real symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> SLASYF_ROOK is an auxiliary routine called by SSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_slasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, stemp, r1, rowmax, t, &
+                     sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_scopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one, a( 1, k+1 ),lda, w( k, kw+&
+                        1 ), ldw, one, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, w( 1, kw ), 1 )
+                 colmax = abs( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_scopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_scopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_scopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_sgemv( 'NO TRANSPOSE', k, n-k, -one,a( 1, k+1 ), lda, &
+                                 w( imax, kw+1 ), ldw,one, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_isamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = abs( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_isamax( imax-1, w( 1, kw-1 ), 1 )
+                          stemp = abs( w( itemp, kw-1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(abs( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_scopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_scopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_scopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_scopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_sswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_sswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_scopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_scopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_sswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_sswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_scopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_sscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', jj-j+1, n-k, -one,a( j, k+1 ), lda, w( jj, &
+                              kw+1 ), ldw, one,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -one, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,one, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_sswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_sswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_scopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, one, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, w( k+1, k ), 1 )
+                 colmax = abs( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_scopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_scopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_scopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_sgemv( 'NO TRANSPOSE', n-k+1, k-1, -one,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,one, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_isamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = abs( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_isamax( n-imax, w( imax+1, k+1 ), 1)
+                          stemp = abs( w( itemp, k+1 ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! abs( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_scopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_scopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_scopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_scopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_sswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_sswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_scopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_scopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_sswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_sswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_scopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          r1 = one / a( k, k )
+                          call stdlib_sscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=zero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_sgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -one,a( jj, 1 ), lda, w( jj, &
+                              1 ), ldw, one,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           one, a( j+jb, 1 ), lda, w( j, 1 ), ldw,one, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_sswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_sswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_slasyf_rook
+
+     !> SLATBS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine STBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_slatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(sp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_sasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_sasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_stbsv can be used.
+           j = stdlib_isamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ab( maind, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_stbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_sscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_100: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 95
+                    end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       95 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_sscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_saxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_isamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_saxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_isamax( n-j, x( j+1 ), 1 )
+                       xmax = abs( x( i ) )
+                    end if
+                 end do loop_100
+              else
+                 ! solve a**t * x = b
+                 loop_140: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = abs( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = uscal / tjjs
+                          end if
+                       if( rec<one ) then
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_sdot to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          sumj = stdlib_sdot( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>0 )sumj = stdlib_sdot( jlen, ab( 2, j ), 1, x( j+1 ), 1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             sumj = sumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 135
+                       end if
+                          tjj = abs( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_sscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          135 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_140
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_slatbs
+
+     !> SLATPS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, x and b are n-element vectors, and s is a scaling
+     !> factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> STPSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_slatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(sp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_sasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_sasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_stpsv can be used.
+           j = stdlib_isamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( ap( ip ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_stpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_sscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_100: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 95
+                    end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       95 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_sscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_saxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_isamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_saxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_isamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_100
+              else
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_140: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = abs( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = uscal / tjjs
+                          end if
+                       if( rec<one ) then
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_sdot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_sdot( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_sdot( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             sumj = sumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 135
+                       end if
+                          tjj = abs( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_sscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          135 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_140
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_slatps
+
+     !> SLATRS: solves one of the triangular systems
+     !> A *x = s*b  or  A**T*x = s*b
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, x and b are
+     !> n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine STRSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_slatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: cnorm(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(sp) :: bignum, grow, rec, smlnum, sumj, tjj, tjjs, tmax, tscal, uscal, xbnd, xj, &
+                     xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' ) / stdlib_slamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_sasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_sasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum.
+           imax = stdlib_isamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum ) then
+              tscal = one
+           else
+              tscal = one / ( smlnum*tmax )
+              call stdlib_sscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_strsv can be used.
+           j = stdlib_isamax( n, x, 1 )
+           xmax = abs( x( j ) )
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 50
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! m(j) = g(j-1) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    xbnd = min( xbnd, min( one, tjj )*grow )
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 50
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              50 continue
+           else
+              ! compute the growth in a**t * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 80
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = one / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                    tjj = abs( a( j, j ) )
+                    if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, one / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 80
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              80 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_strsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = bignum / xmax
+                 call stdlib_sscal( n, scale, x, 1 )
+                 xmax = bignum
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_100: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = abs( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 95
+                    end if
+                       tjj = abs( tjjs )
+                       if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                                rec = one / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                             rec = ( tjj*bignum ) / xj
+                             if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                                rec = rec / cnorm( j )
+                             end if
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = x( j ) / tjjs
+                          xj = abs( x( j ) )
+                       else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          xj = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       95 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_sscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_saxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_isamax( j-1, x, 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_saxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_isamax( n-j, x( j+1 ), 1 )
+                          xmax = abs( x( i ) )
+                       end if
+                    end if
+                 end do loop_100
+              else
+                 ! solve a**t * x = b
+                 loop_140: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = abs( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                          tjj = abs( tjjs )
+                          if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                             rec = min( one, rec*tjj )
+                             uscal = uscal / tjjs
+                          end if
+                       if( rec<one ) then
+                          call stdlib_sscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    sumj = zero
+                    if( uscal==one ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_sdot to perform the dot product.
+                       if( upper ) then
+                          sumj = stdlib_sdot( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          sumj = stdlib_sdot( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             sumj = sumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==tscal ) then
+                       ! compute x(j) := ( x(j) - sumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - sumj
+                       xj = abs( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 135
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjj = abs( tjjs )
+                          if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                             if( tjj<one ) then
+                                if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                   rec = one / xj
+                                   call stdlib_sscal( n, rec, x, 1 )
+                                   scale = scale*rec
+                                   xmax = xmax*rec
+                                end if
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                             if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                                rec = ( tjj*bignum ) / xj
+                                call stdlib_sscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                             x( j ) = x( j ) / tjjs
+                          else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0, and compute a solution to a**t*x = 0.
+                             do i = 1, n
+                                x( i ) = zero
+                             end do
+                             x( j ) = one
+                             scale = zero
+                             xmax = zero
+                          end if
+                          135 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j)  - sumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = x( j ) / tjjs - sumj
+                    end if
+                    xmax = max( xmax, abs( x( j ) ) )
+                 end do loop_140
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_sscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_slatrs
+
+     !> SLAUU2: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_slauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**t.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_sdot( n-i+1, a( i, i ), lda, a( i, i ), lda )
+                    call stdlib_sgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, aii, a( 1, i ), 1 )
+                 else
+                    call stdlib_sscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**t * l.
+              do i = 1, n
+                 aii = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = stdlib_sdot( n-i+1, a( i, i ), 1, a( i, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, aii, a( i, 1 ), lda )
+                 else
+                    call stdlib_sscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_slauu2
+
+     !> SLAUUM: computes the product U * U**T or L**T * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_slauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'SLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_slauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**t.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, ib, one, a( &
+                              i, i ), lda, a( 1, i ),lda )
+                    call stdlib_slauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', i-1, ib,n-i-ib+1, one, a( &
+                                 1, i+ib ), lda,a( i, i+ib ), lda, one, a( 1, i ), lda )
+                       call stdlib_ssyrk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**t * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_strmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', ib,i-1, one, a( &
+                              i, i ), lda, a( i, 1 ), lda )
+                    call stdlib_slauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', ib, i-1,n-i-ib+1, one, a( &
+                                 i+ib, i ), lda,a( i+ib, 1 ), lda, one, a( i, 1 ), lda )
+                       call stdlib_ssyrk( 'LOWER', 'TRANSPOSE', ib, n-i-ib+1, one,a( i+ib, i ), &
+                                 lda, one, a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_slauum
+
+     !> SORBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_sorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: alphasq = 0.01_sp
+           real(sp), parameter :: realone = 1.0_sp
+           real(sp), parameter :: realzero = 0.0_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_slassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_slassq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_sgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_sgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_sgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_sgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_slassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_slassq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is zero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == zero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = zero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = zero
+              end do
+           else
+              call stdlib_sgemv( 'C', m1, n, one, q1, ldq1, x1, incx1, zero, work,1 )
+           end if
+           call stdlib_sgemv( 'C', m2, n, one, q2, ldq2, x2, incx2, one, work, 1 )
+           call stdlib_sgemv( 'N', m1, n, negone, q1, ldq1, work, 1, one, x1,incx1 )
+           call stdlib_sgemv( 'N', m2, n, negone, q2, ldq2, work, 1, one, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_slassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_slassq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to zero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = zero
+              end do
+              do i = 1, m2
+                 x2(i) = zero
+              end do
+           end if
+           return
+     end subroutine stdlib_sorbdb6
+
+     !> SORG2L: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by SGEQLF.
+
+     pure subroutine stdlib_sorg2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( m-n+j, j ) = one
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = one
+              call stdlib_slarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_sscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = one - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to zero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_sorg2l
+
+     !> SORG2R: generates an m by n real matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by SGEQRF.
+
+     pure subroutine stdlib_sorg2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = one
+                 call stdlib_slarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_sscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = one - tau( i )
+              ! set a(1:i-1,i) to zero
+              do l = 1, i - 1
+                 a( l, i ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_sorg2r
+
+     !> SORGL2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by SGELQF.
+
+     pure subroutine stdlib_sorgl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = zero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = one
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the right
+              if( i<n ) then
+                 if( i<m ) then
+                    a( i, i ) = one
+                    call stdlib_slarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,tau( i ), a( i+1, i ), &
+                              lda, work )
+                 end if
+                 call stdlib_sscal( n-i, -tau( i ), a( i, i+1 ), lda )
+              end if
+              a( i, i ) = one - tau( i )
+              ! set a(i,1:i-1) to zero
+              do l = 1, i - 1
+                 a( i, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_sorgl2
+
+     !> SORGLQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by SGELQF.
+
+     pure subroutine stdlib_sorglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SORGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SORGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SORGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to zero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_sorgl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_slarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**t to a(i+ib:m,i:n) from the right
+                    call stdlib_slarfb( 'RIGHT', 'TRANSPOSE', 'FORWARD', 'ROWWISE',m-i-ib+1, n-i+&
+                    1, ib, a( i, i ), lda, work,ldwork, a( i+ib, i ), lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**t to columns i:n of current block
+                 call stdlib_sorgl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to zero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sorglq
+
+     !> SORGQL: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by SGEQLF.
+
+     pure subroutine stdlib_sorgql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'SORGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SORGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SORGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to zero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_sorg2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_slarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_slarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_sorg2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to zero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sorgql
+
+     !> SORGQR: generates an M-by-N real matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by SGEQRF.
+
+     pure subroutine stdlib_sorgqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SORGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SORGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SORGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to zero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_sorg2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_slarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_slarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_sorg2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to zero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sorgqr
+
+     !> SORGR2: generates an m by n real matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by SGERQF.
+
+     pure subroutine stdlib_sorgr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = zero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = one
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the right
+              a( ii, n-m+ii ) = one
+              call stdlib_slarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda, tau( i ),a, lda, work )
+                        
+              call stdlib_sscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = one - tau( i )
+              ! set a(m-k+i,n-k+i+1:n) to zero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = zero
+              end do
+           end do
+           return
+     end subroutine stdlib_sorgr2
+
+     !> SORGRQ: generates an M-by-N real matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by SGERQF.
+
+     pure subroutine stdlib_sorgrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'SORGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SORGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SORGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to zero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = zero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_sorgr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_slarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_slarfb( 'RIGHT', 'TRANSPOSE', 'BACKWARD', 'ROWWISE',ii-1, n-k+i+&
+                              ib-1, ib, a( ii, 1 ), lda, work,ldwork, a, lda, work( ib+1 ), ldwork )
+                 end if
+                 ! apply h**t to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_sorgr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to zero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = zero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sorgrq
+
+     !> SORGTSQR_ROW: generates an M-by-N real matrix Q_out with
+     !> orthonormal columns from the output of SLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by SLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of SLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine SLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which SLATSQR generates the output blocks.
+
+     pure subroutine stdlib_sorgtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: t(ldt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           real(sp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_slaset('U', m, n, zero, one, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_slarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_slarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_slarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_slarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_sorgtsqr_row
+
+
+     pure subroutine stdlib_sorm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: q(ldq,*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = real( lwkopt,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_strmm.
+           if( n1==0 ) then
+              call stdlib_strmm( side, 'UPPER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           else if( n2==0 ) then
+              call stdlib_strmm( side, 'LOWER', trans, 'NON-UNIT', m, n, one,q, ldq, c, ldc )
+                        
+              work( 1 ) = one
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_slacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_strmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, one, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q, ldq, c(&
+                               1, i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_slacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_strmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, one, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,one, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_slacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**t.
+                    call stdlib_slacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_strmm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',n2, len, one, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**t.
+                    call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,one, q, ldq, c( 1,&
+                               i ), ldc, one, work,ldwork )
+                    ! multiply top part of c by q12**t.
+                    call stdlib_slacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_strmm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n1, len, one, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**t.
+                    call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,one, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,one, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_slacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_slacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, one, &
+                              q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,one, c( i, 1 ),&
+                               ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_slacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, one, &
+                              q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,one, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_slacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**t.
+                    call stdlib_slacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',len, n1, one, q( &
+                              1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**t.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n1, n2,one, c( i, 1 ), &
+                              ldc, q, ldq, one, work,ldwork )
+                    ! multiply left part of c by q21**t.
+                    call stdlib_slacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_strmm( 'RIGHT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',len, n2, one, q( &
+                              n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**t.
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', len, n2, n1,one, c( i, n2+1 ),&
+                               ldc, q( n1+1, n2+1 ), ldq,one, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_slacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = real( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_sorm22
+
+     !> SORM2L: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T * C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by SGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sorm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = one
+              call stdlib_slarf( side, mi, ni, a( 1, i ), 1, tau( i ), c, ldc,work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_sorm2l
+
+     !> SORM2R: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by SGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sorm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_slarf( side, mi, ni, a( i, i ), 1, tau( i ), c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_sorm2r
+
+     !> SORML2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by SGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sorml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran ) .or. ( .not.left .and. .not.notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i)
+              aii = a( i, i )
+              a( i, i ) = one
+              call stdlib_slarf( side, mi, ni, a( i, i ), lda, tau( i ),c( ic, jc ), ldc, work )
+                        
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_sorml2
+
+     !> SORMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by SGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'SORMLQ', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SORMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_sorml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_slarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_slarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormlq
+
+     !> SORMQL: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by SGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+           ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'SORMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SORMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_sorm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_slarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_slarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormql
+
+     !> SORMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by SGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'SORMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SORMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_sorm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_slarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_slarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormqr
+
+     !> SORMR2: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'T', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'T',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by SGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran ) .or. ( .not.left .and. notran ) )then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i)
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = one
+              call stdlib_slarf( side, mi, ni, a( i, 1 ), lda, tau( i ), c, ldc,work )
+              a( i, nq-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_sormr2
+
+     !> SORMR3: overwrites the general real m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**T* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**T if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by STZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), tau(*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**t is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**t is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**t
+              call stdlib_slarz( side, mi, ni, l, a( i, ja ), lda, tau( i ),c( ic, jc ), ldc, &
+                        work )
+           end do
+           return
+     end subroutine stdlib_sormr3
+
+     !> SORMRQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by SGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+           ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'SORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_sormr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_slarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**t is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**t
+                 call stdlib_slarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormrq
+
+     !> SORMRZ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by STZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_sormrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'SORMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SORMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_sormr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_slarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**t is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**t is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**t
+                 call stdlib_slarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormrz
+
+     !> SPBEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_spbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ab( j, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = ab( j, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_spbequ
+
+     !> SPBSTF: computes a split Cholesky factorization of a real
+     !> symmetric positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with SSBGST.
+     !> The factorization has the form  A = S**T*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_spbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_sscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_ssyr( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_sscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_ssyr( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**t*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_sscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_ssyr( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**t*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 50
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_sscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_ssyr( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_spbstf
+
+     !> SPBTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**T is the transpose of U, and
+     !> L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_spbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ab( kd+1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_sscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_ssyr( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ab( 1, j )
+                 if( ajj<=zero )go to 30
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_sscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_ssyr( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_spbtf2
+
+     !> SPBTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by SPBTRF.
+
+     pure subroutine stdlib_spbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              do j = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_stbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_stbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**t.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_stbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**t *x = b, overwriting b with x.
+                 call stdlib_stbsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_spbtrs
+
+     !> SPOEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_spoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_spoequ
+
+     !> SPOEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from SPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_spoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_slamch( 'B' )
+           tmp = -0.5_sp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = a( 1, 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = a( i, i )
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_spoequb
+
+     !> SPOTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by SPOTRF.
+
+     pure subroutine stdlib_spotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t *u.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a*x = b where a = l*l**t.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+           end if
+           return
+     end subroutine stdlib_spotrs
+
+     !> SPPEQU: computes row and column scalings intended to equilibrate a
+     !> symmetric positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_sppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(sp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = ap( 1 )
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = ap( jj )
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_sppequ
+
+     !> SPPTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_spptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t*u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_stpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', j-1, ap,ap( jc ), &
+                           1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj ) - stdlib_sdot( j-1, ap( jc ), 1, ap( jc ), 1 )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = ap( jj )
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_sscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_sspr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_spptrf
+
+     !> SPPTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**T*U or A = L*L**T computed by SPPTRF.
+
+     pure subroutine stdlib_spptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**t * u.
+              do i = 1, nrhs
+                 ! solve u**t *x = b, overwriting b with x.
+                 call stdlib_stpsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_stpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**t.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_stpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**t *x = y, overwriting b with x.
+                 call stdlib_stpsv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_spptrs
+
+     !> SPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**T or A = U**T*D*U computed by
+     !> SPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_sptcon( n, d, e, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), e(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(sp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+           ! solve m(l) * x = e.
+           work( 1 ) = one
+           do i = 2, n
+              work( i ) = one + work( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**t * x = b.
+           work( n ) = work( n ) / d( n )
+           do i = n - 1, 1, -1
+              work( i ) = work( i ) / d( i ) + work( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_isamax( n, work, 1 )
+           ainvnm = abs( work( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_sptcon
+
+     !> SPTTRF: computes the L*D*L**T factorization of a real symmetric
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**T*D*U.
+
+     pure subroutine stdlib_spttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(sp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+           end do
+           loop_20: do i = i4 + 1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ! solve for e(i) and d(i+1).
+              ei = e( i )
+              e( i ) = ei / d( i )
+              d( i+1 ) = d( i+1 ) - e( i )*ei
+              if( d( i+1 )<=zero ) then
+                 info = i + 1
+                 go to 30
+              end if
+              ! solve for e(i+1) and d(i+2).
+              ei = e( i+1 )
+              e( i+1 ) = ei / d( i+1 )
+              d( i+2 ) = d( i+2 ) - e( i+1 )*ei
+              if( d( i+2 )<=zero ) then
+                 info = i + 2
+                 go to 30
+              end if
+              ! solve for e(i+2) and d(i+3).
+              ei = e( i+2 )
+              e( i+2 ) = ei / d( i+2 )
+              d( i+3 ) = d( i+3 ) - e( i+2 )*ei
+              if( d( i+3 )<=zero ) then
+                 info = i + 3
+                 go to 30
+              end if
+              ! solve for e(i+3) and d(i+4).
+              ei = e( i+3 )
+              e( i+3 ) = ei / d( i+3 )
+              d( i+4 ) = d( i+4 ) - e( i+3 )*ei
+           end do loop_20
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           30 continue
+           return
+     end subroutine stdlib_spttrf
+
+     !> SPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by SPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_sptts2( n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_sscal( nrhs, 1. / d( 1 ), b, ldb )
+              return
+           end if
+           ! solve a * x = b using the factorization a = l*d*l**t,
+           ! overwriting each right hand side vector with its solution.
+           do j = 1, nrhs
+                 ! solve l * x = b.
+              do i = 2, n
+                 b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+              end do
+                 ! solve d * l**t * x = b.
+              b( n, j ) = b( n, j ) / d( n )
+              do i = n - 1, 1, -1
+                 b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+              end do
+           end do
+           return
+     end subroutine stdlib_sptts2
+
+     !> SRSCL: multiplies an n-element real vector x by the real scalar 1/a.
+     !> This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_srscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(sp), intent(in) :: sa
+           ! Array Arguments 
+           real(sp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(sp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_sscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_srscl
+
+     !> SSBGST: reduces a real symmetric-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**T*S by SPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**T*A*X, where
+     !> X = S**(-1)*Q and Q is an orthogonal matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_ssbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(in) :: bb(ldbb,*)
+           real(sp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(sp) :: bii, ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_slaset( 'FULL', n, n, zero, one, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_spbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**t*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**t*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the cosines and sines of the rotations are stored in the array
+           ! work. the cosines of the 1st set of rotations are stored in
+           ! elements n+2:n+m-kb-1 and the sines of the 1st set in elements
+           ! 2:m-kb-1; the cosines of the 2nd set are stored in elements
+           ! n+m-kb+1:2*n and the sines of the second set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*ab( k-i+ka1, i ) -bb(&
+                        k-i+kb1, i )*ab( j-i+ka1, i ) +ab( ka1, i )*bb( j-i+kb1, i )*bb( k-i+kb1, &
+                                  i )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( k-i+kb1, i )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_sscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_sger( n-m, kbt, -one, x( m+1, i ), 1,bb( kb1-kbt, i ), &
+                              1, x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_slartg( ab( k+1, i-k+ka ), ra1,work( n+i-k+ka-m ), work( i-k+&
+                                 ka-m ),ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_slargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,work( &
+                           n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2-m ),work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_srot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, work( n+j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = work( n+j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_slargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,work( n+j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, work(&
+                                  n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              work( n+j2 ),work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_srot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca,work( n+j2-m ), work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*ab( i-k+1, k ) -bb( i-k+1, &
+                                 k )*ab( i-j+1, j ) +ab( 1, i )*bb( i-j+1, j )*bb( i-k+1, k )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-k+1, k )*ab( i-j+1, j )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_sscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_sger( n-m, kbt, -one, x( m+1, i ), 1,bb( kbt+1, i-kbt )&
+                              , ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_slartg( ab( ka1-k, i ), ra1, work( n+i-k+ka-m ),work( i-k+ka-m &
+                                 ), ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = work( n+i-k+ka-m )*t -work( i-k+ka-m )*ab( ka1, i-k )
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +work( n+i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_slargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           work( n+j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2-m ), work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_srot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j-m ), &
+                                 work( j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,work( n+j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    work( n+j ) = work( n+j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = work( n+j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_slargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,work( n+j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, work( &
+                                 n+j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, work( n+&
+                              j2 ), work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_srot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,work( n+j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, work( n+j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, i - kb + 2*ka + 1, -1
+                    work( n+j-m ) = work( n+j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( kb1, i )
+                 do j = i1, i
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*ab( i-k+ka1, k ) -bb(&
+                        i-k+kb1, k )*ab( i-j+ka1, j ) +ab( ka1, i )*bb( i-j+kb1, j )*bb( i-k+kb1, &
+                                  k )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-k+kb1, k )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_sscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_sger( nx, kbt, -one, x( 1, i ), 1, bb( kb, i+1 ),ldbb-&
+                              1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_slartg( ab( k+1, i ), ra1, work( n+i+k-ka ),work( i+k-ka ), ra &
+                                 )
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( 1, i+k )
+                       ab( 1, i+k ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_slargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,work( &
+                           n+j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 work( n+j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_srot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = work( n+m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_slargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca,&
+                                 work( n+m-kb+j1 ), work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              work( n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_srot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               work( n+j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**t * a * inv(s(i))
+                 bii = bb( 1, i )
+                 do j = i1, i
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*ab( k-i+1, i ) -bb( k-i+1, &
+                                 i )*ab( j-i+1, i ) +ab( 1, i )*bb( j-i+1, i )*bb( k-i+1, i )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( k-i+1, i )*ab( j-i+1, i )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_sscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_sger( nx, kbt, -one, x( 1, i ), 1, bb( 2, i ), 1,x( 1, &
+                              i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_slartg( ab( ka1-k, i+k-ka ), ra1,work( n+i+k-ka ), work( i+k-&
+                                 ka ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = work( n+i+k-ka )*t -work( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +work( n+i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_slargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,work( n+&
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+j1 ),work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_srot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,work( n+m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    work( n+m-kb+j ) = work( n+m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = work( n+m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_slargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, work( n+m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_slartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, work( n+&
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_slar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, work( &
+                              n+m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_srot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,work( n+m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_slartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,work( n+j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, min( i+kb, m ) - 2*ka - 1
+                    work( n+j ) = work( n+j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_ssbgst
+
+     !> SSBTRD: reduces a real symmetric band matrix A to symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_ssbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           real(sp), intent(out) :: d(*), e(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_slaset( 'FULL', n, n, zero, one, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the cosines and sines of the plane rotations are stored in the
+           ! arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_slargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_slartv or stdlib_srot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_slartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_srot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_slartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_srot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_slar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_slartv or stdlib_srot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_slartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_srot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_srot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_srot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_srot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( kd, i+1 )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( kd1, i )
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to tridiagonal form, working with lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_slargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_slartv or stdlib_srot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_slartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_srot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_slartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_srot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_slar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_slartv or stdlib_srot is used
+                       if( nr>0 ) then
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_slartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_srot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_srot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_srot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_srot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! copy off-diagonal elements to e
+                 do i = 1, n - 1
+                    e( i ) = ab( 2, i )
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_ssbtrd
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> SSFRK: performs one of the symmetric rank--k operations
+     !> C := alpha*A*A**T + beta*C,
+     !> or
+     !> C := alpha*A**T*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n symmetric
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_ssfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_ssyrk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = zero
+              end do
+              return
+           end if
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_ssyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_ssyrk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_sgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_ssyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_ssyrk( 'U', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_sgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_ssyrk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_ssyrk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_sgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, beta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_ssyrk( 'L', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_ssyrk( 'U', 'T', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_sgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, beta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_ssyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_ssyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_sgemm( 'N', 'T', n1, n2, k, alpha, a( 1, 1 ),lda, a( n1+1, 1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_ssyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_ssyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_sgemm( 'T', 'N', n1, n2, k, alpha, a( 1, 1 ),lda, a( 1, n1+1 ),&
+                                  lda, beta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_ssyrk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_ssyrk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_sgemm( 'N', 'T', n2, n1, k, alpha, a( n1+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_ssyrk( 'U', 'T', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_ssyrk( 'L', 'T', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_sgemm( 'T', 'N', n2, n1, k, alpha, a( 1, n1+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_ssyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_ssyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_sgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 't'
+                       call stdlib_ssyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_ssyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_sgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_ssyrk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_ssyrk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_sgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 't'
+                       call stdlib_ssyrk( 'L', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_ssyrk( 'U', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_sgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'l', and trans = 'n'
+                       call stdlib_ssyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_ssyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_sgemm( 'N', 'T', nk, nk, k, alpha, a( 1, 1 ),lda, a( nk+1, 1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'l', and trans = 't'
+                       call stdlib_ssyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_ssyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_sgemm( 'T', 'N', nk, nk, k, alpha, a( 1, 1 ),lda, a( 1, nk+1 ),&
+                                  lda, beta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 't', uplo = 'u', and trans = 'n'
+                       call stdlib_ssyrk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_ssyrk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_sgemm( 'N', 'T', nk, nk, k, alpha, a( nk+1, 1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 't', uplo = 'u', and trans = 't'
+                       call stdlib_ssyrk( 'U', 'T', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_ssyrk( 'L', 'T', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_sgemm( 'T', 'N', nk, nk, k, alpha, a( 1, nk+1 ),lda, a( 1, 1 ),&
+                                  lda, beta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ssfrk
+
+     !> SSPGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by SPPTRF.
+
+     pure subroutine stdlib_sspgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(sp) :: ajj, akk, bjj, bkk, ct
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    bjj = bp( jj )
+                    call stdlib_stpsv( uplo, 'TRANSPOSE', 'NONUNIT', j, bp,ap( j1 ), 1 )
+                    call stdlib_sspmv( uplo, j-1, -one, ap, bp( j1 ), 1, one,ap( j1 ), 1 )
+                    call stdlib_sscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_sdot( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_sscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_saxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_sspr2( uplo, n-k, -one, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 )&
+                                  )
+                       call stdlib_saxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_stpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = ap( kk )
+                    bkk = bp( kk )
+                    call stdlib_stpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_saxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_sspr2( uplo, k-1, one, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_saxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_sscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = ap( jj )
+                    bjj = bp( jj )
+                    ap( jj ) = ajj*bjj + stdlib_sdot( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_sscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_sspmv( uplo, n-j, one, ap( j1j1 ), bp( jj+1 ), 1,one, ap( jj+1 ), &
+                              1 )
+                    call stdlib_stpmv( uplo, 'TRANSPOSE', 'NON-UNIT', n-j+1,bp( jj ), ap( jj ), 1 &
+                              )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sspgst
+
+     !> SSPTRF: computes the factorization of a real symmetric matrix A stored
+     !> in packed format using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_ssptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, ap( kc ), 1 )
+                 colmax = abs( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_isamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_sswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / ap( kc+k-1 )
+                    call stdlib_sspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_sscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, ap( kc+1 ), 1 )
+                 colmax = abs( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( abs( ap( kx ) )>rowmax ) then
+                          rowmax = abs( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_isamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, abs( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_sswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = one / ap( kc )
+                       call stdlib_sspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_sscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_ssptrf
+
+     !> SSPTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by SSPTRF.
+
+     pure subroutine stdlib_ssptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==zero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_sdot( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, k-1, -one, ap, work, 1, zero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_sdot( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_sdot( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_scopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, k-1, -one, ap, work, 1, zero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_sdot( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_sswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, n-k, -one, ap( kc+n-k+1 ), work, 1,zero, ap( kc+1 ), &
+                              1 )
+                    ap( kc ) = ap( kc ) - stdlib_sdot( n-k, work, 1, ap( kc+1 ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( kc+&
+                              1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_sdot( n-k, work, 1, ap( kc+1 ), 1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_sdot( n-k, ap( kc+1 ), 1,ap( kcnext+2 &
+                              ), 1 )
+                    call stdlib_scopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_sspmv( uplo, n-k, -one, ap( kc+( n-k+1 ) ), work, 1,zero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_sdot( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_sswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_ssptri
+
+     !> SSPTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by SSPTRF.
+
+     pure subroutine stdlib_ssptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_sger( k-1, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_sswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_sger( k-2, nrhs, -one, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_sger( k-2, nrhs, -one, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, 1 &
+                           ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, ap( kc ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,ap( kc+k ), 1, one, b( k+&
+                           1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sger( n-k, nrhs, -one, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+1,&
+                            1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_sswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_sger( n-k-1, nrhs, -one, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, 1 )&
+                              , ldb )
+                    call stdlib_sger( n-k-1, nrhs, -one, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc+1 ), &
+                              1, one, b( k, 1 ), ldb )
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_ssptrs
+
+     !> SSTEBZ: computes the eigenvalues of a symmetric tridiagonal
+     !> matrix T.  The user may ask for all eigenvalues, all eigenvalues
+     !> in the half-open interval (VL, VU], or the IL-th through IU-th
+     !> eigenvalues.
+     !> To avoid overflow, the matrix must be scaled so that its
+     !> largest element is no greater than overflow**(1/2) * underflow**(1/4) in absolute value, and for greatest
+     !> accuracy, it should not be much smaller than that.
+     !> See W. Kahan "Accurate Eigenvalues of a Symmetric Tridiagonal
+     !> Matrix", Report CS41, Computer Science Dept., Stanford
+     !> University, July 21, 1966.
+
+     pure subroutine stdlib_sstebz( range, order, n, vl, vu, il, iu, abstol, d, e,m, nsplit, w, &
+               iblock, isplit, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: order, range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*)
+           real(sp), intent(in) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: fudge = 2.1_sp
+           real(sp), parameter :: relfac = 2.0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ncnvrg, toofew
+           integer(ilp) :: ib, ibegin, idiscl, idiscu, ie, iend, iinfo, im, in, ioff, iorder, &
+                     iout, irange, itmax, itmp1, iw, iwoff, j, jb, jdisc, je, nb, nwl, nwu
+           real(sp) :: atoli, bnorm, gl, gu, pivmin, rtoli, safemn, tmp1, tmp2, tnorm, ulp, wkill,&
+                      wl, wlu, wu, wul
+           ! Local Arrays 
+           integer(ilp) :: idumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = 1
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = 2
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = 3
+           else
+              irange = 0
+           end if
+           ! decode order
+           if( stdlib_lsame( order, 'B' ) ) then
+              iorder = 2
+           else if( stdlib_lsame( order, 'E' ) ) then
+              iorder = 1
+           else
+              iorder = 0
+           end if
+           ! check for errors
+           if( irange<=0 ) then
+              info = -1
+           else if( iorder<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( irange==2 ) then
+              if( vl>=vu ) info = -5
+           else if( irange==3 .and. ( il<1 .or. il>max( 1, n ) ) )then
+              info = -6
+           else if( irange==3 .and. ( iu<min( n, il ) .or. iu>n ) )then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEBZ', -info )
+              return
+           end if
+           ! initialize error flags
+           info = 0
+           ncnvrg = .false.
+           toofew = .false.
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! simplifications:
+           if( irange==3 .and. il==1 .and. iu==n )irange = 1
+           ! get machine constants
+           ! nb is the minimum vector length for vector bisection, or 0
+           ! if only scalar is to be done.
+           safemn = stdlib_slamch( 'S' )
+           ulp = stdlib_slamch( 'P' )
+           rtoli = ulp*relfac
+           nb = stdlib_ilaenv( 1, 'SSTEBZ', ' ', n, -1, -1, -1 )
+           if( nb<=1 )nb = 0
+           ! special case when n=1
+           if( n==1 ) then
+              nsplit = 1
+              isplit( 1 ) = 1
+              if( irange==2 .and. ( vl>=d( 1 ) .or. vu<d( 1 ) ) ) then
+                 m = 0
+              else
+                 w( 1 ) = d( 1 )
+                 iblock( 1 ) = 1
+                 m = 1
+              end if
+              return
+           end if
+           ! compute splitting points
+           nsplit = 1
+           work( n ) = zero
+           pivmin = one
+           do j = 2, n
+              tmp1 = e( j-1 )**2
+              if( abs( d( j )*d( j-1 ) )*ulp**2+safemn>tmp1 ) then
+                 isplit( nsplit ) = j - 1
+                 nsplit = nsplit + 1
+                 work( j-1 ) = zero
+              else
+                 work( j-1 ) = tmp1
+                 pivmin = max( pivmin, tmp1 )
+              end if
+           end do
+           isplit( nsplit ) = n
+           pivmin = pivmin*safemn
+           ! compute interval and atoli
+           if( irange==3 ) then
+              ! range='i': compute the interval containing eigenvalues
+                         ! il through iu.
+              ! compute gershgorin interval for entire (split) matrix
+              ! and use it as the initial interval
+              gu = d( 1 )
+              gl = d( 1 )
+              tmp1 = zero
+              do j = 1, n - 1
+                 tmp2 = sqrt( work( j ) )
+                 gu = max( gu, d( j )+tmp1+tmp2 )
+                 gl = min( gl, d( j )-tmp1-tmp2 )
+                 tmp1 = tmp2
+              end do
+              gu = max( gu, d( n )+tmp1 )
+              gl = min( gl, d( n )-tmp1 )
+              tnorm = max( abs( gl ), abs( gu ) )
+              gl = gl - fudge*tnorm*ulp*n - fudge*two*pivmin
+              gu = gu + fudge*tnorm*ulp*n + fudge*pivmin
+              ! compute iteration parameters
+              itmax = int( ( log( tnorm+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              work( n+1 ) = gl
+              work( n+2 ) = gl
+              work( n+3 ) = gu
+              work( n+4 ) = gu
+              work( n+5 ) = gl
+              work( n+6 ) = gu
+              iwork( 1 ) = -1
+              iwork( 2 ) = -1
+              iwork( 3 ) = n + 1
+              iwork( 4 ) = n + 1
+              iwork( 5 ) = il - 1
+              iwork( 6 ) = iu
+              call stdlib_slaebz( 3, itmax, n, 2, 2, nb, atoli, rtoli, pivmin, d, e,work, iwork( &
+                        5 ), work( n+1 ), work( n+5 ), iout,iwork, w, iblock, iinfo )
+              if( iwork( 6 )==iu ) then
+                 wl = work( n+1 )
+                 wlu = work( n+3 )
+                 nwl = iwork( 1 )
+                 wu = work( n+4 )
+                 wul = work( n+2 )
+                 nwu = iwork( 4 )
+              else
+                 wl = work( n+2 )
+                 wlu = work( n+4 )
+                 nwl = iwork( 2 )
+                 wu = work( n+3 )
+                 wul = work( n+1 )
+                 nwu = iwork( 3 )
+              end if
+              if( nwl<0 .or. nwl>=n .or. nwu<1 .or. nwu>n ) then
+                 info = 4
+                 return
+              end if
+           else
+              ! range='a' or 'v' -- set atoli
+              tnorm = max( abs( d( 1 ) )+abs( e( 1 ) ),abs( d( n ) )+abs( e( n-1 ) ) )
+              do j = 2, n - 1
+                 tnorm = max( tnorm, abs( d( j ) )+abs( e( j-1 ) )+abs( e( j ) ) )
+              end do
+              if( abstol<=zero ) then
+                 atoli = ulp*tnorm
+              else
+                 atoli = abstol
+              end if
+              if( irange==2 ) then
+                 wl = vl
+                 wu = vu
+              else
+                 wl = zero
+                 wu = zero
+              end if
+           end if
+           ! find eigenvalues -- loop over blocks and recompute nwl and nwu.
+           ! nwl accumulates the number of eigenvalues .le. wl,
+           ! nwu accumulates the number of eigenvalues .le. wu
+           m = 0
+           iend = 0
+           info = 0
+           nwl = 0
+           nwu = 0
+           loop_70: do jb = 1, nsplit
+              ioff = iend
+              ibegin = ioff + 1
+              iend = isplit( jb )
+              in = iend - ioff
+              if( in==1 ) then
+                 ! special case -- in=1
+                 if( irange==1 .or. wl>=d( ibegin )-pivmin )nwl = nwl + 1
+                 if( irange==1 .or. wu>=d( ibegin )-pivmin )nwu = nwu + 1
+                 if( irange==1 .or. ( wl<d( ibegin )-pivmin .and. wu>=d( ibegin )-pivmin ) ) &
+                           then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    iblock( m ) = jb
+                 end if
+              else
+                 ! general case -- in > 1
+                 ! compute gershgorin interval
+                 ! and use it as the initial interval
+                 gu = d( ibegin )
+                 gl = d( ibegin )
+                 tmp1 = zero
+                 do j = ibegin, iend - 1
+                    tmp2 = abs( e( j ) )
+                    gu = max( gu, d( j )+tmp1+tmp2 )
+                    gl = min( gl, d( j )-tmp1-tmp2 )
+                    tmp1 = tmp2
+                 end do
+                 gu = max( gu, d( iend )+tmp1 )
+                 gl = min( gl, d( iend )-tmp1 )
+                 bnorm = max( abs( gl ), abs( gu ) )
+                 gl = gl - fudge*bnorm*ulp*in - fudge*pivmin
+                 gu = gu + fudge*bnorm*ulp*in + fudge*pivmin
+                 ! compute atoli for the current submatrix
+                 if( abstol<=zero ) then
+                    atoli = ulp*max( abs( gl ), abs( gu ) )
+                 else
+                    atoli = abstol
+                 end if
+                 if( irange>1 ) then
+                    if( gu<wl ) then
+                       nwl = nwl + in
+                       nwu = nwu + in
+                       cycle loop_70
+                    end if
+                    gl = max( gl, wl )
+                    gu = min( gu, wu )
+                    if( gl>=gu )cycle loop_70
+                 end if
+                 ! set up initial interval
+                 work( n+1 ) = gl
+                 work( n+in+1 ) = gu
+                 call stdlib_slaebz( 1, 0, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e( &
+                 ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), im,iwork, w( m+1 &
+                           ), iblock( m+1 ), iinfo )
+                 nwl = nwl + iwork( 1 )
+                 nwu = nwu + iwork( in+1 )
+                 iwoff = m - iwork( 1 )
+                 ! compute eigenvalues
+                 itmax = int( ( log( gu-gl+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + &
+                           2
+                 call stdlib_slaebz( 2, itmax, in, in, 1, nb, atoli, rtoli, pivmin,d( ibegin ), e(&
+                  ibegin ), work( ibegin ),idumma, work( n+1 ), work( n+2*in+1 ), iout,iwork, w( &
+                            m+1 ), iblock( m+1 ), iinfo )
+                 ! copy eigenvalues into w and iblock
+                 ! use -jb for block number for unconverged eigenvalues.
+                 do j = 1, iout
+                    tmp1 = half*( work( j+n )+work( j+in+n ) )
+                    ! flag non-convergence.
+                    if( j>iout-iinfo ) then
+                       ncnvrg = .true.
+                       ib = -jb
+                    else
+                       ib = jb
+                    end if
+                    do je = iwork( j ) + 1 + iwoff,iwork( j+in ) + iwoff
+                       w( je ) = tmp1
+                       iblock( je ) = ib
+                    end do
+                 end do
+                 m = m + im
+              end if
+           end do loop_70
+           ! if range='i', then (wl,wu) contains eigenvalues nwl+1,...,nwu
+           ! if nwl+1 < il or nwu > iu, discard extra eigenvalues.
+           if( irange==3 ) then
+              im = 0
+              idiscl = il - 1 - nwl
+              idiscu = nwu - iu
+              if( idiscl>0 .or. idiscu>0 ) then
+                 do je = 1, m
+                    if( w( je )<=wlu .and. idiscl>0 ) then
+                       idiscl = idiscl - 1
+                    else if( w( je )>=wul .and. idiscu>0 ) then
+                       idiscu = idiscu - 1
+                    else
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl>0 .or. idiscu>0 ) then
+                 ! code to deal with effects of bad arithmetic:
+                 ! some low eigenvalues to be discarded are not in (wl,wlu],
+                 ! or high eigenvalues to be discarded are not in (wul,wu]
+                 ! so just kill off the smallest idiscl/largest idiscu
+                 ! eigenvalues, by simply finding the smallest/largest
+                 ! eigenvalue(s).
+                 ! (if n(w) is monotone non-decreasing, this should never
+                     ! happen.)
+                 if( idiscl>0 ) then
+                    wkill = wu
+                    do jdisc = 1, idiscl
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )<wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 if( idiscu>0 ) then
+                    wkill = wl
+                    do jdisc = 1, idiscu
+                       iw = 0
+                       do je = 1, m
+                          if( iblock( je )/=0 .and.( w( je )>wkill .or. iw==0 ) ) then
+                             iw = je
+                             wkill = w( je )
+                          end if
+                       end do
+                       iblock( iw ) = 0
+                    end do
+                 end if
+                 im = 0
+                 do je = 1, m
+                    if( iblock( je )/=0 ) then
+                       im = im + 1
+                       w( im ) = w( je )
+                       iblock( im ) = iblock( je )
+                    end if
+                 end do
+                 m = im
+              end if
+              if( idiscl<0 .or. idiscu<0 ) then
+                 toofew = .true.
+              end if
+           end if
+           ! if order='b', do nothing -- the eigenvalues are already sorted
+              ! by block.
+           ! if order='e', sort the eigenvalues from smallest to largest
+           if( iorder==1 .and. nsplit>1 ) then
+              do je = 1, m - 1
+                 ie = 0
+                 tmp1 = w( je )
+                 do j = je + 1, m
+                    if( w( j )<tmp1 ) then
+                       ie = j
+                       tmp1 = w( j )
+                    end if
+                 end do
+                 if( ie/=0 ) then
+                    itmp1 = iblock( ie )
+                    w( ie ) = w( je )
+                    iblock( ie ) = iblock( je )
+                    w( je ) = tmp1
+                    iblock( je ) = itmp1
+                 end if
+              end do
+           end if
+           info = 0
+           if( ncnvrg )info = info + 1
+           if( toofew )info = info + 2
+           return
+     end subroutine stdlib_sstebz
+
+     !> SSYCONV: convert A given by TRF into L and D and vice-versa.
+     !> Get Non-diag elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_ssyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           real(sp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+            ! a is upper
+            ! convert a (a is upper)
+              ! convert value
+              if ( convert ) then
+                 i=n
+                 e(1)=zero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=zero
+                       a(i-1,i)=zero
+                       i=i-1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i-1
+                 end do
+              ! convert permutations
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0) then
+                    ip=ipiv(i)
+                    if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                    if( i < n) then
+                  do j= i+1,n
+                      temp=a(ip,j)
+                      a(ip,j)=a(i-1,j)
+                      a(i-1,j)=temp
+                  end do
+                     endif
+                     i=i-1
+                endif
+                i=i-1
+             end do
+              else
+            ! revert a (a is upper)
+              ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+              ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+            ! a is lower
+              if ( convert ) then
+            ! convert a (a is lower)
+              ! convert value
+                 i=1
+                 e(n)=zero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=zero
+                       a(i+1,i)=zero
+                       i=i+1
+                    else
+                       e(i)=zero
+                    endif
+                    i=i+1
+                 end do
+              ! convert permutations
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                    ip=ipiv(i)
+                    if (i > 1) then
+                    do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i,j)
+                      a(i,j)=temp
+                    end do
+                    endif
+                 else
+                   ip=-ipiv(i)
+                   if (i > 1) then
+                   do j= 1,i-1
+                      temp=a(ip,j)
+                      a(ip,j)=a(i+1,j)
+                      a(i+1,j)=temp
+                   end do
+                   endif
+                   i=i+1
+                endif
+                i=i+1
+             end do
+              else
+            ! revert a (a is lower)
+              ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+              ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_ssyconv
+
+     !> If parameter WAY = 'C':
+     !> SSYCONVF: converts the factorization output format used in
+     !> SSYTRF provided on entry in parameter A into the factorization
+     !> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in SSYTRF into
+     !> the format used in SSYTRF_RK (or SSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> SSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in SSYTRF_RK
+     !> (or SSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in SSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in SSYTRF_RK
+     !> (or SSYTRF_BK) into the format used in SSYTRF.
+
+     pure subroutine stdlib_ssyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_sswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_sswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_sswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_sswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is one interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_ssyconvf
+
+     !> If parameter WAY = 'C':
+     !> SSYCONVF_ROOK: converts the factorization output format used in
+     !> SSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for SSYTRF_ROOK and
+     !> SSYTRF_RK (or SSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> SSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in SSYTRF_RK
+     !> (or SSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in SSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for SSYTRF_ROOK and
+     !> SSYTRF_RK (or SSYTRF_BK) is the same and is not converted.
+
+     pure subroutine stdlib_ssyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = zero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = zero
+                       a( i-1, i ) = zero
+                       i = i - 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_sswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_sswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_sswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and zero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = zero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = zero
+                       a( i+1, i ) = zero
+                       i = i + 1
+                    else
+                       e( i ) = zero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_sswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_sswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_sswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_ssyconvf_rook
+
+     !> SSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_ssyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(sp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'SSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), abs( a( j, j ) ) )
+                 amax = max( amax, abs( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), abs( a( i, j ) ) )
+                    s( j ) = max( s( j ), abs( a( i, j ) ) )
+                    amax = max( amax, abs( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_sp / s( j )
+           end do
+           tol = one / sqrt( 2.0_sp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + abs( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + abs( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + abs( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + s( i )*work( i )
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_slassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = abs( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( work( i ) - t*si )
+                 c0 = -(t*si)*si + 2*work( i )*si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = abs( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = abs( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + work( i ) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_slamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_slamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_ssyequb
+
+     !> SSYGS2: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T *A*L.
+     !> B must have been previously factorized as U**T *U or L*L**T by SPOTRF.
+
+     pure subroutine stdlib_ssygs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(sp) :: akk, bkk, ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**t)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_sscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_saxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_ssyr2( uplo, n-k, -one, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_saxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_strsv( uplo, 'TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), ldb, &
+                                 a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**t)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_sscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_saxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_ssyr2( uplo, n-k, -one, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1, &
+                                 k+1 ), lda )
+                       call stdlib_saxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_strsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**t
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_strmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_saxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_ssyr2( uplo, k-1, one, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                    call stdlib_saxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_sscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**t *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = a( k, k )
+                    bkk = b( k, k )
+                    call stdlib_strmv( uplo, 'TRANSPOSE', 'NON-UNIT', k-1, b, ldb,a( k, 1 ), lda )
+                              
+                    ct = half*akk
+                    call stdlib_saxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_ssyr2( uplo, k-1, one, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_saxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_sscal( k-1, bkk, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_ssygs2
+
+     !> SSYGST: reduces a real symmetric-definite generalized eigenproblem
+     !> to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**T)*A*inv(U) or inv(L)*A*inv(L**T)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**T or L**T*A*L.
+     !> B must have been previously factorized as U**T*U or L*L**T by SPOTRF.
+
+     pure subroutine stdlib_ssygst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'SSYGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_ssygs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**t)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_ssygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_strsm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT',kb, n-k-kb+1, &
+                                    one, b( k, k ), ldb,a( k, k+kb ), lda )
+                          call stdlib_ssymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_ssyr2k( uplo, 'TRANSPOSE', n-k-kb+1, kb, -one,a( k, k+kb ), &
+                                    lda, b( k, k+kb ), ldb,one, a( k+kb, k+kb ), lda )
+                          call stdlib_ssymm( 'LEFT', uplo, kb, n-k-kb+1, -half,a( k, k ), lda, b( &
+                                    k, k+kb ), ldb, one,a( k, k+kb ), lda )
+                          call stdlib_strsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, one,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**t)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_ssygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_strsm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',n-k-kb+1, kb, &
+                                    one, b( k, k ), ldb,a( k+kb, k ), lda )
+                          call stdlib_ssymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_ssyr2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-one, a( k+kb, k &
+                                    ), lda, b( k+kb, k ),ldb, one, a( k+kb, k+kb ), lda )
+                          call stdlib_ssymm( 'RIGHT', uplo, n-k-kb+1, kb, -half,a( k, k ), lda, b(&
+                                     k+kb, k ), ldb, one,a( k+kb, k ), lda )
+                          call stdlib_strsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, one,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**t
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_strmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, one, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_ssymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_ssyr2k( uplo, 'NO TRANSPOSE', k-1, kb, one,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_ssymm( 'RIGHT', uplo, k-1, kb, half, a( k, k ),lda, b( 1, k ), &
+                                 ldb, one, a( 1, k ), lda )
+                       call stdlib_strmm( 'RIGHT', uplo, 'TRANSPOSE', 'NON-UNIT',k-1, kb, one, b( &
+                                 k, k ), ldb, a( 1, k ),lda )
+                       call stdlib_ssygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**t*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_strmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, one, &
+                                 b, ldb, a( k, 1 ), lda )
+                       call stdlib_ssymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_ssyr2k( uplo, 'TRANSPOSE', k-1, kb, one,a( k, 1 ), lda, b( k, &
+                                 1 ), ldb, one, a,lda )
+                       call stdlib_ssymm( 'LEFT', uplo, kb, k-1, half, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, one, a( k, 1 ), lda )
+                       call stdlib_strmm( 'LEFT', uplo, 'TRANSPOSE', 'NON-UNIT', kb,k-1, one, b( &
+                                 k, k ), ldb, a( k, 1 ), lda )
+                       call stdlib_ssygs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ssygst
+
+     !> SSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_ssyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(sp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_sswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_sswap( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_ssyswapr
+
+     !> SSYTF2_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_ssytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, stemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = zero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_isamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_isamax( imax-1, a( 1, imax ), 1 )
+                          stemp = abs( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_sswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_sswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_sswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_sswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_sswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_sswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_ssyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_sscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_ssyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = zero
+                    a( k-1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = zero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = zero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_isamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_isamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = abs( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_sswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_sswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_sswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_sswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_sswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_ssyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_sscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_ssyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = zero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = zero
+                    a( k+1, k ) = zero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_ssytf2_rk
+
+     !> SSYTF2_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_ssytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, rowmax, stemp, t, wk, wkm1, &
+                     wkp1, sfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_isamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_isamax( imax-1, a( 1, imax ), 1 )
+                          stemp = abs( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_sswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_sswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_sswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_sswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_ssyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_sscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_ssyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_isamax( imax-k, a( imax, k ), lda )
+                          rowmax = abs( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_isamax( n-imax, a( imax+1, imax ),1 )
+                          stemp = abs( a( itemp, imax ) )
+                          if( stemp>rowmax ) then
+                             rowmax = stemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( abs( a( imax, imax ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_sswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_sswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_sswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( abs( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / a( k, k )
+                          call stdlib_ssyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_sscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_ssyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_ssytf2_rook
+
+     !> SSYTRF_RK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_ssytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'SSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_slasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_ssytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_sswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_slasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_ssytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_sswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssytrf_rk
+
+     !> SSYTRF_ROOK: computes the factorization of a real symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_ssytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'SSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_slasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_ssytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_slasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_ssytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssytrf_rook
+
+     !> SSYTRI: computes the inverse of a real symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> SSYTRF.
+
+     pure subroutine stdlib_ssytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_sdot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_sdot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_sdot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_scopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_sdot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_sswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_sswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_sdot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_sdot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_sdot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_scopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_sdot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_sswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_sswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_ssytri
+
+     !> SSYTRI_ROOK: computes the inverse of a real symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by SSYTRF_ROOK.
+
+     pure subroutine stdlib_ssytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           real(sp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_sdot( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_scopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_sdot( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_sdot( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_scopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, k-1, -one, a, lda, work, 1, zero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_sdot( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_sswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_sswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_sswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_sswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_sswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_sswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_sdot( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_scopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_sdot( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_sdot( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 )
+                              
+                    call stdlib_scopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_ssymv( uplo, n-k, -one, a( k+1, k+1 ), lda, work, 1,zero, a( k+1, &
+                              k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_sdot( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_sswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_sswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_sswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_ssytri_rook
+
+     !> SSYTRS: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by SSYTRF.
+
+     pure subroutine stdlib_ssytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_sger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_sswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_sger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_sger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb, a( 1, k ),1, one, b( k, &
+                           1 ), ldb )
+                 call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b, ldb,a( 1, k+1 ), 1, one, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_sswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_sger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_sger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_ssytrs
+
+     !> SSYTRS2: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by SSYTRF and converted by SSYCONV.
+
+     pure subroutine stdlib_ssytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_ssyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_sswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_strsm('L','U','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_sscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_strsm('L','U','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_sswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_strsm('L','L','N','U',n,nrhs,one,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_sscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - one
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_strsm('L','L','T','U',n,nrhs,one,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_ssyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_ssytrs2
+
+     !> SSYTRS_3: solves a system of linear equations A * X = B with a real
+     !> symmetric matrix A using the factorization computed
+     !> by SSYTRF_RK or SSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_ssytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*), e(*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_strsm( 'L', 'U', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_sscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - one
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_strsm( 'L', 'U', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_strsm( 'L', 'L', 'N', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_sscal( nrhs, one / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - one
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_strsm('L', 'L', 'T', 'U', n, nrhs, one, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_ssytrs_3
+
+     !> SSYTRS_AA: solves a system of linear equations A*X = B with a real
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by SSYTRF_AA.
+
+     pure subroutine stdlib_ssytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 k = 1
+                 do while ( k<=n )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k + 1
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_strsm( 'L', 'U', 'T', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b( 2, 1 ), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_slacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_slacpy( 'F', 1, n-1, a(1, 2), lda+1, work(1), 1)
+                  call stdlib_slacpy( 'F', 1, n-1, a(1, 2), lda+1, work(2*n), 1)
+              end if
+              call stdlib_sgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_strsm( 'L', 'U', 'N', 'U', n-1, nrhs, one, a( 1, 2 ),lda, b(2, 1), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (u \ (t \ (u**t \p**t * b) )) ]
+                 k = n
+                 do while ( k>=1 )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k - 1
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 k = 1
+                 do while ( k<=n )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k + 1
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_strsm( 'L', 'L', 'N', 'U', n-1, nrhs, one, a( 2, 1),lda, b(2, 1), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_slacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_slacpy( 'F', 1, n-1, a(2, 1), lda+1, work(1), 1)
+                  call stdlib_slacpy( 'F', 1, n-1, a(2, 1), lda+1, work(2*n), 1)
+              end if
+              call stdlib_sgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute l**t \ b -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_strsm( 'L', 'L', 'T', 'U', n-1, nrhs, one, a( 2, 1 ),lda, b( 2, 1 ), &
+                           ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 k = n
+                 do while ( k>=1 )
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                    k = k - 1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_ssytrs_aa
+
+     !> SSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a real symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by SSYTRF_ROOK.
+
+     pure subroutine stdlib_ssytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(sp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_sger( k-1, nrhs, -one, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_sswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_sger( k-2, nrhs, -one, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_sger( k-2, nrhs, -one, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ),&
+                               ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, &
+                           one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k ), 1, one, b( &
+                              k, 1 ), ldb )
+                    call stdlib_sgemv( 'TRANSPOSE', k-1, nrhs, -one, b,ldb, a( 1, k+1 ), 1, one, &
+                              b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_sswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sger( n-k, nrhs, -one, a( k+1, k ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_sscal( nrhs, one / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_sswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_sger( n-k-1, nrhs, -one, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, 1 &
+                              ), ldb )
+                    call stdlib_sger( n-k-1, nrhs, -one, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - one
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, one, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k ),&
+                               1, one, b( k, 1 ), ldb )
+                    call stdlib_sgemv( 'TRANSPOSE', n-k, nrhs, -one, b( k+1, 1 ),ldb, a( k+1, k-1 &
+                              ), 1, one, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_sswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_sswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_ssytrs_rook
+
+     !> STBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by STBTRS or some other
+     !> means before entering this routine.  STBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_stbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_scopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_stbmv( uplo, trans, diag, n, kd, ab, ldab, work( n+1 ),1 )
+              call stdlib_saxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             work( i ) = work( i ) +abs( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             work( i ) = work( i ) + abs( ab( 1+i-k, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + abs( ab( kd+1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + abs( ab( 1+i-k, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_stbsv( uplo, transt, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_stbsv( uplo, trans, diag, n, kd, ab, ldab,work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_stbrfs
+
+     !> STBTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_stbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==zero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==zero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_stbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_stbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> STFSM:  solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**T.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_stfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           real(sp), intent(in) :: alpha
+           ! Array Arguments 
+           real(sp), intent(in) :: a(0:*)
+           real(sp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_sp)
+           if( alpha==zero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = zero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_strsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_strsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_sgemm( 'N', 'N', m2, n, m1, -one, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_strsm( 'L', 'U', 'T', diag, m2, n, one,a( m ), m, b( m1, &
+                                       0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_strsm( 'L', 'L', 'T', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_strsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_sgemm( 'T', 'N', m1, n, m2, -one, a( m1 ),m, b( m1, 0 ), &
+                                       ldb, alpha, b, ldb )
+                             call stdlib_strsm( 'L', 'L', 'T', diag, m1, n, one,a( 0 ), m, b, ldb &
+                                       )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_sgemm( 'T', 'N', m2, n, m1, -one, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'U', 'T', diag, m2, n, one,a( m1 ), m, b( m1, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m1, n, m2, -one, a( 0 ), m,b( m1, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'L', 'T', diag, m1, n, one,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_strsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_strsm( 'L', 'U', 'T', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_sgemm( 'T', 'N', m2, n, m1, -one,a( m1*m1 ), m1, b, ldb, &
+                                       alpha,b( m1, 0 ), ldb )
+                             call stdlib_strsm( 'L', 'L', 'N', diag, m2, n, one,a( 1 ), m1, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          if( m==1 ) then
+                             call stdlib_strsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_strsm( 'L', 'L', 'T', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_sgemm( 'N', 'N', m1, n, m2, -one,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_strsm( 'L', 'U', 'N', diag, m1, n, one,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'L', 'U', 'T', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_sgemm( 'N', 'N', m2, n, m1, -one, a( 0 ), m2,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'L', 'N', diag, m2, n, one,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'L', 'L', 'T', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_sgemm( 'T', 'N', m1, n, m2, -one, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'U', 'N', diag, m1, n, one,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_sgemm( 'N', 'N', k, n, k, -one, a( k+1 ),m+1, b, ldb, alpha,&
+                                     b( k, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'U', 'T', diag, k, n, one,a( 0 ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'T', 'N', k, n, k, -one, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'L', 'T', diag, k, n, one,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_sgemm( 'T', 'N', k, n, k, -one, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'U', 'T', diag, k, n, one,a( k ), m+1, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', k, n, k, -one, a( 0 ), m+1,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'L', 'T', diag, k, n, one,a( k+1 ), m+1, b, ldb &
+                                    )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'L', 'U', 'T', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_sgemm( 'T', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'L', 'N', diag, k, n, one,a( 0 ), k, b( k, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'L', 'L', 'T', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_sgemm( 'N', 'N', k, n, k, -one,a( k*( k+1 ) ), k, b( k, 0 ),&
+                                     ldb,alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'U', 'N', diag, k, n, one,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 't', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'L', 'U', 'T', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_sgemm( 'N', 'N', k, n, k, -one, a( 0 ), k, b,ldb, alpha, b( &
+                                    k, 0 ), ldb )
+                          call stdlib_strsm( 'L', 'L', 'N', diag, k, n, one,a( k*k ), k, b( k, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='l', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'L', 'L', 'T', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'T', 'N', k, n, k, -one, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_strsm( 'L', 'U', 'N', diag, k, n, one,a( k*( k+1 ) ), k, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, n1, one,a( 0 ), n, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, n2, one,a( n ), n, b( 0, n1 )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n, &
+                                    alpha, b( 0, n1 ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, n2, one,a( n1 ), n, b( 0, n1 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, 0 ), ldb )
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, n1, one,a( n2 ), n, b( 0, 0 )&
+                                    , ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, n1, n2, -one, b( 0, n1 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, n1, one,a( 0 ), n1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'l', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, n2, n1, -one, b( 0, 0 ),ldb, a( n1*n1 ),&
+                                     n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, n2, one,a( 1 ), n1, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, n2, n1, -one, b( 0, 0 ),ldb, a( 0 ), n2,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, n2, one,a( n1*n2 ), n2, b( 0,&
+                                     n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 't', uplo = 'u', and
+                          ! trans = 't'
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, n1, n2, -one, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, n1, one,a( n2*n2 ), n2, b( 0,&
+                                     0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, k, one,a( 1 ), n+1, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, k, one,a( 0 ), n+1, b( 0, k )&
+                                    , ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, k ),ldb )
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, k, one,a( k ), n+1, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( 0 ), n+1, &
+                                    alpha, b( 0, 0 ),ldb )
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, k, one,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 't'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 't', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_sgemm( 'N', 'T', m, k, k, -one, b( 0, k ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, k, one,a( k ), k, b( 0, 0 ), &
+                                    ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'l',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_sgemm( 'N', 'N', m, k, k, -one, b( 0, 0 ),ldb, a( ( k+1 )*k &
+                                    ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, k, one,a( 0 ), k, b( 0, k ), &
+                                    ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 't', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_strsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_sgemm( 'N', 'T', m, k, k, -one, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_strsm( 'R', 'L', 'T', diag, m, k, one,a( k*k ), k, b( 0, k )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is even, transr = 't', uplo = 'u',
+                          ! and trans = 't'
+                          call stdlib_strsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_sgemm( 'N', 'N', m, k, k, -one, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_strsm( 'R', 'U', 'T', diag, m, k, one,a( ( k+1 )*k ), k, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_stfsm
+
+     !> STFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_stfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(out) :: ap(0:*)
+           real(sp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_stfttp
+
+     !> STFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_stfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(sp), intent(out) :: a(0:lda-1,0:*)
+           real(sp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 a( 0, 0 ) = arf( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_stfttr
+
+     !> STPRFB: applies a real "triangular-pentagonal" block reflector H or its
+     !> conjugate transpose H^H to a real matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_stprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(in) :: t(ldt,*), v(ldv,*)
+           real(sp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h^h c  where  c = [ a ]  (k-by-n)
+                                               ! [ b ]  (m-by-n)
+              ! h = i - w t w^h          or  h^h = i - w t^h w^h
+              ! a = a -   t (a + v^h b)  or  a = a -   t^h (a + v^h b)
+              ! b = b - v t (a + v^h b)  or  b = b - v t^h (a + v^h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'U', 'T', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_sgemm( 'T', 'N', l, n, m-l, one, v, ldv, b, ldb,one, work, ldwork )
+                        
+              call stdlib_sgemm( 'T', 'N', k-l, n, m, one, v( 1, kp ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_sgemm( 'N', 'N', l, n, k-l, -one, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ),  ldb )
+              call stdlib_strmm( 'L', 'U', 'N', 'N', l, n, one, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h^h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w^h          or  h^h = i - w t^h w^h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t^h
+              ! b = b - (a + b v) t v^h  or  b = b - (a + b v) t^h v^h
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', 'N', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_sgemm( 'N', 'N', m, l, n-l, one, b, ldb,v, ldv, one, work, ldwork )
+                        
+              call stdlib_sgemm( 'N', 'N', m, k-l, n, one, b, ldb,v( 1, kp ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_sgemm( 'N', 'T', m, l, k-l, -one, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_strmm( 'R', 'U', 'T', 'N', m, l, one, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h^h c  where  c = [ b ]  (m-by-n)
+                                               ! [ a ]  (k-by-n)
+              ! h = i - w t w^h          or  h^h = i - w t^h w^h
+              ! a = a -   t (a + v^h b)  or  a = a -   t^h (a + v^h b)
+              ! b = b - v t (a + v^h b)  or  b = b - v t^h (a + v^h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_sgemm( 'T', 'N', l, n, m-l, one, v( mp, kp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_sgemm( 'T', 'N', k-l, n, m, one, v, ldv,b, ldb, zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'L', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'N', m-l, n, k, -one, v( mp, 1 ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_sgemm( 'N', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b,  ldb )
+                        
+              call stdlib_strmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h^h  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w^h          or  h^h = i - w t^h w^h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t^h
+              ! b = b - (a + b v) t v^h  or  b = b - (a + b v) t^h v^h
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_sgemm( 'N', 'N', m, l, n-l, one, b( 1, np ), ldb,v( np, kp ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_sgemm( 'N', 'N', m, k-l, n, one, b, ldb,v, ldv, zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'T', m, n-l, k, -one, work, ldwork,v( np, 1 ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_sgemm( 'N', 'T', m, l, k-l, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_strmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h^h c  where  c = [ a ]  (k-by-n)
+                                               ! [ b ]  (m-by-n)
+              ! h = i - w^h t w          or  h^h = i - w^h t^h w
+              ! a = a -     t (a + v b)  or  a = a -     t^h (a + v b)
+              ! b = b - v^h t (a + v b)  or  b = b - v^h t^h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'L', 'N', 'N', l, n, one, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_sgemm( 'N', 'N', l, n, m-l, one, v, ldv,b, ldb,one, work, ldwork )
+                        
+              call stdlib_sgemm( 'N', 'N', k-l, n, m, one, v( kp, 1 ), ldv,b, ldb, zero, work( kp,&
+                         1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'U', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', m-l, n, k, -one, v, ldv, work, ldwork,one, b, ldb )
+                        
+              call stdlib_sgemm( 'T', 'N', l, n, k-l, -one, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, one, b( mp, 1 ), ldb )
+              call stdlib_strmm( 'L', 'L', 'T', 'N', l, n, one, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h^h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w^h t w            or  h^h = i - w^h t^h w
+              ! a = a - (a + b v^h) t      or  a = a - (a + b v^h) t^h
+              ! b = b - (a + b v^h) t v    or  b = b - (a + b v^h) t^h v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'L', 'T', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_sgemm( 'N', 'T', m, l, n-l, one, b, ldb, v, ldv,one, work, ldwork )
+                        
+              call stdlib_sgemm( 'N', 'T', m, k-l, n, one, b, ldb,v( kp, 1 ), ldv, zero, work( 1, &
+                        kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_sgemm( 'N', 'N', m, l, k-l, -one, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, one, b( 1, np ), ldb )
+              call stdlib_strmm( 'R', 'L', 'N', 'N', m, l, one, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h^h c  where  c = [ b ]  (m-by-n)
+                                               ! [ a ]  (k-by-n)
+              ! h = i - w^h t w          or  h^h = i - w^h t^h w
+              ! a = a -     t (a + v b)  or  a = a -     t^h (a + v b)
+              ! b = b - v^h t (a + v b)  or  b = b - v^h t^h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'U', 'N', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_sgemm( 'N', 'N', l, n, m-l, one, v( kp, mp ), ldv,b( mp, 1 ), ldb, one, &
+                        work( kp, 1 ), ldwork )
+              call stdlib_sgemm( 'N', 'N', k-l, n, m, one, v, ldv, b, ldb,zero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'L ', trans, 'N', k, n, one, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'T', 'N', m-l, n, k, -one, v( 1, mp ), ldv,work, ldwork, one, b( &
+                        mp, 1 ), ldb )
+              call stdlib_sgemm( 'T', 'N', l, n, k-l, -one, v, ldv,work, ldwork, one, b, ldb )
+                        
+              call stdlib_strmm( 'L', 'U', 'T', 'N', l, n, one, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h^h  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w^h t w            or  h^h = i - w^h t^h w
+              ! a = a - (a + b v^h) t      or  a = a - (a + b v^h) t^h
+              ! b = b - (a + b v^h) t v    or  b = b - (a + b v^h) t^h v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', 'T', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_sgemm( 'N', 'T', m, l, n-l, one, b( 1, np ), ldb,v( kp, np ), ldv, one, &
+                        work( 1, kp ), ldwork )
+              call stdlib_sgemm( 'N', 'T', m, k-l, n, one, b, ldb, v, ldv,zero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'L', trans, 'N', m, k, one, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_sgemm( 'N', 'N', m, n-l, k, -one, work, ldwork,v( 1, np ), ldv, one, b( &
+                        1, np ), ldb )
+              call stdlib_sgemm( 'N', 'N', m, l, k-l , -one, work, ldwork,v, ldv, one, b, ldb )
+                        
+              call stdlib_strmm( 'R', 'U', 'N', 'N', m, l, one, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_stprfb
+
+     !> STPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by STPTRS or some other
+     !> means before entering this routine.  STPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_stprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_scopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_stpmv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+              call stdlib_saxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( ap( kc+i-1 ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( ap( kc+i-k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( ap( kc+i-1 ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( ap( kc+i-k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_stpsv( uplo, transt, diag, n, ap, work( n+1 ), 1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_stpsv( uplo, trans, diag, n, ap, work( n+1 ), 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_stprfs
+
+     !> STPTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_stptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           real(sp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==zero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==zero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = one / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_stpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_sscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = one / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_stpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_sscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_stptri
+
+     !> STPTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_stptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==zero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==zero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           do j = 1, nrhs
+              call stdlib_stpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_stptrs
+
+     !> STPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_stpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(0:*)
+           real(sp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = ap( 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_stpttf
+
+     !> STPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_stpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(sp), intent(out) :: a(lda,*)
+           real(sp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_stpttr
+
+     !> STRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by STRTRS or some other
+     !> means before entering this routine.  STRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_strrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a or a**t, depending on trans.
+              call stdlib_scopy( n, x( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_strmv( uplo, trans, diag, n, a, lda, work( n+1 ), 1 )
+              call stdlib_saxpy( n, -one, b( 1, j ), 1, work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = abs( x( k, j ) )
+                          do i = k + 1, n
+                             work( i ) = work( i ) + abs( a( i, k ) )*xk
+                          end do
+                          work( k ) = work( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**t)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = abs( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                          end do
+                          work( k ) = work( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_strsv( uplo, transt, diag, n, a, lda, work( n+1 ),1 )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_strsv( uplo, trans, diag, n, a, lda, work( n+1 ),1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_strrfs
+
+     !> STRTI2: computes the inverse of a real upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_strti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_strmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_sscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = one / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -one
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_strmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_sscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_strti2
+
+     !> STRTRI: computes the inverse of a real upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_strtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'STRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_strti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_strmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, one, a, lda,&
+                               a( 1, j ), lda )
+                    call stdlib_strsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -one, a( j,&
+                               j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_strti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_strmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, one,&
+                                  a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_strsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_strti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_strtri
+
+     !> STRTRS: solves a triangular system of the form
+     !> A * X = B  or  A**T * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_strtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==zero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b  or  a**t * x = b.
+           call stdlib_strsm( 'LEFT', uplo, trans, diag, n, nrhs, one, a, lda, b,ldb )
+           return
+     end subroutine stdlib_strtrs
+
+     !> STRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_strttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(sp), intent(in) :: a(0:lda-1,0:*)
+           real(sp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 arf( 0 ) = a( 0, 0 )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = a( n2+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = a( j-n1, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! n is odd, transr = 't', and uplo = 'l'
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is odd, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = a( n2+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = a( k+j, i )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = a( j-k, l )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! n is even, transr = 't', and uplo = 'l'
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! n is even, transr = 't', and uplo = 'u'
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = a( j, i )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = a( k+1+j, l )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here, on exit of the loop, j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_strttf
+
+     !> STRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_strttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_strttp
+
+     !> SBBCSD: computes the CS decomposition of an orthogonal matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**T
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See SORCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The orthogonal matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lwork, m, p, q
+           ! Array Arguments 
+           real(sp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), work(*)
+           real(sp), intent(inout) :: phi(*), theta(*)
+           real(sp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(sp), parameter :: hundred = 100.0_sp
+           real(sp), parameter :: meighth = -0.125_sp
+           real(sp), parameter :: piover2 = 1.57079632679489661923132169163975144210_sp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lworkmin, lworkopt, maxit, mini
+           real(sp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lworkmin = 1
+              work(1) = lworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lworkopt = iv2tsn + q - 1
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'EPSILON' )
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_slas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_slas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_sp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_slartgs( b11d(imin), b11e(imin), mu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              else
+                 call stdlib_slartgs( b21d(imin), b21e(imin), nu,work(iv1tcs+imin-1), work(iv1tsn+&
+                           imin-1) )
+              end if
+              temp = work(iv1tcs+imin-1)*b11d(imin) +work(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = work(iv1tcs+imin-1)*b11e(imin) -work(iv1tsn+imin-1)*b11d(imin)
+              b11d(imin) = temp
+              b11bulge = work(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iv1tcs+imin-1)*b11d(imin+1)
+              temp = work(iv1tcs+imin-1)*b21d(imin) +work(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = work(iv1tcs+imin-1)*b21e(imin) -work(iv1tsn+imin-1)*b21d(imin)
+              b21d(imin) = temp
+              b21bulge = work(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_slartgp( b11bulge, b11d(imin), work(iu1sn+imin-1),work(iu1cs+imin-1),&
+                            r )
+              else if( mu <= nu ) then
+                 call stdlib_slartgs( b11e( imin ), b11d( imin + 1 ), mu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              else
+                 call stdlib_slartgs( b12d( imin ), b12e( imin ), nu,work(iu1cs+imin-1), work(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_slartgp( b21bulge, b21d(imin), work(iu2sn+imin-1),work(iu2cs+imin-1),&
+                            r )
+              else if( nu < mu ) then
+                 call stdlib_slartgs( b21e( imin ), b21d( imin + 1 ), nu,work(iu2cs+imin-1), work(&
+                           iu2sn+imin-1) )
+              else
+                 call stdlib_slartgs( b22d(imin), b22e(imin), mu,work(iu2cs+imin-1), work(iu2sn+&
+                           imin-1) )
+              end if
+              work(iu2cs+imin-1) = -work(iu2cs+imin-1)
+              work(iu2sn+imin-1) = -work(iu2sn+imin-1)
+              temp = work(iu1cs+imin-1)*b11e(imin) +work(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = work(iu1cs+imin-1)*b11d(imin+1) -work(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = work(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = work(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = work(iu1cs+imin-1)*b12d(imin) +work(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = work(iu1cs+imin-1)*b12e(imin) -work(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = work(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = work(iu1cs+imin-1)*b12d(imin+1)
+              temp = work(iu2cs+imin-1)*b21e(imin) +work(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = work(iu2cs+imin-1)*b21d(imin+1) -work(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = work(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = work(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = work(iu2cs+imin-1)*b22d(imin) +work(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = work(iu2cs+imin-1)*b22e(imin) -work(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = work(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = work(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_slartgp( x2, x1, work(iv1tsn+i-1), work(iv1tcs+i-1),r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_slartgp( b11bulge, b11e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_slartgp( b21bulge, b21e(i-1), work(iv1tsn+i-1),work(iv1tcs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_slartgs( b11d(i), b11e(i), mu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 else
+                    call stdlib_slartgs( b21d(i), b21e(i), nu, work(iv1tcs+i-1),work(iv1tsn+i-1) )
+                              
+                 end if
+                 work(iv1tcs+i-1) = -work(iv1tcs+i-1)
+                 work(iv1tsn+i-1) = -work(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_slartgp( y2, y1, work(iv2tsn+i-1-1),work(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_slartgp( b12bulge, b12d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_slartgp( b22bulge, b22d(i-1), work(iv2tsn+i-1-1),work(iv2tcs+i-1-&
+                              1), r )
+                 else if( nu < mu ) then
+                    call stdlib_slartgs( b12e(i-1), b12d(i), nu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 else
+                    call stdlib_slartgs( b22e(i-1), b22d(i), mu, work(iv2tcs+i-1-1),work(iv2tsn+i-&
+                              1-1) )
+                 end if
+                 temp = work(iv1tcs+i-1)*b11d(i) + work(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = work(iv1tcs+i-1)*b11e(i) -work(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = work(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iv1tcs+i-1)*b11d(i+1)
+                 temp = work(iv1tcs+i-1)*b21d(i) + work(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = work(iv1tcs+i-1)*b21e(i) -work(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = work(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iv1tcs+i-1)*b21d(i+1)
+                 temp = work(iv2tcs+i-1-1)*b12e(i-1) +work(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = work(iv2tcs+i-1-1)*b12d(i) -work(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = work(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = work(iv2tcs+i-1-1)*b12e(i)
+                 temp = work(iv2tcs+i-1-1)*b22e(i-1) +work(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = work(iv2tcs+i-1-1)*b22d(i) -work(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = work(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = work(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_slartgp( x2, x1, work(iu1sn+i-1), work(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_slartgp( b11bulge, b11d(i), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_slartgp( b12bulge, b12e(i-1), work(iu1sn+i-1),work(iu1cs+i-1), r )
+                              
+                 else if( mu <= nu ) then
+                    call stdlib_slartgs( b11e(i), b11d(i+1), mu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 else
+                    call stdlib_slartgs( b12d(i), b12e(i), nu, work(iu1cs+i-1),work(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_slartgp( y2, y1, work(iu2sn+i-1), work(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_slartgp( b21bulge, b21d(i), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_slartgp( b22bulge, b22e(i-1), work(iu2sn+i-1),work(iu2cs+i-1), r )
+                              
+                 else if( nu < mu ) then
+                    call stdlib_slartgs( b21e(i), b21e(i+1), nu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 else
+                    call stdlib_slartgs( b22d(i), b22e(i), mu, work(iu2cs+i-1),work(iu2sn+i-1) )
+                              
+                 end if
+                 work(iu2cs+i-1) = -work(iu2cs+i-1)
+                 work(iu2sn+i-1) = -work(iu2sn+i-1)
+                 temp = work(iu1cs+i-1)*b11e(i) + work(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = work(iu1cs+i-1)*b11d(i+1) -work(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = work(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = work(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = work(iu2cs+i-1)*b21e(i) + work(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = work(iu2cs+i-1)*b21d(i+1) -work(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = work(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = work(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = work(iu1cs+i-1)*b12d(i) + work(iu1sn+i-1)*b12e(i)
+                 b12e(i) = work(iu1cs+i-1)*b12e(i) - work(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = work(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = work(iu1cs+i-1)*b12d(i+1)
+                 temp = work(iu2cs+i-1)*b22d(i) + work(iu2sn+i-1)*b22e(i)
+                 b22e(i) = work(iu2cs+i-1)*b22e(i) - work(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = work(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = work(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_slartgp( y2, y1, work(iv2tsn+imax-1-1),work(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_slartgp( b12bulge, b12d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_slartgp( b22bulge, b22d(imax-1), work(iv2tsn+imax-1-1),work(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_slartgs( b12e(imax-1), b12d(imax), nu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_slartgs( b22e(imax-1), b22d(imax), mu,work(iv2tcs+imax-1-1), work(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = work(iv2tcs+imax-1-1)*b12e(imax-1) +work(iv2tsn+imax-1-1)*b12d(imax)
+              b12d(imax) = work(iv2tcs+imax-1-1)*b12d(imax) -work(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = work(iv2tcs+imax-1-1)*b22e(imax-1) +work(iv2tsn+imax-1-1)*b22d(imax)
+              b22d(imax) = work(iv2tcs+imax-1-1)*b22d(imax) -work(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_slasr( 'R', 'V', 'F', p, imax-imin+1,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_slasr( 'L', 'V', 'F', imax-imin+1, p,work(iu1cs+imin-1), work(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_slasr( 'R', 'V', 'F', m-p, imax-imin+1,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_slasr( 'L', 'V', 'F', imax-imin+1, m-p,work(iu2cs+imin-1), work(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_slasr( 'L', 'V', 'F', imax-imin+1, q,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_slasr( 'R', 'V', 'F', q, imax-imin+1,work(iv1tcs+imin-1), work(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_slasr( 'L', 'V', 'F', imax-imin+1, m-q,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_slasr( 'R', 'V', 'F', m-q, imax-imin+1,work(iv2tcs+imin-1), work(&
+                              iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_sscal( q, negone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_sscal( q, negone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_sscal( p, negone, u1(1,imax), 1 )
+                    else
+                       call stdlib_sscal( p, negone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_sscal( m-p, negone, u2(1,imax), 1 )
+                    else
+                       call stdlib_sscal( m-p, negone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_sscal( m-q, negone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_sscal( m-q, negone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_sswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_sswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_sswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_sswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_sswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_sswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_sswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_sswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_sbbcsd
+
+     !> SDISNA: computes the reciprocal condition numbers for the eigenvectors
+     !> of a real symmetric or complex Hermitian matrix or for the left or
+     !> right singular vectors of a general m-by-n matrix. The reciprocal
+     !> condition number is the 'gap' between the corresponding eigenvalue or
+     !> singular value and the nearest other one.
+     !> The bound on the error, measured by angle in radians, in the I-th
+     !> computed vector is given by
+     !> SLAMCH( 'E' ) * ( ANORM / SEP( I ) )
+     !> where ANORM = 2-norm(A) = max( abs( D(j) ) ).  SEP(I) is not allowed
+     !> to be smaller than SLAMCH( 'E' )*ANORM in order to limit the size of
+     !> the error bound.
+     !> SDISNA may also be used to compute error bounds for eigenvectors of
+     !> the generalized symmetric definite eigenproblem.
+
+     pure subroutine stdlib_sdisna( job, m, n, d, sep, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*)
+           real(sp), intent(out) :: sep(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: decr, eigen, incr, left, right, sing
+           integer(ilp) :: i, k
+           real(sp) :: anorm, eps, newgap, oldgap, safmin, thresh
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           eigen = stdlib_lsame( job, 'E' )
+           left = stdlib_lsame( job, 'L' )
+           right = stdlib_lsame( job, 'R' )
+           sing = left .or. right
+           if( eigen ) then
+              k = m
+           else if( sing ) then
+              k = min( m, n )
+           end if
+           if( .not.eigen .and. .not.sing ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( k<0 ) then
+              info = -3
+           else
+              incr = .true.
+              decr = .true.
+              do i = 1, k - 1
+                 if( incr )incr = incr .and. d( i )<=d( i+1 )
+                 if( decr )decr = decr .and. d( i )>=d( i+1 )
+              end do
+              if( sing .and. k>0 ) then
+                 if( incr )incr = incr .and. zero<=d( 1 )
+                 if( decr )decr = decr .and. d( k )>=zero
+              end if
+              if( .not.( incr .or. decr ) )info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SDISNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! compute reciprocal condition numbers
+           if( k==1 ) then
+              sep( 1 ) = stdlib_slamch( 'O' )
+           else
+              oldgap = abs( d( 2 )-d( 1 ) )
+              sep( 1 ) = oldgap
+              do i = 2, k - 1
+                 newgap = abs( d( i+1 )-d( i ) )
+                 sep( i ) = min( oldgap, newgap )
+                 oldgap = newgap
+              end do
+              sep( k ) = oldgap
+           end if
+           if( sing ) then
+              if( ( left .and. m>n ) .or. ( right .and. m<n ) ) then
+                 if( incr )sep( 1 ) = min( sep( 1 ), d( 1 ) )
+                 if( decr )sep( k ) = min( sep( k ), d( k ) )
+              end if
+           end if
+           ! ensure that reciprocal condition numbers are not less than
+           ! threshold, in order to limit the size of the error bound
+           eps = stdlib_slamch( 'E' )
+           safmin = stdlib_slamch( 'S' )
+           anorm = max( abs( d( 1 ) ), abs( d( k ) ) )
+           if( anorm==zero ) then
+              thresh = eps
+           else
+              thresh = max( eps*anorm, safmin )
+           end if
+           do i = 1, k
+              sep( i ) = max( sep( i ), thresh )
+           end do
+           return
+     end subroutine stdlib_sdisna
+
+     !> SGBBRD: reduces a real general m-by-n band matrix A to upper
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**T, or computes
+     !> Q**T*C for a given matrix C.
+
+     pure subroutine stdlib_sgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           real(sp), intent(out) :: d(*), e(*), pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mn,&
+                      mu, mu0, nr, nrt
+           real(sp) :: ra, rb, rc, rs
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**t to the unit matrix, if needed
+           if( wantq )call stdlib_slaset( 'FULL', m, m, zero, one, q, ldq )
+           if( wantpt )call stdlib_slaset( 'FULL', n, n, zero, one, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the sines of the plane rotations are stored in work(1:max(m,n))
+              ! and the cosines in work(max(m,n)+1:2*max(m,n)).
+              mn = max( m, n )
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_slargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              work( mn+j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_slartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,work( mn+j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_slartg( ab( ku+ml-1, i ), ab( ku+ml, i ),work( mn+i+ml-1 ), &
+                                    work( i+ml-1 ),ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_srot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,work( mn+i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_srot( m, q( 1, j-1 ), 1, q( 1, j ), 1,work( mn+j ), work( j &
+                                    ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_srot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,work( mn+j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = work( mn+j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_slargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               work( mn+j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_slartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,work( mn+j1+kun ), work( j1+kun ),kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_slartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),work( &
+                                    mn+i+mu-1 ), work( i+mu-1 ),ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_srot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,work( mn+i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**t
+                       do j = j1, j2, kb1
+                          call stdlib_srot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, work( &
+                                    mn+j+kun ),work( j+kun ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = work( mn+j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, storing diagonal elements in d
+              ! and off-diagonal elements in e
+              do i = 1, min( m-1, n )
+                 call stdlib_slartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 d( i ) = ra
+                 if( i<n ) then
+                    e( i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_srot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc, rs )
+                 if( wantc )call stdlib_srot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+              if( m<=n )d( m ) = ab( 1, m )
+           else if( ku>0 ) then
+              ! a has been reduced to upper bidiagonal form
+              if( m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right, storing diagonal elements in d and off-diagonal
+                 ! elements in e
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_slartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    d( i ) = ra
+                    if( i>1 ) then
+                       rb = -rs*ab( ku, i )
+                       e( i-1 ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_srot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, rs )
+                              
+                 end do
+              else
+                 ! copy off-diagonal elements to e and diagonal elements to d
+                 do i = 1, minmn - 1
+                    e( i ) = ab( ku, i+1 )
+                 end do
+                 do i = 1, minmn
+                    d( i ) = ab( ku+1, i )
+                 end do
+              end if
+           else
+              ! a is diagonal. set elements of e to zero and copy diagonal
+              ! elements to d.
+              do i = 1, minmn - 1
+                 e( i ) = zero
+              end do
+              do i = 1, minmn
+                 d( i ) = ab( 1, i )
+              end do
+           end if
+           return
+     end subroutine stdlib_sgbbrd
+
+     !> SGBCON: estimates the reciprocal of the condition number of a real
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by SGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_sgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(sp) :: ainvnm, scale, smlnum, t
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_saxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_slatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_slatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, ldab, &
+                           work, scale, work( 2*n+1 ),info )
+                 ! multiply by inv(l**t).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_sdot( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_isamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_srscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_sgbcon
+
+     !> SGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_sgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_sgbequ
+
+     !> SGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from SGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_sgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(sp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_slamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), abs( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), abs( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_sgbequb
+
+     !> SGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_sgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_sgbmv( trans, n, n, kl, ku, -one, ab, ldab, x( 1, j ), 1,one, work( n+1 &
+                        ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = abs( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       work( i ) = work( i ) + abs( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + abs( ab( kk+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_sgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_sgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_sgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_sgbrfs
+
+     !> SGBTRF: computes an LU factorization of a real m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_sgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           real(sp) :: temp
+           ! Local Arrays 
+           real(sp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'SGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_sgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! zero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = zero
+                 end do
+              end do
+              ! zero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = zero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to zero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = zero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to zero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = zero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_isamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=zero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_sswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_sswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_sswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_sscal( km, one / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_sger( km, jm-jj, -one, ab( kv+2, jj ), 1,ab( kv, jj+&
+                                 1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is zero, set info to the index of the pivot
+                       ! unless a zero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_scopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_slaswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_slaswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, one, ab(&
+                                  kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+1, j+jb ), &
+                                    ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -one, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, one,ab( kv+kl+1-jb, j+jb ), &
+                                    ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, one, ab(&
+                                  kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -one, ab( &
+                          kv+1+jb, j ), ldab-1,work13, ldwork, one, ab( 1+jb, j+kv ),ldab-1 )
+                                    
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -one, &
+                                    work31, ldwork, work13,ldwork, one, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_sswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_sswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_scopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_sgbtrf
+
+     !> SGECON: estimates the reciprocal of the condition number of a general
+     !> real matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by SGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_sgecon( norm, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, scale, sl, smlnum, su
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_slatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+                 ! multiply by inv(u).
+                 call stdlib_slatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, work( 3*n+1 ), info )
+              else
+                 ! multiply by inv(u**t).
+                 call stdlib_slatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, su,&
+                            work( 3*n+1 ), info )
+                 ! multiply by inv(l**t).
+                 call stdlib_slatrs( 'LOWER', 'TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           work( 2*n+1 ), info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_isamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_srscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_sgecon
+
+     !> SGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_sgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, rcmax, rcmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_sgeequ
+
+     !> SGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from SGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_sgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(sp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: c(*), r(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_slamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), abs( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), abs( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_sgeequb
+
+     !> DGEMLQT overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by SGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_sgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           real(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_slarfb( 'L', 'T', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_slarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_slarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_slarfb( 'R', 'T', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_sgemlqt
+
+     !> SGEMQRT: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'T':   Q**T C            C Q**T
+     !> where Q is a real orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**T
+     !> generated using the compact WY representation as returned by SGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_sgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           real(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_slarfb( 'L', 'T', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_slarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_slarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_slarfb( 'R', 'T', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_sgemqrt
+
+     !> SGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by SGETC2.
+
+     pure subroutine stdlib_sgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: bignum, eps, smlnum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+            ! set constant to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_slaswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_isamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = ( one / two ) / abs( rhs( i ) )
+              call stdlib_sscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*temp
+           end if
+           do i = n, 1, -1
+              temp = one / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_slaswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_sgesc2
+
+     !> SGETC2: computes an LU factorization with complete pivoting of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is the Level 2 BLAS algorithm.
+
+     pure subroutine stdlib_sgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(sp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = smlnum
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin.
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_sswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_sswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = smin
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_sger( n-i, n-i, -one, a( i+1, i ), 1, a( i, i+1 ), lda,a( i+1, i+1 ), &
+                        lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = smin
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_sgetc2
+
+     !> SGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_sgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_slamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_isamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=zero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_sswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_sscal( m-j, one / a( j, j ), a( j+1, j ), 1 )
+                    else
+                      do i = 1, m-j
+                         a( j+i, j ) = a( j+i, j ) / a( j, j )
+                      end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_sger( m-j, n-j, -one, a( j+1, j ), 1, a( j, j+1 ), lda,a( j+1, j+1 ),&
+                            lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_sgetf2
+
+     !> SGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_sgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: sfmin, temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for one row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==zero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for one column case
+              ! compute machine safe minimum
+              sfmin = stdlib_slamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_isamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=zero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_sscal( m-1, one / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_sgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_slaswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_strsm( 'L', 'L', 'N', 'U', n1, n2, one, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_sgemm( 'N', 'N', m-n1, n2, n1, -one, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, one, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_sgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_slaswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_sgetrf2
+
+     !> SGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by SGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_sgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_strtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_strtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = zero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_sgemv( 'NO TRANSPOSE', n, n-j, -one, a( 1, j+1 ),lda, work( &
+                           j+1 ), 1, one, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = zero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           one, a( 1, j+jb ), lda,work( j+jb ), ldwork, one, a( 1, j ), lda )
+                 call stdlib_strsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,one, work( j )&
+                           , ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_sswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgetri
+
+     !> SGETRS: solves a system of linear equations
+     !> A * X = B  or  A**T * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by SGETRF.
+
+     pure subroutine stdlib_sgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_slaswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,one, a, lda, b, &
+                        ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, one, a, lda,&
+                         b, ldb )
+           else
+              ! solve a**t * x = b.
+              ! solve u**t *x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,one, a, lda, b,&
+                         ldb )
+              ! solve l**t *x = b, overwriting b with x.
+              call stdlib_strsm( 'LEFT', 'LOWER', 'TRANSPOSE', 'UNIT', n, nrhs, one,a, lda, b, &
+                        ldb )
+              ! apply row interchanges to the solution vectors.
+              call stdlib_slaswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_sgetrs
+
+     !> SGGBAL: balances a pair of general real matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_sggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: lscale(*), rscale(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sclfac = 1.0e+1_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(sp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log10,max,min,real,sign
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = one
+           lscale( 1 ) = one
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=zero .or. b( i, j )/=zero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_sswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_sswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_sswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_sswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 tb = b( i, j )
+                 ta = a( i, j )
+                 if( ta==zero )go to 210
+                 ta = log10( abs( ta ) ) / basl
+                 210 continue
+                 if( tb==zero )go to 220
+                 tb = log10( abs( tb ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=sp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_sdot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_sdot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_sscal( nr, beta, work( ilo ), 1 )
+           call stdlib_sscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_saxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_saxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==zero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==zero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=sp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==zero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==zero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=sp)*work( j ) + sum
+           end do
+           sum = stdlib_sdot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_sdot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_saxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_saxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_slamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_isamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_isamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = lscale( i ) + sign( half, lscale( i ) )
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_isamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_isamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = rscale( i ) + sign( half, rscale( i ) )
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_sscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_sscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_sscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_sscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_sggbal
+
+     !> SGGHRD: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then SGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_sgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(sp) :: c, s, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_slaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = zero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 temp = a( jrow-1, jcol )
+                 call stdlib_slartg( temp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = zero
+                 call stdlib_srot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_srot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_srot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c, s )
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 temp = b( jrow, jrow )
+                 call stdlib_slartg( temp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = zero
+                 call stdlib_srot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_srot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_srot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_sgghrd
+
+     !> SGTTRS: solves one of the systems of equations
+     !> A*X = B  or  A**T*X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by SGTTRF.
+
+     pure subroutine stdlib_sgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else
+              itrans = 1
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'SGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_sgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_sgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_sgttrs
+
+     !> SISNAN: returns .TRUE. if its argument is NaN, and .FALSE.
+     !> otherwise.  To be replaced by the Fortran 2003 intrinsic in the
+     !> future.
+
+     pure logical(lk) function stdlib_sisnan( sin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: sin
+        ! =====================================================================
+        ! Executable Statements 
+           stdlib_sisnan = stdlib_slaisnan(sin,sin)
+           return
+     end function stdlib_sisnan
+
+     !> SLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_sla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'SLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = abs( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sla_gbamv
+
+     !> SLA_GBRCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(sp) function stdlib_sla_gbrcond( trans, n, kl, ku, ab, ldab, afb, ldafb,ipiv, cmode, c, &
+               info, work, iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, ldab, ldafb, kl, ku, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: ab(ldab,*), afb(ldafb,*), c(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j, kd, ke
+           real(sp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_sla_gbrcond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLA_GBRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_sla_gbrcond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                    if ( cmode == 1 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( kd+i-j, j ) * c( j ) )
+                    end do
+                    else if ( cmode == 0 ) then
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) )
+                       end do
+                    else
+                       do j = max( i-kl, 1 ), min( i+ku, n )
+                          tmp = tmp + abs( ab( kd+i-j, j ) / c( j ) )
+                       end do
+                    end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + abs( ab( ke-i+j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_sgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_sgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_sgbtrs( 'TRANSPOSE', n, kl, ku, 1, afb, ldafb, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_sgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= 0.0_sp )stdlib_sla_gbrcond = ( 1.0_sp / ainvnm )
+           return
+     end function stdlib_sla_gbrcond
+
+     !> SLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_sla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n, trans
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' )) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'SLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sla_geamv
+
+     !> SLA_GERCOND: estimates the Skeel condition number of op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(sp) function stdlib_sla_gercond( trans, n, a, lda, af, ldaf, ipiv,cmode, c, info, work, &
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, tmp
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_sla_gercond = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame(trans, 'T').and. .not. stdlib_lsame(trans, &
+                     'C') ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLA_GERCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_sla_gercond = one
+              return
+           end if
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if (notrans) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work(i) = work(i) * work(2*n+i)
+                 end do
+                 if (notrans) then
+                    call stdlib_sgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_sgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if (notrans) then
+                    call stdlib_sgetrs( 'TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_sgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= 0.0_sp )stdlib_sla_gercond = ( 1.0_sp / ainvnm )
+           return
+     end function stdlib_sla_gercond
+
+     !> SLA_LIN_BERR: computes componentwise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the componentwise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_sla_lin_berr( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: ayb(n,nrhs)
+           real(sp), intent(out) :: berr(nrhs)
+           real(sp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(sp) :: tmp,safe1
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the sla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= 0.0_sp) then
+                    tmp = (safe1+abs(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_sp (and if computed by sla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_sla_lin_berr
+
+     !> SLA_PORCOND: Estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number  cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(sp) function stdlib_sla_porcond( uplo, n, a, lda, af, ldaf, cmode, c,info, work, iwork )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(sp), intent(out) :: work(*)
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, tmp
+           logical(lk) :: up
+           ! Array Arguments 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_sla_porcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLA_PORCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_sla_porcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j ,i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if (up) then
+                    call stdlib_spotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_spotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_spotrs( 'UPPER', n, 1, af, ldaf, work, n, info )
+                 else
+                    call stdlib_spotrs( 'LOWER', n, 1, af, ldaf, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= 0.0_sp )stdlib_sla_porcond = ( 1.0_sp / ainvnm )
+           return
+     end function stdlib_sla_porcond
+
+     !> SLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_sla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), x(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(sp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) ) then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'SLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = abs( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = abs( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*abs( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sla_syamv
+
+     !> SLA_SYRCOND: estimates the Skeel condition number of  op(A) * op2(C)
+     !> where op2 is determined by CMODE as follows
+     !> CMODE =  1    op2(C) = C
+     !> CMODE =  0    op2(C) = I
+     !> CMODE = -1    op2(C) = inv(C)
+     !> The Skeel condition number cond(A) = norminf( |inv(A)||A| )
+     !> is computed by computing scaling factors R such that
+     !> diag(R)*A*op2(C) is row equilibrated and computing the standard
+     !> infinity-norm condition number.
+
+     real(sp) function stdlib_sla_syrcond( uplo, n, a, lda, af, ldaf, ipiv, cmode,c, info, work, &
+               iwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, ldaf, cmode
+           integer(ilp), intent(out) :: info
+           ! Array Arguments
+           integer(ilp), intent(out) :: iwork(*)
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), c(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           character :: normin
+           integer(ilp) :: kase, i, j
+           real(sp) :: ainvnm, smlnum, tmp
+           logical(lk) :: up
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           stdlib_sla_syrcond = zero
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLA_SYRCOND', -info )
+              return
+           end if
+           if( n==0 ) then
+              stdlib_sla_syrcond = one
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute the equilibration matrix r such that
+           ! inv(r)*a*c has unit 1-norm.
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( j, i ) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( i, j ) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( cmode == 1 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) * c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) * c( j ) )
+                    end do
+                 else if ( cmode == 0 ) then
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i ) )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + abs( a( i, j) / c( j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + abs( a( j, i) / c( j ) )
+                    end do
+                 end if
+                 work( 2*n+i ) = tmp
+              end do
+           endif
+           ! estimate the norm of inv(op(a)).
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ainvnm = zero
+           normin = 'N'
+           kase = 0
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+                 if ( up ) then
+                    call stdlib_ssytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_ssytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( cmode == 1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) / c( i )
+                    end do
+                 else if ( cmode == -1 ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_ssytrs( 'U', n, 1, af, ldaf, ipiv, work, n, info )
+                 else
+                    call stdlib_ssytrs( 'L', n, 1, af, ldaf, ipiv, work, n, info )
+                 endif
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * work( 2*n+i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= 0.0_sp )stdlib_sla_syrcond = ( 1.0_sp / ainvnm )
+           return
+     end function stdlib_sla_syrcond
+
+     !> SLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(sp) function stdlib_sla_syrpvgrw( uplo, n, info, a, lda, af, ldaf, ipiv,work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(sp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( abs( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( abs( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_ssytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k-1 ) = max( abs( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( abs( af( i, k ) ), work( k ) )
+                       work( k+1 ) = max( abs( af(i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( abs( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_sla_syrpvgrw = rpvgrw
+     end function stdlib_sla_syrpvgrw
+
+
+     pure subroutine stdlib_sladiv1( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(inout) :: a
+           real(sp), intent(in) :: b, c, d
+           real(sp), intent(out) :: p, q
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: r, t
+           ! Executable Statements 
+           r = d / c
+           t = one / (c + d * r)
+           p = stdlib_sladiv2(a, b, c, d, r, t)
+           a = -a
+           q = stdlib_sladiv2(b, a, c, d, r, t)
+           return
+     end subroutine stdlib_sladiv1
+
+     !> SLAED6: computes the positive or negative root (closest to the origin)
+     !> of
+     !> z(1)        z(2)        z(3)
+     !> f(x) =   rho + --------- + ---------- + ---------
+     !> d(1)-x      d(2)-x      d(3)-x
+     !> It is assumed that
+     !> if ORGATI = .true. the root is between d(2) and d(3);
+     !> otherwise it is between d(1) and d(2)
+     !> This routine will be called by SLAED4 when necessary. In most cases,
+     !> the root sought is the smallest in magnitude, though it might not be
+     !> in some extremely rare situations.
+
+     pure subroutine stdlib_slaed6( kniter, orgati, rho, d, z, finit, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: orgati
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kniter
+           real(sp), intent(in) :: finit, rho
+           real(sp), intent(out) :: tau
+           ! Array Arguments 
+           real(sp), intent(in) :: d(3), z(3)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           
+           
+           ! Local Arrays 
+           real(sp) :: dscale(3), zscale(3)
+           ! Local Scalars 
+           logical(lk) :: scale
+           integer(ilp) :: i, iter, niter
+           real(sp) :: a, b, base, c, ddf, df, eps, erretm, eta, f, fc, sclfac, sclinv, small1, &
+                     small2, sminv1, sminv2, temp, temp1, temp2, temp3, temp4, lbd, ubd
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,min,sqrt
+           ! Executable Statements 
+           info = 0
+           if( orgati ) then
+              lbd = d(2)
+              ubd = d(3)
+           else
+              lbd = d(1)
+              ubd = d(2)
+           end if
+           if( finit < zero )then
+              lbd = zero
+           else
+              ubd = zero
+           end if
+           niter = 1
+           tau = zero
+           if( kniter==2 ) then
+              if( orgati ) then
+                 temp = ( d( 3 )-d( 2 ) ) / two
+                 c = rho + z( 1 ) / ( ( d( 1 )-d( 2 ) )-temp )
+                 a = c*( d( 2 )+d( 3 ) ) + z( 2 ) + z( 3 )
+                 b = c*d( 2 )*d( 3 ) + z( 2 )*d( 3 ) + z( 3 )*d( 2 )
+              else
+                 temp = ( d( 1 )-d( 2 ) ) / two
+                 c = rho + z( 3 ) / ( ( d( 3 )-d( 2 ) )-temp )
+                 a = c*( d( 1 )+d( 2 ) ) + z( 1 ) + z( 2 )
+                 b = c*d( 1 )*d( 2 ) + z( 1 )*d( 2 ) + z( 2 )*d( 1 )
+              end if
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 tau = b / a
+              else if( a<=zero ) then
+                 tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( tau < lbd .or. tau > ubd )tau = ( lbd+ubd )/two
+              if( d(1)==tau .or. d(2)==tau .or. d(3)==tau ) then
+                 tau = zero
+              else
+                 temp = finit + tau*z(1)/( d(1)*( d( 1 )-tau ) ) +tau*z(2)/( d(2)*( d( 2 )-tau ) )&
+                            +tau*z(3)/( d(3)*( d( 3 )-tau ) )
+                 if( temp <= zero )then
+                    lbd = tau
+                 else
+                    ubd = tau
+                 end if
+                 if( abs( finit )<=abs( temp ) )tau = zero
+              end if
+           end if
+           ! get machine parameters for possible scaling to avoid overflow
+           ! modified by sven: parameters small1, sminv1, small2,
+           ! sminv2, eps are not saved anymore between one call to the
+           ! others but recomputed at each call
+           eps = stdlib_slamch( 'EPSILON' )
+           base = stdlib_slamch( 'BASE' )
+           small1 = base**( int( log( stdlib_slamch( 'SAFMIN' ) ) / log( base ) /three,KIND=ilp) )
+                     
+           sminv1 = one / small1
+           small2 = small1*small1
+           sminv2 = sminv1*sminv1
+           ! determine if scaling of inputs necessary to avoid overflow
+           ! when computing 1/temp**3
+           if( orgati ) then
+              temp = min( abs( d( 2 )-tau ), abs( d( 3 )-tau ) )
+           else
+              temp = min( abs( d( 1 )-tau ), abs( d( 2 )-tau ) )
+           end if
+           scale = .false.
+           if( temp<=small1 ) then
+              scale = .true.
+              if( temp<=small2 ) then
+              ! scale up by power of radix nearest 1/safmin**(2/3)
+                 sclfac = sminv2
+                 sclinv = small2
+              else
+              ! scale up by power of radix nearest 1/safmin**(1/3)
+                 sclfac = sminv1
+                 sclinv = small1
+              end if
+              ! scaling up safe because d, z, tau scaled elsewhere to be o(1)
+              do i = 1, 3
+                 dscale( i ) = d( i )*sclfac
+                 zscale( i ) = z( i )*sclfac
+              end do
+              tau = tau*sclfac
+              lbd = lbd*sclfac
+              ubd = ubd*sclfac
+           else
+              ! copy d and z to dscale and zscale
+              do i = 1, 3
+                 dscale( i ) = d( i )
+                 zscale( i ) = z( i )
+              end do
+           end if
+           fc = zero
+           df = zero
+           ddf = zero
+           do i = 1, 3
+              temp = one / ( dscale( i )-tau )
+              temp1 = zscale( i )*temp
+              temp2 = temp1*temp
+              temp3 = temp2*temp
+              fc = fc + temp1 / dscale( i )
+              df = df + temp2
+              ddf = ddf + temp3
+           end do
+           f = finit + tau*fc
+           if( abs( f )<=zero )go to 60
+           if( f <= zero )then
+              lbd = tau
+           else
+              ubd = tau
+           end if
+              ! iteration begins -- use gragg-thornton-warner cubic convergent
+                                  ! scheme
+           ! it is not hard to see that
+                 ! 1) iterations will go up monotonically
+                    ! if finit < 0;
+                 ! 2) iterations will go down monotonically
+                    ! if finit > 0.
+           iter = niter + 1
+           loop_50: do niter = iter, maxit
+              if( orgati ) then
+                 temp1 = dscale( 2 ) - tau
+                 temp2 = dscale( 3 ) - tau
+              else
+                 temp1 = dscale( 1 ) - tau
+                 temp2 = dscale( 2 ) - tau
+              end if
+              a = ( temp1+temp2 )*f - temp1*temp2*df
+              b = temp1*temp2*f
+              c = f - ( temp1+temp2 )*df + temp1*temp2*ddf
+              temp = max( abs( a ), abs( b ), abs( c ) )
+              a = a / temp
+              b = b / temp
+              c = c / temp
+              if( c==zero ) then
+                 eta = b / a
+              else if( a<=zero ) then
+                 eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              if( f*eta>=zero ) then
+                 eta = -f / df
+              end if
+              tau = tau + eta
+              if( tau < lbd .or. tau > ubd )tau = ( lbd + ubd )/two
+              fc = zero
+              erretm = zero
+              df = zero
+              ddf = zero
+              do i = 1, 3
+                 if ( ( dscale( i )-tau )/=zero ) then
+                    temp = one / ( dscale( i )-tau )
+                    temp1 = zscale( i )*temp
+                    temp2 = temp1*temp
+                    temp3 = temp2*temp
+                    temp4 = temp1 / dscale( i )
+                    fc = fc + temp4
+                    erretm = erretm + abs( temp4 )
+                    df = df + temp2
+                    ddf = ddf + temp3
+                 else
+                    go to 60
+                 end if
+              end do
+              f = finit + tau*fc
+              erretm = eight*( abs( finit )+abs( tau )*erretm ) +abs( tau )*df
+              if( ( abs( f )<=four*eps*erretm ) .or.( (ubd-lbd)<=four*eps*abs(tau) )  )go to &
+                        60
+              if( f <= zero )then
+                 lbd = tau
+              else
+                 ubd = tau
+              end if
+           end do loop_50
+           info = 1
+           60 continue
+           ! undo scaling
+           if( scale )tau = tau*sclinv
+           return
+     end subroutine stdlib_slaed6
+
+     !> SLAGS2: computes 2-by-2 orthogonal matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**T *A*Q = U**T *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**T*B*Q = V**T *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**T *A*Q = U**T *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**T*B*Q = V**T*( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> The rows of the transformed A and B are parallel, where
+     !> U = (  CSU  SNU ), V = (  CSV SNV ), Q = (  CSQ   SNQ )
+     !> ( -SNU  CSU )      ( -SNV CSV )      ( -SNQ   CSQ )
+     !> Z**T denotes the transpose of Z.
+
+     pure subroutine stdlib_slags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(sp), intent(in) :: a1, a2, a3, b1, b2, b3
+           real(sp), intent(out) :: csq, csu, csv, snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: a, aua11, aua12, aua21, aua22, avb11, avb12, avb21, avb22, csl, csr, d, s1,&
+            s2, snl, snr, ua11r, ua22r, vb11r, vb22r, b, c, r, ua11, ua12, ua21, ua22, vb11, vb12,&
+                       vb21, vb22
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_slasv2( a, b, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + snr*b3
+                 aua12 = abs( csl )*abs( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**t *a and v**t *b
+                 if( ( abs( ua11r )+abs( ua12 ) )/=zero ) then
+                    if( aua12 / ( abs( ua11r )+abs( ua12 ) )<=avb12 /( abs( vb11r )+abs( vb12 ) ) &
+                              ) then
+                       call stdlib_slartg( -ua11r, ua12, csq, snq, r )
+                    else
+                       call stdlib_slartg( -vb11r, vb12, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_slartg( -vb11r, vb12, csq, snq, r )
+                 end if
+                 csu = csl
+                 snu = -snl
+                 csv = csr
+                 snv = -snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,2) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snl*a1
+                 ua22 = -snl*a2 + csl*a3
+                 vb21 = -snr*b1
+                 vb22 = -snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua21 )+abs( ua22 ) )/=zero ) then
+                    if( aua22 / ( abs( ua21 )+abs( ua22 ) )<=avb22 /( abs( vb21 )+abs( vb22 ) ) ) &
+                              then
+                       call stdlib_slartg( -ua21, ua22, csq, snq, r )
+                    else
+                       call stdlib_slartg( -vb21, vb22, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_slartg( -vb21, vb22, csq, snq, r )
+                 end if
+                 csu = snl
+                 snu = csl
+                 csv = snr
+                 snv = csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              ! the svd of real 2-by-2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_slasv2( a, c, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**t *a and v**t *b,
+                 ! and (2,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua21 = -snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs( b2 )
+                 ! zero (2,1) elements of u**t *a and v**t *b.
+                 if( ( abs( ua21 )+abs( ua22r ) )/=zero ) then
+                    if( aua21 / ( abs( ua21 )+abs( ua22r ) )<=avb21 /( abs( vb21 )+abs( vb22r ) ) &
+                              ) then
+                       call stdlib_slartg( ua22r, ua21, csq, snq, r )
+                    else
+                       call stdlib_slartg( vb22r, vb21, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_slartg( vb22r, vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -snr
+                 csv = csl
+                 snv = -snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**t *a and v**t *b,
+                 ! and (1,1) element of |u|**t *|a| and |v|**t *|b|.
+                 ua11 = csr*a1 + snr*a2
+                 ua12 = snr*a3
+                 vb11 = csl*b1 + snl*b2
+                 vb12 = snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs( b2 )
+                 ! zero (1,1) elements of u**t*a and v**t*b, and then swap.
+                 if( ( abs( ua11 )+abs( ua12 ) )/=zero ) then
+                    if( aua11 / ( abs( ua11 )+abs( ua12 ) )<=avb11 /( abs( vb11 )+abs( vb12 ) ) ) &
+                              then
+                       call stdlib_slartg( ua12, ua11, csq, snq, r )
+                    else
+                       call stdlib_slartg( vb12, vb11, csq, snq, r )
+                    end if
+                 else
+                    call stdlib_slartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = csr
+                 csv = snl
+                 snv = csl
+              end if
+           end if
+           return
+     end subroutine stdlib_slags2
+
+     !> SLAGTF: factorizes the matrix (T - lambda*I), where T is an n by n
+     !> tridiagonal matrix and lambda is a scalar, as
+     !> T - lambda*I = PLU,
+     !> where P is a permutation matrix, L is a unit lower tridiagonal matrix
+     !> with at most one non-zero sub-diagonal elements per column and U is
+     !> an upper triangular matrix with at most two non-zero super-diagonal
+     !> elements per column.
+     !> The factorization is obtained by Gaussian elimination with partial
+     !> pivoting and implicit row scaling.
+     !> The parameter LAMBDA is included in the routine so that SLAGTF may
+     !> be used, in conjunction with SLAGTS, to obtain eigenvectors of T by
+     !> inverse iteration.
+
+     pure subroutine stdlib_slagtf( n, a, lambda, b, c, tol, d, in, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: lambda, tol
+           ! Array Arguments 
+           integer(ilp), intent(out) :: in(*)
+           real(sp), intent(inout) :: a(*), b(*), c(*)
+           real(sp), intent(out) :: d(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(sp) :: eps, mult, piv1, piv2, scale1, scale2, temp, tl
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SLAGTF', -info )
+              return
+           end if
+           if( n==0 )return
+           a( 1 ) = a( 1 ) - lambda
+           in( n ) = 0
+           if( n==1 ) then
+              if( a( 1 )==zero )in( 1 ) = 1
+              return
+           end if
+           eps = stdlib_slamch( 'EPSILON' )
+           tl = max( tol, eps )
+           scale1 = abs( a( 1 ) ) + abs( b( 1 ) )
+           loop_10: do k = 1, n - 1
+              a( k+1 ) = a( k+1 ) - lambda
+              scale2 = abs( c( k ) ) + abs( a( k+1 ) )
+              if( k<( n-1 ) )scale2 = scale2 + abs( b( k+1 ) )
+              if( a( k )==zero ) then
+                 piv1 = zero
+              else
+                 piv1 = abs( a( k ) ) / scale1
+              end if
+              if( c( k )==zero ) then
+                 in( k ) = 0
+                 piv2 = zero
+                 scale1 = scale2
+                 if( k<( n-1 ) )d( k ) = zero
+              else
+                 piv2 = abs( c( k ) ) / scale2
+                 if( piv2<=piv1 ) then
+                    in( k ) = 0
+                    scale1 = scale2
+                    c( k ) = c( k ) / a( k )
+                    a( k+1 ) = a( k+1 ) - c( k )*b( k )
+                    if( k<( n-1 ) )d( k ) = zero
+                 else
+                    in( k ) = 1
+                    mult = a( k ) / c( k )
+                    a( k ) = c( k )
+                    temp = a( k+1 )
+                    a( k+1 ) = b( k ) - mult*temp
+                    if( k<( n-1 ) ) then
+                       d( k ) = b( k+1 )
+                       b( k+1 ) = -mult*d( k )
+                    end if
+                    b( k ) = temp
+                    c( k ) = mult
+                 end if
+              end if
+              if( ( max( piv1, piv2 )<=tl ) .and. ( in( n )==0 ) )in( n ) = k
+           end do loop_10
+           if( ( abs( a( n ) )<=scale1*tl ) .and. ( in( n )==0 ) )in( n ) = n
+           return
+     end subroutine stdlib_slagtf
+
+     !> SLAGTS: may be used to solve one of the systems of equations
+     !> (T - lambda*I)*x = y   or   (T - lambda*I)**T*x = y,
+     !> where T is an n by n tridiagonal matrix, for x, following the
+     !> factorization of (T - lambda*I) as
+     !> (T - lambda*I) = P*L*U ,
+     !> by routine SLAGTF. The choice of equation to be solved is
+     !> controlled by the argument JOB, and in each case there is an option
+     !> to perturb zero or very small diagonal elements of U, this option
+     !> being intended for use in applications such as inverse iteration.
+
+     pure subroutine stdlib_slagts( job, n, a, b, c, d, in, y, tol, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: job, n
+           real(sp), intent(inout) :: tol
+           ! Array Arguments 
+           integer(ilp), intent(in) :: in(*)
+           real(sp), intent(in) :: a(*), b(*), c(*), d(*)
+           real(sp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: k
+           real(sp) :: absak, ak, bignum, eps, pert, sfmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign
+           ! Executable Statements 
+           info = 0
+           if( ( abs( job )>2 ) .or. ( job==0 ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAGTS', -info )
+              return
+           end if
+           if( n==0 )return
+           eps = stdlib_slamch( 'EPSILON' )
+           sfmin = stdlib_slamch( 'SAFE MINIMUM' )
+           bignum = one / sfmin
+           if( job<0 ) then
+              if( tol<=zero ) then
+                 tol = abs( a( 1 ) )
+                 if( n>1 )tol = max( tol, abs( a( 2 ) ), abs( b( 1 ) ) )
+                 do k = 3, n
+                    tol = max( tol, abs( a( k ) ), abs( b( k-1 ) ),abs( d( k-2 ) ) )
+                 end do
+                 tol = tol*eps
+                 if( tol==zero )tol = eps
+              end if
+           end if
+           if( abs( job )==1 ) then
+              do k = 2, n
+                 if( in( k-1 )==0 ) then
+                    y( k ) = y( k ) - c( k-1 )*y( k-1 )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+              if( job==1 ) then
+                 loop_30: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_30
+              else
+                 loop_50: do k = n, 1, -1
+                    if( k<=n-2 ) then
+                       temp = y( k ) - b( k )*y( k+1 ) - d( k )*y( k+2 )
+                    else if( k==n-1 ) then
+                       temp = y( k ) - b( k )*y( k+1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    40 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 40
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 40
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_50
+              end if
+           else
+              ! come to here if  job = 2 or -2
+              if( job==2 ) then
+                 loop_60: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             info = k
+                             return
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          info = k
+                          return
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_60
+              else
+                 loop_80: do k = 1, n
+                    if( k>=3 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 ) - d( k-2 )*y( k-2 )
+                    else if( k==2 ) then
+                       temp = y( k ) - b( k-1 )*y( k-1 )
+                    else
+                       temp = y( k )
+                    end if
+                    ak = a( k )
+                    pert = sign( tol, ak )
+                    70 continue
+                    absak = abs( ak )
+                    if( absak<one ) then
+                       if( absak<sfmin ) then
+                          if( absak==zero .or. abs( temp )*sfmin>absak )then
+                             ak = ak + pert
+                             pert = 2*pert
+                             go to 70
+                          else
+                             temp = temp*bignum
+                             ak = ak*bignum
+                          end if
+                       else if( abs( temp )>absak*bignum ) then
+                          ak = ak + pert
+                          pert = 2*pert
+                          go to 70
+                       end if
+                    end if
+                    y( k ) = temp / ak
+                 end do loop_80
+              end if
+              do k = n, 2, -1
+                 if( in( k-1 )==0 ) then
+                    y( k-1 ) = y( k-1 ) - c( k-1 )*y( k )
+                 else
+                    temp = y( k-1 )
+                    y( k-1 ) = y( k )
+                    y( k ) = temp - c( k-1 )*y( k )
+                 end if
+              end do
+           end if
+     end subroutine stdlib_slagts
+
+     !> SLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then SLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L      0  ]
+     !> Lhat = [ w**T gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**T and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ alpha ]
+     !> [ gamma ]
+     !> where  alpha =  x**T*w.
+
+     pure subroutine stdlib_slaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(sp), intent(out) :: c, s, sestpr
+           real(sp), intent(in) :: gamma, sest
+           ! Array Arguments 
+           real(sp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: absalp, absest, absgam, alpha, b, cosine, eps, norma, s1, s2, sine, t, &
+                     test, tmp, zeta1, zeta2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           eps = stdlib_slamch( 'EPSILON' )
+           alpha = stdlib_sdot( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = sqrt( s*s+c*c )
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = s2*s
+                    c = ( gamma / s2 ) / s
+                    s = sign( one, alpha ) / s
+                 else
+                    tmp = s2 / s1
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = s1*c
+                    s = ( alpha / s1 ) / c
+                    c = sign( one, gamma ) / c
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = c / ( b+sqrt( b*b+c ) )
+                 else
+                    t = sqrt( b*b+c ) - b
+                 end if
+                 sine = -zeta1 / t
+                 cosine = -zeta2 / ( one+t )
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -gamma
+                    cosine = alpha
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = sqrt( s*s+c*c )
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    c = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / c )
+                    s = -( gamma / s2 ) / c
+                    c = sign( one, alpha ) / c
+                 else
+                    tmp = s2 / s1
+                    s = sqrt( one+tmp*tmp )
+                    sestpr = absest / s
+                    c = ( alpha / s1 ) / s
+                    s = -sign( one, gamma ) / s
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = alpha / absest
+                 zeta2 = gamma / absest
+                 norma = max( one+zeta1*zeta1+abs( zeta1*zeta2 ),abs( zeta1*zeta2 )+zeta2*zeta2 )
+                           
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = c / ( b+sqrt( abs( b*b-c ) ) )
+                    sine = zeta1 / ( one-t )
+                    cosine = -zeta2 / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = -c / ( b+sqrt( b*b+c ) )
+                    else
+                       t = b - sqrt( b*b+c )
+                    end if
+                    sine = -zeta1 / t
+                    cosine = -zeta2 / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = sqrt( sine*sine+cosine*cosine )
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_slaic1
+
+     !> SLANEG: computes the Sturm count, the number of negative pivots
+     !> encountered while factoring tridiagonal T - sigma I = L D L^T.
+     !> This implementation works directly on the factors without forming
+     !> the tridiagonal matrix T.  The Sturm count is also the number of
+     !> eigenvalues of T less than sigma.
+     !> This routine is called from SLARRB.
+     !> The current routine does not use the PIVMIN parameter but rather
+     !> requires IEEE-754 propagation of Infinities and NaNs.  This
+     !> routine also has no input range restrictions but does require
+     !> default exception handling such that x/0 produces Inf when x is
+     !> non-zero, and Inf/Inf produces NaN.  For more information, see:
+     !> Marques, Riedy, and Voemel, "Benefits of IEEE-754 Features in
+     !> Modern Symmetric Tridiagonal Eigensolvers," SIAM Journal on
+     !> Scientific Computing, v28, n5, 2006.  DOI 10.1137/050641624
+     !> (Tech report version in LAWN 172 with the same title.)
+
+     pure integer(ilp) function stdlib_slaneg( n, d, lld, sigma, pivmin, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, r
+           real(sp), intent(in) :: pivmin, sigma
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), lld(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: blklen = 128
+           
+           ! some architectures propagate infinities and nans very slowly, so
+           ! the code computes counts in blklen chunks.  then a nan can
+           ! propagate at most blklen columns before being detected.  this is
+           ! not a general tuning parameter; it needs only to be just large
+           ! enough that the overhead is tiny in common cases.
+           
+           ! Local Scalars 
+           integer(ilp) :: bj, j, neg1, neg2, negcnt
+           real(sp) :: bsav, dminus, dplus, gamma, p, t, tmp
+           logical(lk) :: sawnan
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           negcnt = 0
+           ! i) upper part: l d l^t - sigma i = l+ d+ l+^t
+           t = -sigma
+           loop_210: do bj = 1, r-1, blklen
+              neg1 = 0
+              bsav = t
+              do j = bj, min(bj+blklen-1, r-1)
+                 dplus = d( j ) + t
+                 if( dplus<zero ) neg1 = neg1 + 1
+                 tmp = t / dplus
+                 t = tmp * lld( j ) - sigma
+              end do
+              sawnan = stdlib_sisnan( t )
+           ! run a slower version of the above loop if a nan is detected.
+           ! a nan should occur only with a zero pivot after an infinite
+           ! pivot.  in that case, substituting 1 for t/dplus is the
+           ! correct limit.
+              if( sawnan ) then
+                 neg1 = 0
+                 t = bsav
+                 do j = bj, min(bj+blklen-1, r-1)
+                    dplus = d( j ) + t
+                    if( dplus<zero ) neg1 = neg1 + 1
+                    tmp = t / dplus
+                    if (stdlib_sisnan(tmp)) tmp = one
+                    t = tmp * lld(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg1
+           end do loop_210
+           ! ii) lower part: l d l^t - sigma i = u- d- u-^t
+           p = d( n ) - sigma
+           do bj = n-1, r, -blklen
+              neg2 = 0
+              bsav = p
+              do j = bj, max(bj-blklen+1, r), -1
+                 dminus = lld( j ) + p
+                 if( dminus<zero ) neg2 = neg2 + 1
+                 tmp = p / dminus
+                 p = tmp * d( j ) - sigma
+              end do
+              sawnan = stdlib_sisnan( p )
+           ! as above, run a slower version that substitutes 1 for inf/inf.
+              if( sawnan ) then
+                 neg2 = 0
+                 p = bsav
+                 do j = bj, max(bj-blklen+1, r), -1
+                    dminus = lld( j ) + p
+                    if( dminus<zero ) neg2 = neg2 + 1
+                    tmp = p / dminus
+                    if (stdlib_sisnan(tmp)) tmp = one
+                    p = tmp * d(j) - sigma
+                 end do
+              end if
+              negcnt = negcnt + neg2
+           end do
+           ! iii) twist index
+             ! t was shifted by sigma initially.
+           gamma = (t + sigma) + p
+           if( gamma<zero ) negcnt = negcnt+1
+           stdlib_slaneg = negcnt
+     end function stdlib_slaneg
+
+     !> SLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(sp)             function stdlib_slangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(sp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_slassq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slangb = value
+           return
+     end function stdlib_slangb
+
+     !> SLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real matrix A.
+
+     real(sp)             function stdlib_slange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_sisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_slassq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slange = value
+           return
+     end function stdlib_slange
+
+     !> SLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real tridiagonal matrix A.
+
+     pure real(sp)             function stdlib_slangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_sisnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_sisnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_sisnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_sisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_slassq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_slassq( n-1, dl, 1, scale, sum )
+                 call stdlib_slassq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_slangt = anorm
+           return
+     end function stdlib_slangt
+
+     !> SLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(sp)             function stdlib_slanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_slassq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slanhs = value
+           return
+     end function stdlib_slanhs
+
+     !> SLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(sp)             function stdlib_slansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_slassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_slassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_slassq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slansb = value
+           return
+     end function stdlib_slansb
+
+     !> SLANSF: returns the value of the one norm, or the Frobenius norm, or
+     !> the infinity norm, or the element of largest absolute value of a
+     !> real symmetric matrix A in RFP format.
+
+     real(sp) function stdlib_slansf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(0:*)
+           real(sp), intent(out) :: work(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(sp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_slansf = zero
+              return
+           else if( n==1 ) then
+              stdlib_slansf = abs( a(0) )
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='t or 't' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'T' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                 ! a is n by k
+                    do j = 0, k - 1
+                       do i = 0, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n
+                    do j = 0, n - 1
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    do j = 0, k - 1
+                       do i = 0, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end do
+                 else
+                    ! xpose case; a is k by n+1
+                    do j = 0, n
+                       do i = 0, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              if( ifm==1 ) then
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( a( 0+j*lda ) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( a( 0+j*lda ) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                           ! i=j-1-k
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work ( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( a( 0 ) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( a( i+j*lda ) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( a( i+j*lda ) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( a( i+j*lda ) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_sisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_slassq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_slassq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k-1, a( k ), lda+1, scale, s )
+                       ! tri l at a(k,0)
+                       call stdlib_slassq( k, a( k-1 ), lda+1, scale, s )
+                       ! tri u at a(k-1,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_slassq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_slassq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri l at a(0,0)
+                       call stdlib_slassq( k-1, a( 0+lda ), lda+1, scale, s )
+                       ! tri u at a(0,1)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 2
+                          call stdlib_slassq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_slassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_slassq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k-1, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k)
+                       call stdlib_slassq( k, a( 0+( k-1 )*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k-1)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_slassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_slassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_slassq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                       call stdlib_slassq( k-1, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_slassq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                          ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_slassq( k+j, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( k+1 ), lda+1, scale, s )
+                       ! tri l at a(k+1,0)
+                       call stdlib_slassq( k, a( k ), lda+1, scale, s )
+                       ! tri u at a(k,0)
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_slassq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_slassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( 1 ), lda+1, scale, s )
+                       ! tri l at a(1,0)
+                       call stdlib_slassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**t is upper
+                       do j = 1, k - 1
+                          call stdlib_slassq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                          ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_slassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_slassq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                          ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( 0+( k+1 )*lda ), lda+1, scale, s )
+                       ! tri u at a(0,k+1)
+                       call stdlib_slassq( k, a( 0+k*lda ), lda+1, scale, s )
+                       ! tri l at a(0,k)
+                    else
+                       ! a**t is lower
+                       do j = 1, k - 1
+                          call stdlib_slassq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_slassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_slassq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       call stdlib_slassq( k, a( lda ), lda+1, scale, s )
+                       ! tri l at a(0,1)
+                       call stdlib_slassq( k, a( 0 ), lda+1, scale, s )
+                       ! tri u at a(0,0)
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_slansf = value
+           return
+     end function stdlib_slansf
+
+     !> SLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A,  supplied in packed form.
+
+     real(sp)             function stdlib_slansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_slassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_slassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( ap( k )/=zero ) then
+                    absa = abs( ap( k ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slansp = value
+           return
+     end function stdlib_slansp
+
+     !> SLANST:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure real(sp)             function stdlib_slanst( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum = abs( d( i ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_sisnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_slassq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_slassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_slanst = anorm
+           return
+     end function stdlib_slanst
+
+     !> SLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> real symmetric matrix A.
+
+     real(sp)             function stdlib_slansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_slassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_slassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_slassq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slansy = value
+           return
+     end function stdlib_slansy
+
+     !> SLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(sp)             function stdlib_slantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_slassq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_slassq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_slassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_slassq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slantb = value
+           return
+     end function stdlib_slantb
+
+     !> SLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(sp)             function stdlib_slantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_slassq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_slassq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_slassq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_slassq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slantp = value
+           return
+     end function stdlib_slantp
+
+     !> SLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(sp)             function stdlib_slantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(sp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_sisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_slassq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_slassq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_slassq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_slassq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_slantr = value
+           return
+     end function stdlib_slantr
+
+     !> SLAORHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a real general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine SORHR_COL. In SORHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine SLAORHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_slaorhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAORHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'SLAORHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_slaorhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_slaorhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_slaorhr_col_getrfnp
+
+     !> SLAPY2: returns sqrt(x**2+y**2), taking care not to cause unnecessary
+     !> overflow and unnecessary underflow.
+
+     pure real(sp)             function stdlib_slapy2( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: x, y
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(sp) :: w, xabs, yabs, z, hugeval
+           logical(lk) :: x_is_nan, y_is_nan
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           x_is_nan = stdlib_sisnan( x )
+           y_is_nan = stdlib_sisnan( y )
+           if ( x_is_nan ) stdlib_slapy2 = x
+           if ( y_is_nan ) stdlib_slapy2 = y
+           hugeval = stdlib_slamch( 'OVERFLOW' )
+           if ( .not.( x_is_nan.or.y_is_nan ) ) then
+              xabs = abs( x )
+              yabs = abs( y )
+              w = max( xabs, yabs )
+              z = min( xabs, yabs )
+              if( z==zero .or. w>hugeval ) then
+                 stdlib_slapy2 = w
+              else
+                 stdlib_slapy2 = w*sqrt( one+( z / w )**2 )
+              end if
+           end if
+           return
+     end function stdlib_slapy2
+
+     !> Given a 3-by-3 matrix pencil (A,B), SLAQZ1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (A - (beta2*sr2 - i*si)*B)*B^(-1)*(beta1*A - (sr2 + i*si2)*B)*B^(-1).
+     !> It is assumed that either
+     !> 1) sr1 = sr2
+     !> or
+     !> 2) si = 0.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QZ algorithm.
+
+     pure subroutine stdlib_slaqz1( a, lda, b, ldb, sr1, sr2, si, beta1, beta2,v )
+           ! arguments
+           integer(ilp), intent( in ) :: lda, ldb
+           real(sp), intent( in ) :: a( lda, * ), b( ldb, * ), sr1, sr2, si,beta1, beta2
+           real(sp), intent( out ) :: v( * )
+           
+           ! local scalars
+           real(sp) :: w(2), safmin, safmax, scale1, scale2
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           ! calculate first shifted vector
+           w( 1 ) = beta1*a( 1, 1 )-sr1*b( 1, 1 )
+           w( 2 ) = beta1*a( 2, 1 )-sr1*b( 2, 1 )
+           scale1 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale1 >= safmin .and. scale1 <= safmax ) then
+              w( 1 ) = w( 1 )/scale1
+              w( 2 ) = w( 2 )/scale1
+           end if
+           ! solve linear system
+           w( 2 ) = w( 2 )/b( 2, 2 )
+           w( 1 ) = ( w( 1 )-b( 1, 2 )*w( 2 ) )/b( 1, 1 )
+           scale2 = sqrt( abs( w( 1 ) ) ) * sqrt( abs( w( 2 ) ) )
+           if( scale2 >= safmin .and. scale2 <= safmax ) then
+              w( 1 ) = w( 1 )/scale2
+              w( 2 ) = w( 2 )/scale2
+           end if
+           ! apply second shift
+           v( 1 ) = beta2*( a( 1, 1 )*w( 1 )+a( 1, 2 )*w( 2 ) )-sr2*( b( 1,1 )*w( 1 )+b( 1, 2 )*w(&
+                      2 ) )
+           v( 2 ) = beta2*( a( 2, 1 )*w( 1 )+a( 2, 2 )*w( 2 ) )-sr2*( b( 2,1 )*w( 1 )+b( 2, 2 )*w(&
+                      2 ) )
+           v( 3 ) = beta2*( a( 3, 1 )*w( 1 )+a( 3, 2 )*w( 2 ) )-sr2*( b( 3,1 )*w( 1 )+b( 3, 2 )*w(&
+                      2 ) )
+           ! account for imaginary part
+           v( 1 ) = v( 1 )+si*si*b( 1, 1 )/scale1/scale2
+           ! check for overflow
+           if( abs( v( 1 ) )>safmax .or. abs( v( 2 ) ) > safmax .or.abs( v( 3 ) )>safmax .or. &
+           stdlib_sisnan( v( 1 ) ) .or.stdlib_sisnan( v( 2 ) ) .or. stdlib_sisnan( v( 3 ) ) ) &
+                     then
+              v( 1 ) = zero
+              v( 2 ) = zero
+              v( 3 ) = zero
+           end if
+     end subroutine stdlib_slaqz1
+
+     !> SLAQZ2: chases a 2x2 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_slaqz2( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           ! local variables
+           real(sp) :: h(2,3), c1, s1, c2, s2, temp
+           if( k+2 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              h = b( ihi-1:ihi, ihi-2:ihi )
+              ! make h upper triangular
+              call stdlib_slartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_srot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              call stdlib_slartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_srot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_slartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              call stdlib_srot( ihi-istartm+1, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_srot( ihi-istartm+1, b( istartm, ihi-1 ), 1, b( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              b( ihi-1, ihi-2 ) = zero
+              b( ihi, ihi-2 ) = zero
+              call stdlib_srot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              call stdlib_srot( ihi-istartm+1, a( istartm, ihi-1 ), 1, a( istartm,ihi-2 ), 1, c2, &
+                        s2 )
+              if ( ilz ) then
+                 call stdlib_srot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+                 call stdlib_srot( nz, z( 1, ihi-1-zstart+1 ), 1, z( 1,ihi-2-zstart+1 ), 1, c2, &
+                           s2 )
+              end if
+              call stdlib_slartg( a( ihi-1, ihi-2 ), a( ihi, ihi-2 ), c1, s1,temp )
+              a( ihi-1, ihi-2 ) = temp
+              a( ihi, ihi-2 ) = zero
+              call stdlib_srot( istopm-ihi+2, a( ihi-1, ihi-1 ), lda, a( ihi,ihi-1 ), lda, c1, s1 &
+                        )
+              call stdlib_srot( istopm-ihi+2, b( ihi-1, ihi-1 ), ldb, b( ihi,ihi-1 ), ldb, c1, s1 &
+                        )
+              if ( ilq ) then
+                 call stdlib_srot( nq, q( 1, ihi-1-qstart+1 ), 1, q( 1, ihi-qstart+1 ), 1, c1, s1 &
+                           )
+              end if
+              call stdlib_slartg( b( ihi, ihi ), b( ihi, ihi-1 ), c1, s1, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = zero
+              call stdlib_srot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c1, s1 )
+                        
+              call stdlib_srot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c1, &
+                        s1 )
+              if ( ilz ) then
+                 call stdlib_srot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c1, s1 &
+                           )
+              end if
+           else
+              ! normal operation, move bulge down
+              h = b( k+1:k+2, k:k+2 )
+              ! make h upper triangular
+              call stdlib_slartg( h( 1, 1 ), h( 2, 1 ), c1, s1, temp )
+              h( 2, 1 ) = zero
+              h( 1, 1 ) = temp
+              call stdlib_srot( 2, h( 1, 2 ), 2, h( 2, 2 ), 2, c1, s1 )
+              ! calculate z1 and z2
+              call stdlib_slartg( h( 2, 3 ), h( 2, 2 ), c1, s1, temp )
+              call stdlib_srot( 1, h( 1, 3 ), 1, h( 1, 2 ), 1, c1, s1 )
+              call stdlib_slartg( h( 1, 2 ), h( 1, 1 ), c2, s2, temp )
+              ! apply transformations from the right
+              call stdlib_srot( k+3-istartm+1, a( istartm, k+2 ), 1, a( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_srot( k+3-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c2, s2 )
+                        
+              call stdlib_srot( k+2-istartm+1, b( istartm, k+2 ), 1, b( istartm,k+1 ), 1, c1, s1 )
+                        
+              call stdlib_srot( k+2-istartm+1, b( istartm, k+1 ), 1, b( istartm,k ), 1, c2, s2 )
+                        
+              if ( ilz ) then
+                 call stdlib_srot( nz, z( 1, k+2-zstart+1 ), 1, z( 1, k+1-zstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_srot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c2, s2 )
+                           
+              end if
+              b( k+1, k ) = zero
+              b( k+2, k ) = zero
+              ! calculate q1 and q2
+              call stdlib_slartg( a( k+2, k ), a( k+3, k ), c1, s1, temp )
+              a( k+2, k ) = temp
+              a( k+3, k ) = zero
+              call stdlib_slartg( a( k+1, k ), a( k+2, k ), c2, s2, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = zero
+           ! apply transformations from the left
+              call stdlib_srot( istopm-k, a( k+2, k+1 ), lda, a( k+3, k+1 ), lda,c1, s1 )
+              call stdlib_srot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda,c2, s2 )
+              call stdlib_srot( istopm-k, b( k+2, k+1 ), ldb, b( k+3, k+1 ), ldb,c1, s1 )
+              call stdlib_srot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb,c2, s2 )
+              if ( ilq ) then
+                 call stdlib_srot( nq, q( 1, k+2-qstart+1 ), 1, q( 1, k+3-qstart+1 ), 1, c1, s1 )
+                           
+                 call stdlib_srot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c2, s2 )
+                           
+              end if
+           end if
+     end subroutine stdlib_slaqz2
+
+     !> SLAQZ4: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_slaqz4( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, sr, &
+               si, ss, a, lda, b, ldb, q,ldq, z, ldz, qc, ldqc, zc, ldzc, work, lwork,info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_desired, ldqc, ldzc
+           real(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), qc( &
+                     ldqc, * ), zc( ldzc, * ), work( * ), sr( * ),si( * ), ss( * )
+           integer(ilp), intent( out ) :: info
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(sp) :: temp, v(3), c1, s1, c2, s2, swap
+           info = 0
+           if ( nblock_desired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_desired
+              return
+           else if ( lwork < n*nblock_desired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAQZ4', -info )
+              return
+           end if
+           ! executable statements
+           if ( nshifts < 2 ) then
+              return
+           end if
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ! shuffle shifts into pairs of real shifts and pairs
+           ! of complex conjugate shifts assuming complex
+           ! conjugate shifts are already adjacent to one
+           ! another
+           do i = 1, nshifts-2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+                 swap = ss( i )
+                 ss( i ) = ss( i+1 )
+                 ss( i+1 ) = ss( i+2 )
+                 ss( i+2 ) = swap
+              end if
+           end do
+           ! nshfts is supposed to be even, but if it is odd,
+           ! then simply reduce it by one.  the shuffle above
+           ! ensures that the dropped shift is real and that
+           ! the remaining shifts are paired.
+           ns = nshifts-mod( nshifts, 2 )
+           npos = max( nblock_desired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_slaset( 'FULL', ns+1, ns+1, zero, one, qc, ldqc )
+           call stdlib_slaset( 'FULL', ns, ns, zero, one, zc, ldzc )
+           do i = 1, ns, 2
+              ! introduce the shift
+              call stdlib_slaqz1( a( ilo, ilo ), lda, b( ilo, ilo ), ldb, sr( i ),sr( i+1 ), si( &
+                        i ), ss( i ), ss( i+1 ), v )
+              temp = v( 2 )
+              call stdlib_slartg( temp, v( 3 ), c1, s1, v( 2 ) )
+              call stdlib_slartg( v( 1 ), v( 2 ), c2, s2, temp )
+              call stdlib_srot( ns, a( ilo+1, ilo ), lda, a( ilo+2, ilo ), lda, c1,s1 )
+              call stdlib_srot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c2,s2 )
+              call stdlib_srot( ns, b( ilo+1, ilo ), ldb, b( ilo+2, ilo ), ldb, c1,s1 )
+              call stdlib_srot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c2,s2 )
+              call stdlib_srot( ns+1, qc( 1, 2 ), 1, qc( 1, 3 ), 1, c1, s1 )
+              call stdlib_srot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c2, s2 )
+              ! chase the shift down
+              do j = 1, ns-1-i
+                 call stdlib_slaqz2( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ilo, ilo+ns &
+                        ), lda, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ilo, ilo+ns &
+                        ), ldb, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+              call stdlib_sgemm( 'N', 'N', n, sheight, sheight, one, q( 1, ilo ),ldq, qc, ldqc, &
+                        zero, work, n )
+              call stdlib_slacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ilo ), lda, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ilo ), ldb, &
+                        zc, ldzc, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_sgemm( 'N', 'N', n, swidth, swidth, one, z( 1, ilo ), ldz,zc, ldzc, &
+                        zero, work, n )
+              call stdlib_slacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_slaset( 'FULL', ns+np, ns+np, zero, one, qc, ldqc )
+              call stdlib_slaset( 'FULL', ns+np, ns+np, zero, one, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -2
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j-1, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_slaqz2( .true., .true., k+i+j-1, istartb, istopb,ihi, a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, zero, work,sheight )
+                 call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, zero, work,sheight )
+                 call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_sgemm( 'N', 'N', n, nblock, nblock, one, q( 1, k+1 ),ldq, qc, ldqc, &
+                           zero, work, n )
+                 call stdlib_slacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one,a( istartm, k ), lda, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one,b( istartm, k ), ldb, &
+                           zc, ldzc, zero, work,sheight )
+                 call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_sgemm( 'N', 'N', n, nblock, nblock, one, z( 1, k ),ldz, zc, ldzc, &
+                           zero, work, n )
+                 call stdlib_slacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_slaset( 'FULL', ns, ns, zero, one, qc, ldqc )
+           call stdlib_slaset( 'FULL', ns+1, ns+1, zero, one, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns, 2
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i-1, ihi-2
+                 call stdlib_slaqz2( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_sgemm( 'T', 'N', sheight, swidth, sheight, one, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_sgemm( 'N', 'N', n, ns, ns, one, q( 1, ihi-ns+1 ), ldq,qc, ldqc, zero, &
+                        work, n )
+              call stdlib_slacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one, a( istartm,ihi-ns ), lda,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_sgemm( 'N', 'N', sheight, swidth, swidth, one, b( istartm,ihi-ns ), ldb,&
+                         zc, ldzc, zero, work, sheight )
+              call stdlib_slacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+           call stdlib_sgemm( 'N', 'N', n, ns+1, ns+1, one, z( 1, ihi-ns ), ldz, zc,ldzc, zero, &
+                     work, n )
+              call stdlib_slacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_slaqz4
+
+     !> SLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_slar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(sp), intent(in) :: gaptol, lambda, pivmin
+           real(sp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(sp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(sp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           eps = stdlib_slamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_sisnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_sisnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_sisnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = one
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + z( i )*z( i )
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + z( i )*z( i )
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + z( i+1 )*z( i+1 )
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_slar1v
+
+     !> SLARFG: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, and x is an (n-1)-element real
+     !> vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+     !> Otherwise  1 <= tau <= 2.
+
+     pure subroutine stdlib_slarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(sp), intent(inout) :: alpha
+           real(sp), intent(out) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(sp) :: beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=1 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_snrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_slapy2( alpha, xnorm ), alpha )
+              safmin = stdlib_slamch( 'S' ) / stdlib_slamch( 'E' )
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 rsafmn = one / safmin
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_sscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alpha = alpha*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_snrm2( n-1, x, incx )
+                 beta = -sign( stdlib_slapy2( alpha, xnorm ), alpha )
+              end if
+              tau = ( beta-alpha ) / beta
+              call stdlib_sscal( n-1, one / ( alpha-beta ), x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_slarfg
+
+     !> SLARFGP: generates a real elementary reflector H of order n, such
+     !> that
+     !> H * ( alpha ) = ( beta ),   H**T * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is non-negative, and x is
+     !> an (n-1)-element real vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**T ) ,
+     !> ( v )
+     !> where tau is a real scalar and v is a real (n-1)-element
+     !> vector.
+     !> If the elements of x are all zero, then tau = 0 and H is taken to be
+     !> the unit matrix.
+
+     subroutine stdlib_slarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(sp), intent(inout) :: alpha
+           real(sp), intent(out) :: tau
+           ! Array Arguments 
+           real(sp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(sp) :: beta, bignum, savealpha, smlnum, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_snrm2( n-1, x, incx )
+           if( xnorm==zero ) then
+              ! h  =  [+/-1, 0; i], sign chosen so alpha >= 0.
+              if( alpha>=zero ) then
+                 ! when tau.eq.zero, the vector is special-cased to be
+                 ! all zeros in the application routines.  we do not need
+                 ! to clear it.
+                 tau = zero
+              else
+                 ! however, the application routines rely on explicit
+                 ! zero checks when tau.ne.zero, and we must clear x.
+                 tau = two
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = 0
+                 end do
+                 alpha = -alpha
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_slapy2( alpha, xnorm ), alpha )
+              smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'E' )
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 bignum = one / smlnum
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_sscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alpha = alpha*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_snrm2( n-1, x, incx )
+                 beta = sign( stdlib_slapy2( alpha, xnorm ), alpha )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alpha = xnorm * (xnorm/alpha)
+                 tau = alpha / beta
+                 alpha = -alpha
+              end if
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero. this explains the next if statement.
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 if( savealpha>=zero ) then
+                    tau = zero
+                 else
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = 0
+                    end do
+                    beta = -savealpha
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_sscal( n-1, one / alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_slarfgp
+
+     !> SLARNV: returns a vector of n random real numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_slarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           real(sp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(sp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, il2, iv
+           ! Local Arrays 
+           real(sp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cos,log,min,sqrt
+           ! Executable Statements 
+           do 40 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              if( idist==3 ) then
+                 il2 = 2*il
+              else
+                 il2 = il
+              end if
+              ! call stdlib_slaruv to generate il2 numbers from a uniform (0,1)
+              ! distribution (il2 <= lv)
+              call stdlib_slaruv( iseed, il2, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = u( i )
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = two*u( i ) - one
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*cos( twopi*u( 2*i ) )
+                 end do
+              end if
+              40 continue
+           return
+     end subroutine stdlib_slarnv
+
+     !> Given the relatively robust representation(RRR) L D L^T, SLARRB:
+     !> does "limited" bisection to refine the eigenvalues of L D L^T,
+     !> W( IFIRST-OFFSET ) through W( ILAST-OFFSET ), to more accuracy. Initial
+     !> guesses for these eigenvalues are input in W, the corresponding estimate
+     !> of the error in these guesses and their gaps are input in WERR
+     !> and WGAP, respectively. During bisection, intervals
+     !> [left, right] are maintained by storing their mid-points and
+     !> semi-widths in the arrays W and WERR respectively.
+
+     pure subroutine stdlib_slarrb( n, d, lld, ifirst, ilast, rtol1,rtol2, offset, w, wgap, werr, &
+               work, iwork,pivmin, spdiam, twist, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ifirst, ilast, n, offset, twist
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: pivmin, rtol1, rtol2, spdiam
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: d(*), lld(*)
+           real(sp), intent(inout) :: w(*), werr(*), wgap(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           integer(ilp) :: maxitr
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ii, ip, iter, k, negcnt, next, nint, olnint, prev, r
+           real(sp) :: back, cvrgd, gap, left, lgap, mid, mnwdth, rgap, right, tmp, width
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           maxitr = int( ( log( spdiam+pivmin )-log( pivmin ) ) /log( two ),KIND=ilp) + 2
+           mnwdth = two * pivmin
+           r = twist
+           if((r<1).or.(r>n)) r = n
+           ! initialize unconverged intervals in [ work(2*i-1), work(2*i) ].
+           ! the sturm count, count( work(2*i-1) ) is arranged to be i-1, while
+           ! count( work(2*i) ) is stored in iwork( 2*i ). the integer iwork( 2*i-1 )
+           ! for an unconverged interval is set to the index of the next unconverged
+           ! interval, and is -1 or 0 for a converged interval. thus a linked
+           ! list of unconverged intervals is set up.
+           i1 = ifirst
+           ! the number of unconverged intervals
+           nint = 0
+           ! the last unconverged interval found
+           prev = 0
+           rgap = wgap( i1-offset )
+           loop_75: do i = i1, ilast
+              k = 2*i
+              ii = i - offset
+              left = w( ii ) - werr( ii )
+              right = w( ii ) + werr( ii )
+              lgap = rgap
+              rgap = wgap( ii )
+              gap = min( lgap, rgap )
+              ! make sure that [left,right] contains the desired eigenvalue
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - left
+              ! do while( negcnt(left)>i-1 )
+              back = werr( ii )
+              20 continue
+              negcnt = stdlib_slaneg( n, d, lld, left, pivmin, r )
+              if( negcnt>i-1 ) then
+                 left = left - back
+                 back = two*back
+                 go to 20
+              end if
+              ! do while( negcnt(right)<i )
+              ! compute negcount from dstqds facto l+d+l+^t = l d l^t - right
+              back = werr( ii )
+              50 continue
+              negcnt = stdlib_slaneg( n, d, lld, right, pivmin, r )
+               if( negcnt<i ) then
+                  right = right + back
+                  back = two*back
+                  go to 50
+               end if
+              width = half*abs( left - right )
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( width<=cvrgd .or. width<=mnwdth ) then
+                 ! this interval has already converged and does not need refinement.
+                 ! (note that the gaps might change through refining the
+                  ! eigenvalues, however, they can only get bigger.)
+                 ! remove it from the list.
+                 iwork( k-1 ) = -1
+                 ! make sure that i1 always points to the first unconverged interval
+                 if((i==i1).and.(i<ilast)) i1 = i + 1
+                 if((prev>=i1).and.(i<=ilast)) iwork( 2*prev-1 ) = i + 1
+              else
+                 ! unconverged interval found
+                 prev = i
+                 nint = nint + 1
+                 iwork( k-1 ) = i + 1
+                 iwork( k ) = negcnt
+              end if
+              work( k-1 ) = left
+              work( k ) = right
+           end do loop_75
+           ! do while( nint>0 ), i.e. there are still unconverged intervals
+           ! and while (iter<maxitr)
+           iter = 0
+           80 continue
+           prev = i1 - 1
+           i = i1
+           olnint = nint
+           loop_100: do ip = 1, olnint
+              k = 2*i
+              ii = i - offset
+              rgap = wgap( ii )
+              lgap = rgap
+              if(ii>1) lgap = wgap( ii-1 )
+              gap = min( lgap, rgap )
+              next = iwork( k-1 )
+              left = work( k-1 )
+              right = work( k )
+              mid = half*( left + right )
+              ! semiwidth of interval
+              width = right - mid
+              tmp = max( abs( left ), abs( right ) )
+              cvrgd = max(rtol1*gap,rtol2*tmp)
+              if( ( width<=cvrgd ) .or. ( width<=mnwdth ).or.( iter==maxitr ) )then
+                 ! reduce number of unconverged intervals
+                 nint = nint - 1
+                 ! mark interval as converged.
+                 iwork( k-1 ) = 0
+                 if( i1==i ) then
+                    i1 = next
+                 else
+                    ! prev holds the last unconverged interval previously examined
+                    if(prev>=i1) iwork( 2*prev-1 ) = next
+                 end if
+                 i = next
+                 cycle loop_100
+              end if
+              prev = i
+              ! perform one bisection step
+              negcnt = stdlib_slaneg( n, d, lld, mid, pivmin, r )
+              if( negcnt<=i-1 ) then
+                 work( k-1 ) = mid
+              else
+                 work( k ) = mid
+              end if
+              i = next
+           end do loop_100
+           iter = iter + 1
+           ! do another loop if there are still unconverged intervals
+           ! however, in the last iteration, all intervals are accepted
+           ! since this is the best we can do.
+           if( ( nint>0 ).and.(iter<=maxitr) ) go to 80
+           ! at this point, all the intervals have converged
+           do i = ifirst, ilast
+              k = 2*i
+              ii = i - offset
+              ! all intervals marked by '0' have been refined.
+              if( iwork( k-1 )==0 ) then
+                 w( ii ) = half*( work( k-1 )+work( k ) )
+                 werr( ii ) = work( k ) - w( ii )
+              end if
+           end do
+           do i = ifirst+1, ilast
+              k = 2*i
+              ii = i - offset
+              wgap( ii-1 ) = max( zero,w(ii) - werr (ii) - w( ii-1 ) - werr( ii-1 ))
+           end do
+           return
+     end subroutine stdlib_slarrb
+
+     !> Given the initial representation L D L^T and its cluster of close
+     !> eigenvalues (in a relative measure), W( CLSTRT ), W( CLSTRT+1 ), ...
+     !> W( CLEND ), SLARRF: finds a new relatively robust representation
+     !> L D L^T - SIGMA I = L(+) D(+) L(+)^T such that at least one of the
+     !> eigenvalues of L(+) D(+) L(+)^T is relatively isolated.
+
+     pure subroutine stdlib_slarrf( n, d, l, ld, clstrt, clend,w, wgap, werr,spdiam, clgapl, &
+               clgapr, pivmin, sigma,dplus, lplus, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: clstrt, clend, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: clgapl, clgapr, pivmin, spdiam
+           real(sp), intent(out) :: sigma
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), l(*), ld(*), w(*), werr(*)
+           real(sp), intent(out) :: dplus(*), lplus(*), work(*)
+           real(sp), intent(inout) :: wgap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: quart = 0.25_sp
+           real(sp), parameter :: maxgrowth1 = 8._sp
+           real(sp), parameter :: maxgrowth2 = 8._sp
+           integer(ilp), parameter :: ktrymax = 1
+           integer(ilp), parameter :: sleft = 1
+           integer(ilp), parameter :: sright = 2
+           
+           ! Local Scalars 
+           logical(lk) :: dorrr1, forcer, nofail, sawnan1, sawnan2, tryrrr1
+           integer(ilp) :: i, indx, ktry,    shift
+           real(sp) :: avgap, bestshift, clwdth, eps, fact, fail, fail2, growthbound, ldelta, &
+           ldmax, lsigma, max1, max2, mingap, oldp, prod, rdelta, rdmax, rrr1, rrr2, rsigma, s, &
+                     smlgrowth, tmp, znm2
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           fact = real(2**ktrymax,KIND=sp)
+           eps = stdlib_slamch( 'PRECISION' )
+           shift = 0
+           forcer = .false.
+           ! note that we cannot guarantee that for any of the shifts tried,
+           ! the factorization has a small or even moderate element growth.
+           ! there could be ritz values at both ends of the cluster and despite
+           ! backing off, there are examples where all factorizations tried
+           ! (in ieee mode, allowing zero pivots
+           ! element growth.
+           ! for this reason, we should use pivmin in this subroutine so that at
+           ! least the l d l^t factorization exists. it can be checked afterwards
+           ! whether the element growth caused bad residuals/orthogonality.
+           ! decide whether the code should accept the best among all
+           ! representations despite large element growth or signal info=1
+           ! setting nofail to .false. for quick fix for bug 113
+           nofail = .false.
+           ! compute the average gap length of the cluster
+           clwdth = abs(w(clend)-w(clstrt)) + werr(clend) + werr(clstrt)
+           avgap = clwdth / real(clend-clstrt,KIND=sp)
+           mingap = min(clgapl, clgapr)
+           ! initial values for shifts to both ends of cluster
+           lsigma = min(w( clstrt ),w( clend )) - werr( clstrt )
+           rsigma = max(w( clstrt ),w( clend )) + werr( clend )
+           ! use a small fudge to make sure that we really shift to the outside
+           lsigma = lsigma - abs(lsigma)* two * eps
+           rsigma = rsigma + abs(rsigma)* two * eps
+           ! compute upper bounds for how much to back off the initial shifts
+           ldmax = quart * mingap + two * pivmin
+           rdmax = quart * mingap + two * pivmin
+           ldelta = max(avgap,wgap( clstrt ))/fact
+           rdelta = max(avgap,wgap( clend-1 ))/fact
+           ! initialize the record of the best representation found
+           s = stdlib_slamch( 'S' )
+           smlgrowth = one / s
+           fail = real(n-1,KIND=sp)*mingap/(spdiam*eps)
+           fail2 = real(n-1,KIND=sp)*mingap/(spdiam*sqrt(eps))
+           bestshift = lsigma
+           ! while (ktry <= ktrymax)
+           ktry = 0
+           growthbound = maxgrowth1*spdiam
+           5 continue
+           sawnan1 = .false.
+           sawnan2 = .false.
+           ! ensure that we do not back off too much of the initial shifts
+           ldelta = min(ldmax,ldelta)
+           rdelta = min(rdmax,rdelta)
+           ! compute the element growth when shifting to both ends of the cluster
+           ! accept the shift if there is no element growth at one of the two ends
+           ! left end
+           s = -lsigma
+           dplus( 1 ) = d( 1 ) + s
+           if(abs(dplus(1))<pivmin) then
+              dplus(1) = -pivmin
+              ! need to set sawnan1 because refined rrr test should not be used
+              ! in this case
+              sawnan1 = .true.
+           endif
+           max1 = abs( dplus( 1 ) )
+           do i = 1, n - 1
+              lplus( i ) = ld( i ) / dplus( i )
+              s = s*lplus( i )*l( i ) - lsigma
+              dplus( i+1 ) = d( i+1 ) + s
+              if(abs(dplus(i+1))<pivmin) then
+                 dplus(i+1) = -pivmin
+                 ! need to set sawnan1 because refined rrr test should not be used
+                 ! in this case
+                 sawnan1 = .true.
+              endif
+              max1 = max( max1,abs(dplus(i+1)) )
+           end do
+           sawnan1 = sawnan1 .or.  stdlib_sisnan( max1 )
+           if( forcer .or.(max1<=growthbound .and. .not.sawnan1 ) ) then
+              sigma = lsigma
+              shift = sleft
+              goto 100
+           endif
+           ! right end
+           s = -rsigma
+           work( 1 ) = d( 1 ) + s
+           if(abs(work(1))<pivmin) then
+              work(1) = -pivmin
+              ! need to set sawnan2 because refined rrr test should not be used
+              ! in this case
+              sawnan2 = .true.
+           endif
+           max2 = abs( work( 1 ) )
+           do i = 1, n - 1
+              work( n+i ) = ld( i ) / work( i )
+              s = s*work( n+i )*l( i ) - rsigma
+              work( i+1 ) = d( i+1 ) + s
+              if(abs(work(i+1))<pivmin) then
+                 work(i+1) = -pivmin
+                 ! need to set sawnan2 because refined rrr test should not be used
+                 ! in this case
+                 sawnan2 = .true.
+              endif
+              max2 = max( max2,abs(work(i+1)) )
+           end do
+           sawnan2 = sawnan2 .or.  stdlib_sisnan( max2 )
+           if( forcer .or.(max2<=growthbound .and. .not.sawnan2 ) ) then
+              sigma = rsigma
+              shift = sright
+              goto 100
+           endif
+           ! if we are at this point, both shifts led to too much element growth
+           ! record the better of the two shifts (provided it didn't lead to nan)
+           if(sawnan1.and.sawnan2) then
+              ! both max1 and max2 are nan
+              goto 50
+           else
+              if( .not.sawnan1 ) then
+                 indx = 1
+                 if(max1<=smlgrowth) then
+                    smlgrowth = max1
+                    bestshift = lsigma
+                 endif
+              endif
+              if( .not.sawnan2 ) then
+                 if(sawnan1 .or. max2<=max1) indx = 2
+                 if(max2<=smlgrowth) then
+                    smlgrowth = max2
+                    bestshift = rsigma
+                 endif
+              endif
+           endif
+           ! if we are here, both the left and the right shift led to
+           ! element growth. if the element growth is moderate, then
+           ! we may still accept the representation, if it passes a
+           ! refined test for rrr. this test supposes that no nan occurred.
+           ! moreover, we use the refined rrr test only for isolated clusters.
+           if((clwdth<mingap/real(128,KIND=sp)) .and.(min(max1,max2)<fail2).and.(.not.sawnan1)&
+                     .and.(.not.sawnan2)) then
+              dorrr1 = .true.
+           else
+              dorrr1 = .false.
+           endif
+           tryrrr1 = .true.
+           if( tryrrr1 .and. dorrr1 ) then
+           if(indx==1) then
+              tmp = abs( dplus( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod =((dplus(i+1)*work(n+i+1))/(dplus(i)*work(n+i)))*oldp
+                 else
+                    prod = prod*abs(work(n+i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( dplus( i ) * prod ))
+              end do
+              rrr1 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr1<=maxgrowth2) then
+                 sigma = lsigma
+                 shift = sleft
+                 goto 100
+              endif
+           else if(indx==2) then
+              tmp = abs( work( n ) )
+              znm2 = one
+              prod = one
+              oldp = one
+              do i = n-1, 1, -1
+                 if( prod <= eps ) then
+                    prod = ((work(i+1)*lplus(i+1))/(work(i)*lplus(i)))*oldp
+                 else
+                    prod = prod*abs(lplus(i))
+                 end if
+                 oldp = prod
+                 znm2 = znm2 + prod**2
+                 tmp = max( tmp, abs( work( i ) * prod ))
+              end do
+              rrr2 = tmp/( spdiam * sqrt( znm2 ) )
+              if (rrr2<=maxgrowth2) then
+                 sigma = rsigma
+                 shift = sright
+                 goto 100
+              endif
+           end if
+           endif
+           50 continue
+           if (ktry<ktrymax) then
+              ! if we are here, both shifts failed also the rrr test.
+              ! back off to the outside
+              lsigma = max( lsigma - ldelta,lsigma - ldmax)
+              rsigma = min( rsigma + rdelta,rsigma + rdmax )
+              ldelta = two * ldelta
+              rdelta = two * rdelta
+              ktry = ktry + 1
+              goto 5
+           else
+              ! none of the representations investigated satisfied our
+              ! criteria. take the best one we found.
+              if((smlgrowth<fail).or.nofail) then
+                 lsigma = bestshift
+                 rsigma = bestshift
+                 forcer = .true.
+                 goto 5
+              else
+                 info = 1
+                 return
+              endif
+           end if
+           100 continue
+           if (shift==sleft) then
+           elseif (shift==sright) then
+              ! store new l and d back into dplus, lplus
+              call stdlib_scopy( n, work, 1, dplus, 1 )
+              call stdlib_scopy( n-1, work(n+1), 1, lplus, 1 )
+           endif
+           return
+     end subroutine stdlib_slarrf
+
+     !> SLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by SLARRE.
+
+     pure subroutine stdlib_slarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: minrgp, pivmin, vl, vu
+           real(sp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(sp), intent(in) :: gers(*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           real(sp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indwrk= 3*n+1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_slaset( 'FULL', n, zusedw, zero, zero,z(1,zusedl), ldz )
+           eps = stdlib_slamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = one
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_scopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       call stdlib_scopy( in, z( ibegin, j ), 1, d( ibegin ), 1 )
+                       call stdlib_scopy( in-1, z( ibegin, j+1 ), 1, l( ibegin ),1 )
+                       sigma = z( iend, j+1 )
+                       ! set the corresponding entries in z to zero
+                       call stdlib_slaset( 'FULL', in, 2, zero, zero,z( ibegin, j), ldz )
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_slarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_slarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_slarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_slarrf needs lwork = 2*n
+                          call stdlib_slarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,z(ibegin, newftt),z(ibegin, newftt+1),work( indwrk ), &
+                                    iinfo )
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_slarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = ssigma
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=sp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_slarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_slar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_slar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_sscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_slarrv
+
+     !> SLASCL: multiplies the M by N real matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_slascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(sp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(sp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_sisnan(cfrom) ) then
+              info = -4
+           else if( stdlib_sisnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower half of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper half of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_slascl
+
+     !> This subroutine computes the square root of the I-th updated
+     !> eigenvalue of a positive symmetric rank-one modification to
+     !> a positive diagonal matrix whose entries are given as the squares
+     !> of the corresponding entries in the array d, and that
+     !> 0 <= D(i) < D(j)  for  i < j
+     !> and that RHO > 0. This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D ) * diag( D ) +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_slasd4( n, i, d, z, delta, rho, sigma, work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: rho
+           real(sp), intent(out) :: sigma
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), z(*)
+           real(sp), intent(out) :: delta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 400
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3, geomavg
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(sp) :: a, b, c, delsq, delsq2, sq2, dphi, dpsi, dtiim, dtiip, dtipsq, dtisq, &
+           dtnsq, dtnsq1, dw, eps, erretm, eta, phi, prew, psi, rhoinv, sglb, sgub, tau, tau2, &
+                     temp, temp1, temp2, w
+           ! Local Arrays 
+           real(sp) :: dd(3), zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+              ! presumably, i=1 upon entry
+              sigma = sqrt( d( 1 )*d( 1 )+rho*z( 1 )*z( 1 ) )
+              delta( 1 ) = one
+              work( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_slasd5( i, d, z, delta, rho, sigma, work )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_slamch( 'EPSILON' )
+           rhoinv = one / rho
+           tau2= zero
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              temp = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              temp1 = temp / ( d( n )+sqrt( d( n )*d( n )+temp ) )
+              do j = 1, n
+                 work( j ) = d( j ) + d( n ) + temp1
+                 delta( j ) = ( d( j )-d( n ) ) - temp1
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / ( delta( j )*work( j ) )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / ( delta( ii )*work( ii ) ) +z( n )*z( n ) / ( delta( n )&
+                        *work( n ) )
+              if( w<=zero ) then
+                 temp1 = sqrt( d( n )*d( n )+rho )
+                 temp = z( n-1 )*z( n-1 ) / ( ( d( n-1 )+temp1 )*( d( n )-d( n-1 )+rho / ( d( n )+&
+                           temp1 ) ) ) +z( n )*z( n ) / rho
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                    a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*delsq
+                    if( a<zero ) then
+                       tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                    tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 end if
+                 ! it can be proved that
+                     ! d(n)^2+rho/2 <= sigma_n^2 < d(n)^2+tau2 <= d(n)^2+rho
+              else
+                 delsq = ( d( n )-d( n-1 ) )*( d( n )+d( n-1 ) )
+                 a = -c*delsq + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*delsq
+                 ! the following tau2 is to approximate
+                 ! sigma_n^2 - d( n )*d( n )
+                 if( a<zero ) then
+                    tau2 = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau2 = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+                 ! it can be proved that
+                 ! d(n)^2 < d(n)^2+tau2 < sigma(n)^2 < d(n)^2+rho/2
+              end if
+              ! the following tau is to approximate sigma_n - d( n )
+               ! tau = tau2 / ( d( n )+sqrt( d( n )*d( n )+tau2 ) )
+              sigma = d( n ) + tau
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( n ) ) - tau
+                 work( j ) = d( j ) + d( n ) + tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( delta( j )*work( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / ( delta( n )*work( n ) )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              dtnsq1 = work( n-1 )*delta( n-1 )
+              dtnsq = work( n )*delta( n )
+              c = w - dtnsq1*dpsi - dtnsq*dphi
+              a = ( dtnsq+dtnsq1 )*w - dtnsq*dtnsq1*( dpsi+dphi )
+              b = dtnsq*dtnsq1*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                 eta = rho - sigma*sigma
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = eta - dtnsq
+              if( temp>rho )eta = rho + dtnsq
+              eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+                 work( j ) = work( j ) + eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              tau2 = work( n )*delta( n )
+              temp = z( n ) / tau2
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $          + abs( tau2 )*( dpsi+dphi )
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    go to 240
+                 end if
+                 ! calculate the new step
+                 dtnsq1 = work( n-1 )*delta( n-1 )
+                 dtnsq = work( n )*delta( n )
+                 c = w - dtnsq1*dpsi - dtnsq*dphi
+                 a = ( dtnsq+dtnsq1 )*w - dtnsq1*dtnsq*( dpsi+dphi )
+                 b = dtnsq1*dtnsq*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = eta - dtnsq
+                 if( temp<=zero )eta = eta / two
+                 eta = eta / ( sigma+sqrt( eta+sigma*sigma ) )
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                    work( j ) = work( j ) + eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 tau2 = work( n )*delta( n )
+                 temp = z( n ) / tau2
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv
+          ! $             + abs( tau2 )*( dpsi+dphi )
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              go to 240
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              delsq = ( d( ip1 )-d( i ) )*( d( ip1 )+d( i ) )
+              delsq2 = delsq / two
+              sq2=sqrt( ( d( i )*d( i )+d( ip1 )*d( ip1 ) ) / two )
+              temp = delsq2 / ( d( i )+sq2 )
+              do j = 1, n
+                 work( j ) = d( j ) + d( i ) + temp
+                 delta( j ) = ( d( j )-d( i ) ) - temp
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / ( work( j )*delta( j ) )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / ( work( i )*delta( i ) ) +z( ip1 )*z( ip1 ) / ( work( ip1 )&
+                        *delta( ip1 ) )
+              geomavg = .false.
+              if( w>zero ) then
+                 ! d(i)^2 < the ith sigma^2 < (d(i)^2+d(i+1)^2)/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 ii = i
+                 sglb = zero
+                 sgub = delsq2  / ( d( i )+sq2 )
+                 a = c*delsq + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*delsq
+                 if( a>zero ) then
+                    tau2 = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau2 = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( i )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( i ).
+                 tau = tau2 / ( d( i )+sqrt( d( i )*d( i )+tau2 ) )
+                 temp = sqrt(eps)
+                 if( (d(i)<=temp*d(ip1)).and.(abs(z(i))<=temp).and.(d(i)>zero) ) then
+                    tau = min( ten*d(i), sgub )
+                    geomavg = .true.
+                 end if
+              else
+                 ! (d(i)^2+d(i+1)^2)/2 <= the ith sigma^2 < d(i+1)^2/2
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 ii = ip1
+                 sglb = -delsq2  / ( d( ii )+sq2 )
+                 sgub = zero
+                 a = c*delsq - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*delsq
+                 if( a<zero ) then
+                    tau2 = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau2 = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 ! tau2 now is an estimation of sigma^2 - d( ip1 )^2. the
+                 ! following, however, is the corresponding estimation of
+                 ! sigma - d( ip1 ).
+                 tau = tau2 / ( d( ip1 )+sqrt( abs( d( ip1 )*d( ip1 )+tau2 ) ) )
+              end if
+              sigma = d( ii ) + tau
+              do j = 1, n
+                 work( j ) = d( j ) + d( ii ) + tau
+                 delta( j ) = ( d( j )-d( ii ) ) - tau
+              end do
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / ( work( ii )*delta( ii ) )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 go to 240
+              end if
+              if( w<=zero ) then
+                 sglb = max( sglb, tau )
+              else
+                 sgub = min( sgub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 dtipsq = work( ip1 )*delta( ip1 )
+                 dtisq = work( i )*delta( i )
+                 if( orgati ) then
+                    c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                 else
+                    c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                 end if
+                 a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                 b = dtipsq*dtisq*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 dtiim = work( iim1 )*delta( iim1 )
+                 dtiip = work( iip1 )*delta( iip1 )
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / dtiim
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiip*( dpsi+dphi ) ) -( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    if( dpsi<temp1 ) then
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                    end if
+                 else
+                    temp1 = z( iip1 ) / dtiip
+                    temp1 = temp1*temp1
+                    c = ( temp - dtiim*( dpsi+dphi ) ) -( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( &
+                              iip1 ) )*temp1
+                    if( dphi<temp1 ) then
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                    else
+                       zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                    end if
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 dd( 1 ) = dtiim
+                 dd( 2 ) = delta( ii )*work( ii )
+                 dd( 3 ) = dtiip
+                 call stdlib_slaed6( niter, orgati, c, dd, zz, w, eta, info )
+                 if( info/=0 ) then
+                    ! if info is not 0, i.e., stdlib_slaed6 failed, switch back
+                    ! to 2 pole interpolation.
+                    swtch3 = .false.
+                    info = 0
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( orgati ) then
+                       c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                    else
+                       c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( orgati ) then
+                             a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                          else
+                             a = z( ip1 )*z( ip1 ) + dtisq*dtisq*( dpsi+dphi)
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 end if
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+              temp = tau + eta
+              if( temp>sgub .or. temp<sglb ) then
+                 if( w<zero ) then
+                    eta = ( sgub-tau ) / two
+                 else
+                    eta = ( sglb-tau ) / two
+                 end if
+                 if( geomavg ) then
+                    if( w < zero ) then
+                       if( tau > zero ) then
+                          eta = sqrt(sgub*tau)-tau
+                       end if
+                    else
+                       if( sglb > zero ) then
+                          eta = sqrt(sglb*tau)-tau
+                       end if
+                    end if
+                 end if
+              end if
+              prew = w
+              tau = tau + eta
+              sigma = sigma + eta
+              do j = 1, n
+                 work( j ) = work( j ) + eta
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / ( work( j )*delta( j ) )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              tau2 = work( ii )*delta( ii )
+              temp = z( ii ) / tau2
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $          + abs( tau2 )*dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              ! main loop to update the values of the array   delta and work
+              iter = niter + 1
+              loop_230: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+           ! $          .or. (sgub-sglb)<=eight*abs(sgub+sglb) ) then
+                    go to 240
+                 end if
+                 if( w<=zero ) then
+                    sglb = max( sglb, tau )
+                 else
+                    sgub = min( sgub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    dtipsq = work( ip1 )*delta( ip1 )
+                    dtisq = work( i )*delta( i )
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - dtipsq*dw + delsq*( z( i ) / dtisq )**2
+                       else
+                          c = w - dtisq*dw - delsq*( z( ip1 ) / dtipsq )**2
+                       end if
+                    else
+                       temp = z( ii ) / ( work( ii )*delta( ii ) )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - dtisq*dpsi - dtipsq*dphi
+                    end if
+                    a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                    b = dtipsq*dtisq*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                             else
+                                a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                             end if
+                          else
+                             a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    dtiim = work( iim1 )*delta( iim1 )
+                    dtiip = work( iip1 )*delta( iip1 )
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - dtiim*dpsi - dtiip*dphi
+                       zz( 1 ) = dtiim*dtiim*dpsi
+                       zz( 3 ) = dtiip*dtiip*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / dtiim
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iim1 )-d( iip1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiip*( dpsi+dphi ) - temp2
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          if( dpsi<temp1 ) then
+                             zz( 3 ) = dtiip*dtiip*dphi
+                          else
+                             zz( 3 ) = dtiip*dtiip*( ( dpsi-temp1 )+dphi )
+                          end if
+                       else
+                          temp1 = z( iip1 ) / dtiip
+                          temp1 = temp1*temp1
+                          temp2 = ( d( iip1 )-d( iim1 ) )*( d( iim1 )+d( iip1 ) )*temp1
+                          c = temp - dtiim*( dpsi+dphi ) - temp2
+                          if( dphi<temp1 ) then
+                             zz( 1 ) = dtiim*dtiim*dpsi
+                          else
+                             zz( 1 ) = dtiim*dtiim*( dpsi+( dphi-temp1 ) )
+                          end if
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    dd( 1 ) = dtiim
+                    dd( 2 ) = delta( ii )*work( ii )
+                    dd( 3 ) = dtiip
+                    call stdlib_slaed6( niter, orgati, c, dd, zz, w, eta, info )
+                    if( info/=0 ) then
+                       ! if info is not 0, i.e., stdlib_slaed6 failed, switch
+                       ! back to two pole interpolation
+                       swtch3 = .false.
+                       info = 0
+                       dtipsq = work( ip1 )*delta( ip1 )
+                       dtisq = work( i )*delta( i )
+                       if( .not.swtch ) then
+                          if( orgati ) then
+                             c = w - dtipsq*dw + delsq*( z( i )/dtisq )**2
+                          else
+                             c = w - dtisq*dw - delsq*( z( ip1 )/dtipsq )**2
+                          end if
+                       else
+                          temp = z( ii ) / ( work( ii )*delta( ii ) )
+                          if( orgati ) then
+                             dpsi = dpsi + temp*temp
+                          else
+                             dphi = dphi + temp*temp
+                          end if
+                          c = w - dtisq*dpsi - dtipsq*dphi
+                       end if
+                       a = ( dtipsq+dtisq )*w - dtipsq*dtisq*dw
+                       b = dtipsq*dtisq*w
+                       if( c==zero ) then
+                          if( a==zero ) then
+                             if( .not.swtch ) then
+                                if( orgati ) then
+                                   a = z( i )*z( i ) + dtipsq*dtipsq*( dpsi+dphi )
+                                else
+                                   a = z( ip1 )*z( ip1 ) +dtisq*dtisq*( dpsi+dphi )
+                                end if
+                             else
+                                a = dtisq*dtisq*dpsi + dtipsq*dtipsq*dphi
+                             end if
+                          end if
+                          eta = b / a
+                       else if( a<=zero ) then
+                          eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                       else
+                          eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                       end if
+                    end if
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 eta = eta / ( sigma+sqrt( sigma*sigma+eta ) )
+                 temp=tau+eta
+                 if( temp>sgub .or. temp<sglb ) then
+                    if( w<zero ) then
+                       eta = ( sgub-tau ) / two
+                    else
+                       eta = ( sglb-tau ) / two
+                    end if
+                    if( geomavg ) then
+                       if( w < zero ) then
+                          if( tau > zero ) then
+                             eta = sqrt(sgub*tau)-tau
+                          end if
+                       else
+                          if( sglb > zero ) then
+                             eta = sqrt(sglb*tau)-tau
+                          end if
+                       end if
+                    end if
+                 end if
+                 prew = w
+                 tau = tau + eta
+                 sigma = sigma + eta
+                 do j = 1, n
+                    work( j ) = work( j ) + eta
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / ( work( j )*delta( j ) )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 tau2 = work( ii )*delta( ii )
+                 temp = z( ii ) / tau2
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv+ three*abs( temp )
+          ! $             + abs( tau2 )*dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_230
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+           end if
+           240 continue
+           return
+     end subroutine stdlib_slasd4
+
+     !> SLASD7: merges the two sets of singular values together into a single
+     !> sorted set. Then it tries to deflate the size of the problem. There
+     !> are two ways in which deflation can occur:  when two or more singular
+     !> values are close together or if there is a tiny entry in the Z
+     !> vector. For each such occurrence the order of the related
+     !> secular equation problem is reduced by one.
+     !> SLASD7 is called from SLASD6.
+
+     pure subroutine stdlib_slasd7( icompq, nl, nr, sqre, k, d, z, zw, vf, vfw, vl,vlw, alpha, &
+     beta, dsigma, idx, idxp, idxq,perm, givptr, givcol, ldgcol, givnum, ldgnum,c, s, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(sp), intent(in) :: alpha, beta
+           real(sp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), idx(*), idxp(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(sp), intent(inout) :: d(*), vf(*), vl(*)
+           real(sp), intent(out) :: dsigma(*), givnum(ldgnum,*), vfw(*), vlw(*), z(*), zw(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(sp) :: eps, hlftol, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -22
+           else if( ldgnum<n ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD7', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( icompq==1 ) then
+              givptr = 0
+           end if
+           ! generate the first part of the vector z and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vl( nlp1 )
+           vl( nlp1 ) = zero
+           tau = vf( nlp1 )
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vl( i )
+              vl( i ) = zero
+              vf( i+1 ) = vf( i )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           vf( 1 ) = tau
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vf( i )
+              vf( i ) = zero
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and zw are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              zw( i ) = z( idxq( i ) )
+              vfw( i ) = vf( idxq( i ) )
+              vlw( i ) = vl( idxq( i ) )
+           end do
+           call stdlib_slamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = zw( idxi )
+              vf( i ) = vfw( idxi )
+              vl( i ) = vlw( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_slamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jprev = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = jprev
+           80 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_slapy2( c, s )
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 c = c / tau
+                 s = -s / tau
+                 ! record the appropriate givens rotation
+                 if( icompq==1 ) then
+                    givptr = givptr + 1
+                    idxjp = idxq( idx( jprev )+1 )
+                    idxj = idxq( idx( j )+1 )
+                    if( idxjp<=nlp1 ) then
+                       idxjp = idxjp - 1
+                    end if
+                    if( idxj<=nlp1 ) then
+                       idxj = idxj - 1
+                    end if
+                    givcol( givptr, 2 ) = idxjp
+                    givcol( givptr, 1 ) = idxj
+                    givnum( givptr, 2 ) = c
+                    givnum( givptr, 1 ) = s
+                 end if
+                 call stdlib_srot( 1, vf( jprev ), 1, vf( j ), 1, c, s )
+                 call stdlib_srot( 1, vl( jprev ), 1, vl( j ), 1, c, s )
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 zw( k ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 80
+           90 continue
+           ! record the last singular value.
+           k = k + 1
+           zw( k ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           100 continue
+           ! sort the singular values into dsigma. the singular values which
+           ! were not deflated go into the first k slots of dsigma, except
+           ! that dsigma(1) is treated separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              vfw( j ) = vf( jp )
+              vlw( j ) = vl( jp )
+           end do
+           if( icompq==1 ) then
+              do j = 2, n
+                 jp = idxp( j )
+                 perm( j ) = idxq( idx( jp )+1 )
+                 if( perm( j )<=nlp1 ) then
+                    perm( j ) = perm( j ) - 1
+                 end if
+              end do
+           end if
+           ! the deflated singular values go back into the last n - k slots of
+           ! d.
+           call stdlib_scopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+           ! determine dsigma(1), dsigma(2), z(1), vf(1), vl(1), vf(m), and
+           ! vl(m).
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_slapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = -z( m ) / z( 1 )
+              end if
+              call stdlib_srot( 1, vf( m ), 1, vf( 1 ), 1, c, s )
+              call stdlib_srot( 1, vl( m ), 1, vl( 1 ), 1, c, s )
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! restore z, vf, and vl.
+           call stdlib_scopy( k-1, zw( 2 ), 1, z( 2 ), 1 )
+           call stdlib_scopy( n-1, vfw( 2 ), 1, vf( 2 ), 1 )
+           call stdlib_scopy( n-1, vlw( 2 ), 1, vl( 2 ), 1 )
+           return
+     end subroutine stdlib_slasd7
+
+     !> SLASD8: finds the square roots of the roots of the secular equation,
+     !> as defined by the values in DSIGMA and Z. It makes the appropriate
+     !> calls to SLASD4, and stores, for each  element in D, the distance
+     !> to its two nearest poles (elements in DSIGMA). It also updates
+     !> the arrays VF and VL, the first and last components of all the
+     !> right singular vectors of the original bidiagonal matrix.
+     !> SLASD8 is called from SLASD6.
+
+     pure subroutine stdlib_slasd8( icompq, k, d, z, vf, vl, difl, difr, lddifr,dsigma, work, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, k, lddifr
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), difl(*), difr(lddifr,*), work(*)
+           real(sp), intent(inout) :: dsigma(*), vf(*), vl(*), z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iwk1, iwk2, iwk2i, iwk3, iwk3i, j
+           real(sp) :: diflj, difrj, dj, dsigj, dsigjp, rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( k<1 ) then
+              info = -2
+           else if( lddifr<k ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD8', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              difl( 1 ) = d( 1 )
+              if( icompq==1 ) then
+                 difl( 2 ) = one
+                 difr( 1, 2 ) = one
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_slamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! book keeping.
+           iwk1 = 1
+           iwk2 = iwk1 + k
+           iwk3 = iwk2 + k
+           iwk2i = iwk2 - 1
+           iwk3i = iwk3 - 1
+           ! normalize z.
+           rho = stdlib_snrm2( k, z, 1 )
+           call stdlib_slascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! initialize work(iwk3).
+           call stdlib_slaset( 'A', k, 1, one, one, work( iwk3 ), k )
+           ! compute the updated singular values, the arrays difl, difr,
+           ! and the updated z.
+           do j = 1, k
+              call stdlib_slasd4( k, j, dsigma, z, work( iwk1 ), rho, d( j ),work( iwk2 ), info )
+                        
+              ! if the root finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+              work( iwk3i+j ) = work( iwk3i+j )*work( j )*work( iwk2i+j )
+              difl( j ) = -work( j )
+              difr( j, 1 ) = -work( j+1 )
+              do i = 1, j - 1
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+              do i = j + 1, k
+                 work( iwk3i+i ) = work( iwk3i+i )*work( i )*work( iwk2i+i ) / ( dsigma( i )-&
+                           dsigma( j ) ) / ( dsigma( i )+dsigma( j ) )
+              end do
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = sign( sqrt( abs( work( iwk3i+i ) ) ), z( i ) )
+           end do
+           ! update vf and vl.
+           do j = 1, k
+              diflj = difl( j )
+              dj = d( j )
+              dsigj = -dsigma( j )
+              if( j<k ) then
+                 difrj = -difr( j, 1 )
+                 dsigjp = -dsigma( j+1 )
+              end if
+              work( j ) = -z( j ) / diflj / ( dsigma( j )+dj )
+              do i = 1, j - 1
+                 work( i ) = z( i ) / ( stdlib_slamc3( dsigma( i ), dsigj )-diflj )/ ( dsigma( i )&
+                           +dj )
+              end do
+              do i = j + 1, k
+                 work( i ) = z( i ) / ( stdlib_slamc3( dsigma( i ), dsigjp )+difrj )/ ( dsigma( i &
+                           )+dj )
+              end do
+              temp = stdlib_snrm2( k, work, 1 )
+              work( iwk2i+j ) = stdlib_sdot( k, work, 1, vf, 1 ) / temp
+              work( iwk3i+j ) = stdlib_sdot( k, work, 1, vl, 1 ) / temp
+              if( icompq==1 ) then
+                 difr( j, 2 ) = temp
+              end if
+           end do
+           call stdlib_scopy( k, work( iwk2 ), 1, vf, 1 )
+           call stdlib_scopy( k, work( iwk3 ), 1, vl, 1 )
+           return
+     end subroutine stdlib_slasd8
+
+     !> SLASQ3: checks for deflation, computes a shift (TAU) and calls dqds.
+     !> In case of failure it changes shifts, and tries again until output
+     !> is positive.
+
+     pure subroutine stdlib_slasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+               ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ieee
+           integer(ilp), intent(in) :: i0
+           integer(ilp), intent(inout) :: iter, n0, ndiv, nfail, pp
+           real(sp), intent(inout) :: desig, dmin1, dmin2, dn, dn1, dn2, g, qmax, tau
+           real(sp), intent(out) :: dmin, sigma
+           ! Array Arguments 
+           real(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: cbias = 1.50_sp
+           real(sp), parameter :: qurtr = 0.250_sp
+           real(sp), parameter :: hundrd = 100.0_sp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ipn4, j4, n0in, nn
+           integer(ilp), intent(inout) :: ttype
+           real(sp) :: eps, s, t, temp, tol, tol2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           n0in = n0
+           eps = stdlib_slamch( 'PRECISION' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           ! check for deflation.
+           10 continue
+           if( n0<i0 )return
+           if( n0==i0 )go to 20
+           nn = 4*n0 + pp
+           if( n0==( i0+1 ) )go to 40
+           ! check whether e(n0-1) is negligible, 1 eigenvalue.
+           if( z( nn-5 )>tol2*( sigma+z( nn-3 ) ) .and.z( nn-2*pp-4 )>tol2*z( nn-7 ) )go to &
+                     30
+           20 continue
+           z( 4*n0-3 ) = z( 4*n0+pp-3 ) + sigma
+           n0 = n0 - 1
+           go to 10
+           ! check  whether e(n0-2) is negligible, 2 eigenvalues.
+           30 continue
+           if( z( nn-9 )>tol2*sigma .and.z( nn-2*pp-8 )>tol2*z( nn-11 ) )go to 50
+           40 continue
+           if( z( nn-3 )>z( nn-7 ) ) then
+              s = z( nn-3 )
+              z( nn-3 ) = z( nn-7 )
+              z( nn-7 ) = s
+           end if
+           t = half*( ( z( nn-7 )-z( nn-3 ) )+z( nn-5 ) )
+           if( z( nn-5 )>z( nn-3 )*tol2.and.t/=zero ) then
+              s = z( nn-3 )*( z( nn-5 ) / t )
+              if( s<=t ) then
+                 s = z( nn-3 )*( z( nn-5 ) /( t*( one+sqrt( one+s / t ) ) ) )
+              else
+                 s = z( nn-3 )*( z( nn-5 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+              end if
+              t = z( nn-7 ) + ( s+z( nn-5 ) )
+              z( nn-3 ) = z( nn-3 )*( z( nn-7 ) / t )
+              z( nn-7 ) = t
+           end if
+           z( 4*n0-7 ) = z( nn-7 ) + sigma
+           z( 4*n0-3 ) = z( nn-3 ) + sigma
+           n0 = n0 - 2
+           go to 10
+           50 continue
+           if( pp==2 )pp = 0
+           ! reverse the qd-array, if warranted.
+           if( dmin<=zero .or. n0<n0in ) then
+              if( cbias*z( 4*i0+pp-3 )<z( 4*n0+pp-3 ) ) then
+                 ipn4 = 4*( i0+n0 )
+                 do j4 = 4*i0, 2*( i0+n0-1 ), 4
+                    temp = z( j4-3 )
+                    z( j4-3 ) = z( ipn4-j4-3 )
+                    z( ipn4-j4-3 ) = temp
+                    temp = z( j4-2 )
+                    z( j4-2 ) = z( ipn4-j4-2 )
+                    z( ipn4-j4-2 ) = temp
+                    temp = z( j4-1 )
+                    z( j4-1 ) = z( ipn4-j4-5 )
+                    z( ipn4-j4-5 ) = temp
+                    temp = z( j4 )
+                    z( j4 ) = z( ipn4-j4-4 )
+                    z( ipn4-j4-4 ) = temp
+                 end do
+                 if( n0-i0<=4 ) then
+                    z( 4*n0+pp-1 ) = z( 4*i0+pp-1 )
+                    z( 4*n0-pp ) = z( 4*i0-pp )
+                 end if
+                 dmin2 = min( dmin2, z( 4*n0+pp-1 ) )
+                 z( 4*n0+pp-1 ) = min( z( 4*n0+pp-1 ), z( 4*i0+pp-1 ),z( 4*i0+pp+3 ) )
+                 z( 4*n0-pp ) = min( z( 4*n0-pp ), z( 4*i0-pp ),z( 4*i0-pp+4 ) )
+                 qmax = max( qmax, z( 4*i0+pp-3 ), z( 4*i0+pp+1 ) )
+                 dmin = -zero
+              end if
+           end if
+           ! choose a shift.
+           call stdlib_slasq4( i0, n0, z, pp, n0in, dmin, dmin1, dmin2, dn, dn1,dn2, tau, ttype, &
+                     g )
+           ! call dqds until dmin > 0.
+           70 continue
+           call stdlib_slasq5( i0, n0, z, pp, tau, sigma, dmin, dmin1, dmin2, dn,dn1, dn2, ieee, &
+                     eps )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           ! check status.
+           if( dmin>=zero .and. dmin1>=zero ) then
+              ! success.
+              go to 90
+           else if( dmin<zero .and. dmin1>zero .and.z( 4*( n0-1 )-pp )<tol*( sigma+dn1 ) .and.abs(&
+                      dn )<tol*sigma ) then
+              ! convergence hidden by negative dn.
+              z( 4*( n0-1 )-pp+2 ) = zero
+              dmin = zero
+              go to 90
+           else if( dmin<zero ) then
+              ! tau too big. select new tau and try again.
+              nfail = nfail + 1
+              if( ttype<-22 ) then
+                 ! failed twice. play it safe.
+                 tau = zero
+              else if( dmin1>zero ) then
+                 ! late failure. gives excellent shift.
+                 tau = ( tau+dmin )*( one-two*eps )
+                 ttype = ttype - 11
+              else
+                 ! early failure. divide by 4.
+                 tau = qurtr*tau
+                 ttype = ttype - 12
+              end if
+              go to 70
+           else if( stdlib_sisnan( dmin ) ) then
+              ! nan.
+              if( tau==zero ) then
+                 go to 80
+              else
+                 tau = zero
+                 go to 70
+              end if
+           else
+              ! possible underflow. play it safe.
+              go to 80
+           end if
+           ! risk of underflow.
+           80 continue
+           call stdlib_slasq6( i0, n0, z, pp, dmin, dmin1, dmin2, dn, dn1, dn2 )
+           ndiv = ndiv + ( n0-i0+2 )
+           iter = iter + 1
+           tau = zero
+           90 continue
+           if( tau<sigma ) then
+              desig = desig + tau
+              t = sigma + desig
+              desig = desig - ( t-sigma )
+           else
+              t = sigma + tau
+              desig = sigma - ( t-tau ) + desig
+           end if
+           sigma = t
+           return
+     end subroutine stdlib_slasq3
+
+     !> SLATDF: uses the LU factorization of the n-by-n matrix Z computed by
+     !> SGETC2 and computes a contribution to the reciprocal Dif-estimate
+     !> by solving Z * x = b for x, and choosing the r.h.s. b such that
+     !> the norm of x is as large as possible. On entry RHS = b holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by SGETC2 has the form Z = P*L*U*Q,
+     !> where P and Q are permutation matrices. L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_slatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(sp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           real(sp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 8
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(sp) :: bm, bp, pmone, sminu, splus, temp
+           ! Local Arrays 
+           integer(ilp) :: iwork(maxdim)
+           real(sp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_slaswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -one
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + one
+                 bm = rhs( j ) - one
+                 splus = one
+                 ! look-ahead for l-part rhs(1:n-1) = + or -1, splus and
+                 ! smin computed more efficiently than in bsolve [1].
+                 splus = splus + stdlib_sdot( n-j, z( j+1, j ), 1, z( j+1, j ), 1 )
+                 sminu = stdlib_sdot( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 )
+                 splus = splus*rhs( j )
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens
+                    ! we choose -1, thereafter +1. this is a simple way to
+                    ! get good estimates of matrices like byers well-known
+                    ! example (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = one
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_saxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u-part, look-ahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_scopy( n-1, rhs, 1, xp, 1 )
+              xp( n ) = rhs( n ) + one
+              rhs( n ) = rhs( n ) - one
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = one / z( i, i )
+                 xp( i ) = xp( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    xp( i ) = xp( i ) - xp( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( xp( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_scopy( n, xp, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_slaswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_slassq( n, rhs, 1, rdscal, rdsum )
+           else
+              ! ijob = 2, compute approximate nullvector xm of z
+              call stdlib_sgecon( 'I', n, z, ldz, one, temp, work, iwork, info )
+              call stdlib_scopy( n, work( n+1 ), 1, xm, 1 )
+              ! compute rhs
+              call stdlib_slaswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+              temp = one / sqrt( stdlib_sdot( n, xm, 1, xm, 1 ) )
+              call stdlib_sscal( n, temp, xm, 1 )
+              call stdlib_scopy( n, xm, 1, xp, 1 )
+              call stdlib_saxpy( n, one, rhs, 1, xp, 1 )
+              call stdlib_saxpy( n, -one, xm, 1, rhs, 1 )
+              call stdlib_sgesc2( n, z, ldz, rhs, ipiv, jpiv, temp )
+              call stdlib_sgesc2( n, z, ldz, xp, ipiv, jpiv, temp )
+              if( stdlib_sasum( n, xp, 1 )>stdlib_sasum( n, rhs, 1 ) )call stdlib_scopy( n, xp, 1,&
+                         rhs, 1 )
+              ! compute the sum of squares
+              call stdlib_slassq( n, rhs, 1, rdscal, rdsum )
+           end if
+           return
+     end subroutine stdlib_slatdf
+
+     !> SLATRD: reduces NB rows and columns of a real symmetric matrix A to
+     !> symmetric tridiagonal form by an orthogonal similarity
+     !> transformation Q**T * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', SLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', SLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by SSYTRD.
+
+     pure subroutine stdlib_slatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: e(*), tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           real(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    call stdlib_sgemv( 'NO TRANSPOSE', i, n-i, -one, a( 1, i+1 ),lda, w( i, iw+1 )&
+                              , ldw, one, a( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', i, n-i, -one, w( 1, iw+1 ),ldw, a( i, i+1 )&
+                              , lda, one, a( 1, i ), 1 )
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    call stdlib_slarfg( i-1, a( i-1, i ), a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = a( i-1, i )
+                    a( i-1, i ) = one
+                    ! compute w(1:i-1,i)
+                    call stdlib_ssymv( 'UPPER', i-1, one, a, lda, a( 1, i ), 1,zero, w( 1, iw ), &
+                              1 )
+                    if( i<n ) then
+                       call stdlib_sgemv( 'TRANSPOSE', i-1, n-i, one, w( 1, iw+1 ),ldw, a( 1, i ),&
+                                  1, zero, w( i+1, iw ), 1 )
+                       call stdlib_sgemv( 'NO TRANSPOSE', i-1, n-i, -one,a( 1, i+1 ), lda, w( i+1,&
+                                  iw ), 1, one,w( 1, iw ), 1 )
+                       call stdlib_sgemv( 'TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( 1, i ), &
+                                 1, zero, w( i+1, iw ), 1 )
+                       call stdlib_sgemv( 'NO TRANSPOSE', i-1, n-i, -one,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, one,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_sscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -half*tau( i-1 )*stdlib_sdot( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_saxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              do i = 1, nb
+                 ! update a(i:n,i)
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, one, a( i, i ), 1 )
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, one, a( i, i ), 1 )
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    call stdlib_slarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute w(i+1:n,i)
+                    call stdlib_ssymv( 'LOWER', n-i, one, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero,&
+                               w( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', n-i, i-1, one, w( i+1, 1 ), ldw,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-i, i-1, -one, a( i+1, 1 ),lda, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', n-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, w( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-i, i-1, -one, w( i+1, 1 ),ldw, w( 1, i ),&
+                               1, one, w( i+1, i ), 1 )
+                    call stdlib_sscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -half*tau( i )*stdlib_sdot( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_saxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_slatrd
+
+     !> SLATRZ: factors the M-by-(M+L) real upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z, by means
+     !> of orthogonal transformations.  Z is an (M+L)-by-(M+L) orthogonal
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_slatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Executable Statements 
+           ! test the input arguments
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_slarfg( l+1, a( i, i ), a( i, n-l+1 ), lda, tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_slarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,tau( i ), a( 1, i ), &
+                        lda, work )
+           end do
+           return
+     end subroutine stdlib_slatrz
+
+     !> SORBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned orthogonal matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**T
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See SORCSD
+     !> for details.)
+     !> The orthogonal matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           real(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(sp), parameter :: realone = 1.0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(sp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,sin
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_sscal( p-i+1, z1, x11(i,i), 1 )
+                 else
+                    call stdlib_sscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), 1 )
+                    call stdlib_saxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i,i-1),1, x11(i,i), 1 )
+                              
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_sscal( m-p-i+1, z2, x21(i,i), 1 )
+                 else
+                    call stdlib_sscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), 1 )
+                    call stdlib_saxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i,i-1),1, x21(i,i), &
+                              1 )
+                 end if
+                 theta(i) = atan2( stdlib_snrm2( m-p-i+1, x21(i,i), 1 ),stdlib_snrm2( p-i+1, x11(&
+                           i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_slarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if( p == i ) then
+                    call stdlib_slarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = one
+                 if ( m-p > i ) then
+                    call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i,i), 1, taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_slarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i),x11(i,i+1), ldx11, &
+                              work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_slarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1, taup1(i),x12(i,i), ldx12,&
+                               work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_slarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21,&
+                               work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_slarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1, taup2(i),x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_sscal( q-i, -z1*z3*sin(theta(i)), x11(i,i+1),ldx11 )
+                    call stdlib_saxpy( q-i, z2*z3*cos(theta(i)), x21(i,i+1), ldx21,x11(i,i+1), &
+                              ldx11 )
+                 end if
+                 call stdlib_sscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), ldx12 )
+                 call stdlib_saxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), ldx22,x12(i,i), ldx12 &
+                           )
+                 if( i < q )phi(i) = atan2( stdlib_snrm2( q-i, x11(i,i+1), ldx11 ),stdlib_snrm2( &
+                           m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    if ( q-i == 1 ) then
+                       call stdlib_slarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_slarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = one
+                 end if
+                 if ( q+i-1 < m ) then
+                    if ( m-q == i ) then
+                       call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_slarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_slarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_slarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_slarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_sscal( m-q-i+1, -z1*z4, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_slarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_slarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_sscal( m-p-q-i+1, z2*z4, x22(q+i,p+i), ldx22 )
+                 if ( i == m-p-q ) then
+                    call stdlib_slarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i),ldx22, tauq2(p+i) )
+                              
+                 else
+                    call stdlib_slarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i)&
+                               )
+                 end if
+                 x22(q+i,p+i) = one
+                 if ( i < m-p-q ) then
+                    call stdlib_slarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), &
+                              x22(q+i+1,p+i), ldx22, work )
+                 end if
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_sscal( p-i+1, z1, x11(i,i), ldx11 )
+                 else
+                    call stdlib_sscal( p-i+1, z1*cos(phi(i-1)), x11(i,i), ldx11 )
+                    call stdlib_saxpy( p-i+1, -z1*z3*z4*sin(phi(i-1)), x12(i-1,i),ldx12, x11(i,i),&
+                               ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_sscal( m-p-i+1, z2, x21(i,i), ldx21 )
+                 else
+                    call stdlib_sscal( m-p-i+1, z2*cos(phi(i-1)), x21(i,i), ldx21 )
+                    call stdlib_saxpy( m-p-i+1, -z2*z3*z4*sin(phi(i-1)), x22(i-1,i),ldx22, x21(i,&
+                              i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_snrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_snrm2( p-i+1, &
+                           x11(i,i), ldx11 ) )
+                 call stdlib_slarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = one
+                 if ( i == m-p ) then
+                    call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = one
+                 if ( q > i ) then
+                    call stdlib_slarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), &
+                              ldx11, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_slarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11,taup1(i), x12(i,i), &
+                              ldx12, work )
+                 end if
+                 if ( q > i ) then
+                    call stdlib_slarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                              ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_slarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_sscal( q-i, -z1*z3*sin(theta(i)), x11(i+1,i), 1 )
+                    call stdlib_saxpy( q-i, z2*z3*cos(theta(i)), x21(i+1,i), 1,x11(i+1,i), 1 )
+                              
+                 end if
+                 call stdlib_sscal( m-q-i+1, -z1*z4*sin(theta(i)), x12(i,i), 1 )
+                 call stdlib_saxpy( m-q-i+1, z2*z4*cos(theta(i)), x22(i,i), 1,x12(i,i), 1 )
+                           
+                 if( i < q )phi(i) = atan2( stdlib_snrm2( q-i, x11(i+1,i), 1 ),stdlib_snrm2( m-q-&
+                           i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    if ( q-i == 1) then
+                       call stdlib_slarfgp( q-i, x11(i+1,i), x11(i+1,i), 1,tauq1(i) )
+                    else
+                       call stdlib_slarfgp( q-i, x11(i+1,i), x11(i+2,i), 1,tauq1(i) )
+                    end if
+                    x11(i+1,i) = one
+                 end if
+                 if ( m-q > i ) then
+                    call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1,tauq2(i) )
+                 else
+                    call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i,i), 1,tauq2(i) )
+                 end if
+                 x12(i,i) = one
+                 if( i < q ) then
+                    call stdlib_slarf( 'L', q-i, p-i, x11(i+1,i), 1, tauq1(i),x11(i+1,i+1), ldx11,&
+                               work )
+                    call stdlib_slarf( 'L', q-i, m-p-i, x11(i+1,i), 1, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 call stdlib_slarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12, &
+                           work )
+                 if ( m-p-i > 0 ) then
+                    call stdlib_slarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1, tauq2(i),x22(i,i+1), &
+                              ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_sscal( m-q-i+1, -z1*z4, x12(i,i), 1 )
+                 call stdlib_slarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = one
+                 if ( p > i ) then
+                    call stdlib_slarf( 'L', m-q-i+1, p-i, x12(i,i), 1, tauq2(i),x12(i,i+1), ldx12,&
+                               work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_slarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1, tauq2(i),&
+                           x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_sscal( m-p-q-i+1, z2*z4, x22(p+i,q+i), 1 )
+                 if ( m-p-q == i ) then
+                    call stdlib_slarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i,q+i), 1,tauq2(p+i) )
+                              
+                    x22(p+i,q+i) = one
+                 else
+                    call stdlib_slarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                              
+                    x22(p+i,q+i) = one
+                    call stdlib_slarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,tauq2(p+i), x22(p+&
+                              i,q+i+1), ldx22, work )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sorbdb
+
+     !> SORBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_sorbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_sorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_snrm2(m1,x1,incx1) /= zero.or. stdlib_snrm2(m2,x2,incx2) /= zero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              x1(i) = one
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              call stdlib_sorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_snrm2(m1,x1,incx1) /= zero.or. stdlib_snrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = zero
+              end do
+              do j = 1, m2
+                 x2(j) = zero
+              end do
+              x2(i) = one
+              call stdlib_sorbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_snrm2(m1,x1,incx1) /= zero.or. stdlib_snrm2(m2,x2,incx2) /= zero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_sorbdb5
+
+     !> SORCSD: computes the CS decomposition of an M-by-M partitioned
+     !> orthogonal matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**T
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_sorcsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: theta(*)
+           real(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           real(sp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Arrays 
+           real(sp) :: dummy(1)
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_sorcsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, iwork, info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_sorcsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iphi = 2
+              itaup1 = iphi + max( 1, q - 1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_sorgqr( m-q, m-q, m-q, dummy, max(1,m-q), dummy, work, -1,childinfo )
+                        
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_sorglq( m-q, m-q, m-q, dummy, max(1,m-q), dummy, work, -1,childinfo )
+                        
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, dummy, dummy, dummy, dummy, dummy,dummy,work,-1,childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              ib11d = itauq2 + max( 1, m - q )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,dummy, dummy, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, dummy, dummy, dummy, dummy, dummy, dummy,&
+                        dummy, dummy, work, -1, childinfo )
+              lbbcsdworkopt = int( work(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkopt, ibbcsd + lbbcsdworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -22
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_sorbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, work(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_slacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_sorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_slacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 call stdlib_slacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), ldv2t )
+                           
+                 call stdlib_sorglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                           lorglqwork, info )
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_slacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_sorglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_slacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_sorglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_slacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_sorgqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_slacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 call stdlib_slacpy( 'L', m-p-q, m-p-q, x22(p+1,q+1), ldx22,v2t(p+1,p+1), ldv2t )
+                           
+                 call stdlib_sorgqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_sbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,work(iphi), u1,&
+            ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, work(ib11d), work(ib11e), work(ib12d),work(&
+            ib12e), work(ib21d), work(ib21e), work(ib22d),work(ib22e), work(ibbcsd), lbbcsdwork, &
+                      info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_slapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_slapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_slapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_sorcsd
+     end subroutine stdlib_sorcsd
+
+     !> SORGHR: generates a real orthogonal matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> SGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_sorghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'SORGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors one
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = zero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = zero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = zero
+              end do
+              a( j, j ) = one
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_sorgqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sorghr
+
+     !> SORHR_COL: takes an M-by-N real matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as SGEQRT).
+
+     pure subroutine stdlib_sorhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the orthogonal
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_slaorhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_strsm( 'R', 'U', 'N', 'N', m-n, n, one, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_scopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==one ) then
+                    call stdlib_sscal( j-jbtemp1, -one, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_strsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_strsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = zero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_strsm( 'R', 'L', 'T', 'U', jnb, jnb, one,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_sorhr_col
+
+     !> SORMHR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by SGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_sormhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_sormqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormhr
+
+     !> SPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite band matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by SPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_spbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_slatbs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_slatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_slatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, work( 2*n+1 ),info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_slatbs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, work( 2*n+1 ),info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_isamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_srscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_spbcon
+
+     !> SPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_spbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_ssbmv( uplo, n, kd, -one, ab, ldab, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( ab( kd+1, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ab( 1, k ) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       work( i ) = work( i ) + abs( ab( l+i, k ) )*xk
+                       s = s + abs( ab( l+i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_spbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_spbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( n+i )*work( i )
+                    end do
+                    call stdlib_spbtrs( uplo, n, kd, 1, afb, ldafb, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_spbrfs
+
+     !> SPFTRS: solves a system of linear equations A*X = B with a symmetric
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**T*U or A = L*L**T computed by SPFTRF.
+
+     pure subroutine stdlib_spftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: a(0:*)
+           real(sp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_stfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_stfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+           else
+              call stdlib_stfsm( transr, 'L', uplo, 'T', 'N', n, nrhs, one, a, b,ldb )
+              call stdlib_stfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, one, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_spftrs
+
+     !> SPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite matrix using the
+     !> Cholesky factorization A = U**T*U or A = L*L**T computed by SPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_spocon( uplo, n, a, lda, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_slatrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_slatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_slatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_slatrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n, a,lda, work, &
+                           scaleu, work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_isamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_srscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_spocon
+
+     !> SPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_sporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_ssymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_spotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_spotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_spotrs( uplo, n, 1, af, ldaf, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_sporfs
+
+     !> SPOTF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U ,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_spotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(sp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**t *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_sdot( j-1, a( 1, j ), 1, a( 1, j ), 1 )
+                 if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_sgemv( 'TRANSPOSE', j-1, n-j, -one, a( 1, j+1 ),lda, a( 1, j ), 1,&
+                               one, a( j, j+1 ), lda )
+                    call stdlib_sscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**t.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = a( j, j ) - stdlib_sdot( j-1, a( j, 1 ), lda, a( j, 1 ),lda )
+                 if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-j, j-1, -one, a( j+1, 1 ),lda, a( j, 1 ),&
+                               lda, one, a( j+1, j ), 1 )
+                    call stdlib_sscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_spotf2
+
+     !> SPOTRF2: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then call itself to factor A22.
+
+     pure recursive subroutine stdlib_spotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              if( a( 1, 1 )<=zero.or.stdlib_sisnan( a( 1, 1 ) ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( a( 1, 1 ) )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_spotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**t*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_strsm( 'L', 'U', 'T', 'N', n1, n2, one,a( 1, 1 ), lda, a( 1, n1+1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_ssyrk( uplo, 'T', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_spotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**t
+              else
+                 ! update and scale a21
+                 call stdlib_strsm( 'R', 'L', 'T', 'N', n2, n1, one,a( 1, 1 ), lda, a( n1+1, 1 ), &
+                           lda )
+                 ! update and factor a22
+                 call stdlib_ssyrk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_spotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_spotrf2
+
+     !> SPOTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by SPOTRF.
+
+     pure subroutine stdlib_spotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_strtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**t or inv(l)**t * inv(l).
+           call stdlib_slauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_spotri
+
+     !> SPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric positive definite packed matrix using
+     !> the Cholesky factorization A = U**T*U or A = L*L**T computed by
+     !> SPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_sppcon( uplo, n, ap, anorm, rcond, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(sp) :: ainvnm, scale, scalel, scaleu, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**t).
+                 call stdlib_slatps( 'UPPER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scalel,&
+                            work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_slatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, work( 2*n+1 ), info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_slatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, work( 2*n+1 ), info )
+                 normin = 'Y'
+                 ! multiply by inv(l**t).
+                 call stdlib_slatps( 'LOWER', 'TRANSPOSE', 'NON-UNIT', normin, n,ap, work, scaleu,&
+                            work( 2*n+1 ), info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_isamax( n, work, 1 )
+                 if( scale<abs( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_srscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_sppcon
+
+     !> SPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_spprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_sspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_spptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_spptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_spptrs( uplo, n, 1, afp, work( n+1 ), n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_spprfs
+
+     !> SPPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_sppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_spptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_spptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_sppsv
+
+     !> SPPSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_sppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: afp(*), ap(*), b(ldb,*), s(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_sppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_slaqsp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t * u or a = l * l**t.
+              call stdlib_scopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_spptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sppcon( uplo, n, afp, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_spptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_spprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, iwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_sppsvx
+
+     !> SPPTRI: computes the inverse of a real symmetric positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by SPPTRF.
+
+     pure subroutine stdlib_spptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(sp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_stptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**t.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_sspr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = ap( jj )
+                 call stdlib_sscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**t * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = stdlib_sdot( n-j+1, ap( jj ), 1, ap( jj ), 1 )
+                 if( j<n )call stdlib_stpmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', n-j,ap( jjn ), ap( &
+                           jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_spptri
+
+     !> SPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_spstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: ajj, sstop, stemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           pvt = 1
+           ajj = a( pvt, pvt )
+           do i = 2, n
+              if( a( i, i )>ajj ) then
+                 pvt = i
+                 ajj = a( pvt, pvt )
+              end if
+           end do
+           if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 170
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              sstop = n * stdlib_slamch( 'EPSILON' ) * ajj
+           else
+              sstop = tol
+           end if
+           ! set first half of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**t * u
+              loop_130: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( j-1, i )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_sswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_sswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    call stdlib_sswap( pvt-j-1, a( j, j+1 ), lda, a( j+1, pvt ), 1 )
+                    ! swap dot products and piv
+                    stemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = stemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_sgemv( 'TRANS', j-1, n-j, -one, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              one, a( j, j+1 ), lda )
+                    call stdlib_sscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_130
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second half of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) + a( i, j-1 )**2
+                    end if
+                    work( n+i ) = a( i, i ) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 160
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_sswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_sswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    call stdlib_sswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ), lda )
+                    ! swap dot products and piv
+                    stemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = stemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_sgemv( 'NO TRANS', n-j, j-1, -one, a( j+1, 1 ), lda,a( j, 1 ), &
+                              lda, one, a( j+1, j ), 1 )
+                    call stdlib_sscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_150
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 170
+           160 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           170 continue
+           return
+     end subroutine stdlib_spstf2
+
+     !> SPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a real symmetric positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**T * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_spstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: ajj, sstop, stemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'SPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_spstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 200
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              pvt = 1
+              ajj = a( pvt, pvt )
+              do i = 2, n
+                 if( a( i, i )>ajj ) then
+                    pvt = i
+                    ajj = a( pvt, pvt )
+                 end if
+              end do
+              if( ajj<=zero.or.stdlib_sisnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 200
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 sstop = n * stdlib_slamch( 'EPSILON' ) * ajj
+              else
+                 sstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**t * u
+                 loop_140: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_130: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( j-1, i )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_sswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_sswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          call stdlib_sswap( pvt-j-1, a( j, j+1 ), lda,a( j+1, pvt ), 1 )
+                          ! swap dot products and piv
+                          stemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = stemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_sgemv( 'TRANS', j-k, n-j, -one, a( k, j+1 ),lda, a( k, j ), &
+                                    1, one, a( j, j+1 ),lda )
+                          call stdlib_sscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_130
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_ssyrk( 'UPPER', 'TRANS', n-j+1, jb, -one,a( k, j ), lda, one, &
+                                 a( j, j ), lda )
+                    end if
+                 end do loop_140
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**t
+                 loop_180: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first half of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_170: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second half of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) + a( i, j-1 )**2
+                          end if
+                          work( n+i ) = a( i, i ) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=sstop.or.stdlib_sisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 190
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_sswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_sswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          call stdlib_sswap( pvt-j-1, a( j+1, j ), 1, a( pvt, j+1 ),lda )
+                          ! swap dot products and piv
+                          stemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = stemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_sgemv( 'NO TRANS', n-j, j-k, -one,a( j+1, k ), lda, a( j, k &
+                                    ), lda, one,a( j+1, j ), 1 )
+                          call stdlib_sscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_170
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_ssyrk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_180
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_spstrf
+
+     !> SPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the L*D*L**T factorization of A computed by SPTTRF.  D is a
+     !> diagonal matrix specified in the vector D, L is a unit bidiagonal
+     !> matrix whose subdiagonal is specified in the vector E, and X and B
+     !> are N by NRHS matrices.
+
+     pure subroutine stdlib_spttrs( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(in) :: d(*), e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'SPTTRS', ' ', n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_sptts2( n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_sptts2( n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_spttrs
+
+     !> SSB2ST_KERNELS: is an internal routine used by the SSYTRD_SB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_ssb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           real(sp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = ( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = zero
+                   end do
+                   ctmp = ( a( ofdpos, st ) )
+                   call stdlib_slarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_slarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_slarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_slarfx( 'LEFT', ln, lm, v( vpos ),( tau( taupos ) ),a( dpos-nb,&
+                                  j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )          =( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = zero
+                       end do
+                       ctmp = ( a( dpos-nb, j1 ) )
+                       call stdlib_slarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_slarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = one
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = zero
+                   end do
+                   call stdlib_slarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_slarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_slarfy( uplo, lm, v( vpos ), 1,( tau( taupos ) ),a( dpos, st ), &
+                             lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_slarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = one
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = zero
+                       end do
+                       call stdlib_slarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_slarfx( 'LEFT', lm, ln-1, v( vpos ),( tau( taupos ) ),a( dpos+&
+                                 nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_ssb2st_kernels
+
+     !> SSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric packed matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by SSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_sspcon( uplo, n, ap, ipiv, anorm, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_ssptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_sspcon
+
+     !> SSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_ssprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_sspmv( uplo, n, -one, ap, x( 1, j ), 1, one, work( n+1 ),1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + abs( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( ap( ik ) )*xk
+                       s = s + abs( ap( ik ) )*abs( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    work( k ) = work( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_ssptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n, info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_ssptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_ssptrs( uplo, n, 1, afp, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_ssprfs
+
+     !> SSPSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_sspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_ssptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_ssptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_sspsv
+
+     !> SSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_sspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: afp(*)
+           real(sp), intent(in) :: ap(*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_scopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_ssptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slansp( 'I', uplo, n, ap, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sspcon( uplo, n, afp, ipiv, anorm, rcond, work, iwork, info )
+           ! compute the solution vectors x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_ssptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_ssprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_sspsvx
+
+     !> SSPTRD: reduces a real symmetric matrix A stored in packed form to
+     !> symmetric tridiagonal form T by an orthogonal similarity
+     !> transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_ssptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           real(sp) :: alpha, taui
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_slarfg( i, ap( i1+i-1 ), ap( i1 ), 1, taui )
+                 e( i ) = ap( i1+i-1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_sspmv( uplo, i, taui, ap, ap( i1 ), 1, zero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_sdot( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_saxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_sspr2( uplo, i, -one, ap( i1 ), 1, tau, 1, ap )
+                    ap( i1+i-1 ) = e( i )
+                 end if
+                 d( i+1 ) = ap( i1+i )
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = ap( 1 )
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_slarfg( n-i, ap( ii+1 ), ap( ii+2 ), 1, taui )
+                 e( i ) = ap( ii+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = one
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_sspmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,zero, tau( i ), &
+                              1 )
+                    ! compute  w := y - 1/2 * tau * (y**t *v) * v
+                    alpha = -half*taui*stdlib_sdot( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_saxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_sspr2( uplo, n-i, -one, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                    ap( ii+1 ) = e( i )
+                 end if
+                 d( i ) = ap( ii )
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = ap( ii )
+           end if
+           return
+     end subroutine stdlib_ssptrd
+
+     !> SSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+
+     pure subroutine stdlib_sstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(in) :: d(*), e(*), w(*)
+           real(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: odm3 = 1.0e-3_sp
+           real(sp), parameter :: odm1 = 1.0e-1_sp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, nblk, nrmchk
+           real(sp) :: ctr, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, stpcrt, tol, xj, &
+                     xjm
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_slamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_slarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_160: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              stpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_150: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_160
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 120
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_slarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_scopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_scopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_scopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_slagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 100
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_sscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_slagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 90
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ctr = -stdlib_sdot( blksiz, work( indrv1+1 ), 1, z( b1, i ),1 )
+                       call stdlib_saxpy( blksiz, ctr, z( b1, i ), 1,work( indrv1+1 ), 1 )
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 90 continue
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<stpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 110
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 100 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 110 continue
+                 scl = one / stdlib_snrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_isamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_sscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 120 continue
+                 do i = 1, n
+                    z( i, j ) = zero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = work( indrv1+i )
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_150
+           end do loop_160
+           return
+     end subroutine stdlib_sstein
+
+     !> SSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band symmetric matrix can also be found
+     !> if SSYTRD or SSPTRD or SSBTRD has been used to reduce this matrix to
+     !> tridiagonal form.
+
+     pure subroutine stdlib_ssteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(sp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = one
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_slamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_slanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_slae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_slaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_slasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_slae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_slapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_slartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_slasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_slae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_slaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_slasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_slae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_slapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_slartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_slasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_slascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_slascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_slascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_slascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 190
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_slasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           190 continue
+           return
+     end subroutine stdlib_ssteqr
+
+     !> SSTERF: computes all eigenvalues of a symmetric tridiagonal matrix
+     !> using the Pal-Walker-Kahan variant of the QL or QR algorithm.
+
+     pure subroutine stdlib_ssterf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iscale, jtot, l, l1, lend, lendsv, lsv, m, nmaxit
+           real(sp) :: alpha, anorm, bb, c, eps, eps2, gamma, oldc, oldgam, p, r, rt1, rt2, rte, &
+                     s, safmax, safmin, sigma, ssfmax, ssfmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! quick return if possible
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SSTERF', -info )
+              return
+           end if
+           if( n<=1 )return
+           ! determine the unit roundoff for this environment.
+           eps = stdlib_slamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues of the tridiagonal matrix.
+           nmaxit = n*maxit
+           sigma = zero
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           10 continue
+           if( l1>n )go to 170
+           if( l1>1 )e( l1-1 ) = zero
+           do m = l1, n - 1
+              if( abs( e( m ) )<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) &
+                        then
+                 e( m ) = zero
+                 go to 30
+              end if
+           end do
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_slanst( 'M', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_slascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           do i = l, lend - 1
+              e( i ) = e( i )**2
+           end do
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>=l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              50 continue
+              if( l/=lend ) then
+                 do m = l, lend - 1
+                    if( abs( e( m ) )<=eps2*abs( d( m )*d( m+1 ) ) )go to 70
+                 end do
+              end if
+              m = lend
+              70 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 90
+              ! if remaining matrix is 2 by 2, use stdlib_slae2 to compute its
+              ! eigenvalues.
+              if( m==l+1 ) then
+                 rte = sqrt( e( l ) )
+                 call stdlib_slae2( d( l ), rte, d( l+1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 50
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l ) )
+              sigma = ( d( l+1 )-p ) / ( two*rte )
+              r = stdlib_slapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m - 1, l, -1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m-1 )e( i+1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i+1 ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l ) = s*p
+              d( l ) = sigma + gamma
+              go to 50
+              ! eigenvalue found.
+              90 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 50
+              go to 150
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              100 continue
+              do m = l, lend + 1, -1
+                 if( abs( e( m-1 ) )<=eps2*abs( d( m )*d( m-1 ) ) )go to 120
+              end do
+              m = lend
+              120 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 140
+              ! if remaining matrix is 2 by 2, use stdlib_slae2 to compute its
+              ! eigenvalues.
+              if( m==l-1 ) then
+                 rte = sqrt( e( l-1 ) )
+                 call stdlib_slae2( d( l ), rte, d( l-1 ), rt1, rt2 )
+                 d( l ) = rt1
+                 d( l-1 ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 100
+                 go to 150
+              end if
+              if( jtot==nmaxit )go to 150
+              jtot = jtot + 1
+              ! form shift.
+              rte = sqrt( e( l-1 ) )
+              sigma = ( d( l-1 )-p ) / ( two*rte )
+              r = stdlib_slapy2( sigma, one )
+              sigma = p - ( rte / ( sigma+sign( r, sigma ) ) )
+              c = one
+              s = zero
+              gamma = d( m ) - sigma
+              p = gamma*gamma
+              ! inner loop
+              do i = m, l - 1
+                 bb = e( i )
+                 r = p + bb
+                 if( i/=m )e( i-1 ) = s*r
+                 oldc = c
+                 c = p / r
+                 s = bb / r
+                 oldgam = gamma
+                 alpha = d( i+1 )
+                 gamma = c*( alpha-sigma ) - s*oldgam
+                 d( i ) = oldgam + ( alpha-gamma )
+                 if( c/=zero ) then
+                    p = ( gamma*gamma ) / c
+                 else
+                    p = oldc*bb
+                 end if
+              end do
+              e( l-1 ) = s*p
+              d( l ) = sigma + gamma
+              go to 100
+              ! eigenvalue found.
+              140 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 100
+              go to 150
+           end if
+           ! undo scaling if necessary
+           150 continue
+           if( iscale==1 )call stdlib_slascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           if( iscale==2 )call stdlib_slascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), &
+                     n, info )
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot<nmaxit )go to 10
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           go to 180
+           ! sort eigenvalues in increasing order.
+           170 continue
+           call stdlib_slasrt( 'I', n, d, info )
+           180 continue
+           return
+     end subroutine stdlib_ssterf
+
+     !> SSTEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix A.
+
+     pure subroutine stdlib_sstev( jobz, n, d, e, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: imax, iscale
+           real(sp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_slanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( n, sigma, d, 1 )
+              call stdlib_sscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, d, e, info )
+           else
+              call stdlib_ssteqr( 'I', n, d, e, z, ldz, work, info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, d, 1 )
+           end if
+           return
+     end subroutine stdlib_sstev
+
+     !> SSTEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix A.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_sstevx( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, imax, indibl, indisp, indiwo, indwrk, iscale, itmp1, j, jj, &
+                     nsplit
+           real(sp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if ( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           endif
+           tnrm = stdlib_slanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( n, sigma, d, 1 )
+              call stdlib_sscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! if all eigenvalues are desired and abstol is less than zero, then
+           ! call stdlib_ssterf or stdlib_ssteqr.  if this fails for some eigenvalue, then
+           ! try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_scopy( n, d, 1, w, 1 )
+              call stdlib_scopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              indwrk = n + 1
+              if( .not.wantz ) then
+                 call stdlib_ssterf( n, w, work, info )
+              else
+                 call stdlib_ssteqr( 'I', n, w, work, z, ldz, work( indwrk ), info )
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indwrk = 1
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ),work( indwrk ), iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_sstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work( &
+                        indwrk ), iwork( indiwo ), ifail,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sstevx
+
+     !> SSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by SSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_ssycon( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_ssytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_ssycon
+
+     !> SSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a real symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by SSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_ssycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_ssytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_ssycon_rook
+
+     !> SSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_ssyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_ssymv( uplo, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    do i = 1, k - 1
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = abs( x( k, j ) )
+                    work( k ) = work( k ) + abs( a( k, k ) )*xk
+                    do i = k + 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_ssytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_ssytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_ssytrs( uplo, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_ssyrfs
+
+     !> SSYSV_RK: computes the solution to a real system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> SSYTRF_RK is called to compute the factorization of a real
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine SSYTRS_3.
+
+     pure subroutine stdlib_ssysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_ssytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**t)*(p**t) or
+           ! a = p*u*d*(u**t)*(p**t).
+           call stdlib_ssytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_ssytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssysv_rk
+
+     !> SSYSV_ROOK: computes the solution to a real system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> SSYTRF_ROOK is called to compute the factorization of a real
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling SSYTRS_ROOK.
+
+     pure subroutine stdlib_ssysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_ssytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_ssytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_ssytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssysv_rook
+
+     !> SSYTD2: reduces a real symmetric matrix A to symmetric tridiagonal
+     !> form T by an orthogonal similarity transformation: Q**T * A * Q = T.
+
+     pure subroutine stdlib_ssytd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), e(*), tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(sp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(1:i-1,i+1)
+                 call stdlib_slarfg( i, a( i, i+1 ), a( 1, i+1 ), 1, taui )
+                 e( i ) = a( i, i+1 )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = one
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_ssymv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, zero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_sdot( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_saxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_ssyr2( uplo, i, -one, a( 1, i+1 ), 1, tau, 1, a,lda )
+                    a( i, i+1 ) = e( i )
+                 end if
+                 d( i+1 ) = a( i+1, i+1 )
+                 tau( i ) = taui
+              end do
+              d( 1 ) = a( 1, 1 )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**t
+                 ! to annihilate a(i+2:n,i)
+                 call stdlib_slarfg( n-i, a( i+1, i ), a( min( i+2, n ), i ), 1,taui )
+                 e( i ) = a( i+1, i )
+                 if( taui/=zero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = one
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_ssymv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, zero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**t * v) * v
+                    alpha = -half*taui*stdlib_sdot( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_saxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**t - w * v**t
+                    call stdlib_ssyr2( uplo, n-i, -one, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 ),&
+                               lda )
+                    a( i+1, i ) = e( i )
+                 end if
+                 d( i ) = a( i, i )
+                 tau( i ) = taui
+              end do
+              d( n ) = a( n, n )
+           end if
+           return
+     end subroutine stdlib_ssytd2
+
+     !> SSYTF2: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_ssytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sevten = 17.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(sp) :: absakk, alpha, colmax, d11, d12, d21, d22, r1, rowmax, t, wk, wkm1, &
+                     wkp1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_isamax( k-1, a( 1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_sisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_isamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_isamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_sswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_sswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = one / a( k, k )
+                    call stdlib_ssyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_sscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = one / ( d11*d22-one )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_isamax( n-k, a( k+1, k ), 1 )
+                 colmax = abs( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_sisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_isamax( imax-k, a( imax, k ), lda )
+                    rowmax = abs( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_isamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, abs( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_sswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_sswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       d11 = one / a( k, k )
+                       call stdlib_ssyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_sscal( n-k, d11, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( (a(k) a(k+1))*d(k)**(-1) ) * (a(k) a(k+1))**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = one / ( d11*d22-one )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_ssytf2
+
+     !> SSYTRD: reduces a real symmetric matrix A to real symmetric
+     !> tridiagonal form T by an orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_ssytrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), e(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'SSYTRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'SSYTRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_slatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_ssyr2k( uplo, 'NO TRANSPOSE', i-1, nb, -one, a( 1, i ),lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_ssytd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_slatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+ib:n,i+ib:n), using
+                 ! an update of the form:  a := a - v*w**t - w*v**t
+                 call stdlib_ssyr2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -one,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = a( j, j )
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_ssytd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssytrd
+
+     !> SSYTRD_SB2ST: reduces a real symmetric band matrix A to real symmetric
+     !> tridiagonal form T by a orthogonal similarity transformation:
+     !> Q**T * A * Q = T.
+
+     pure subroutine stdlib_ssytrd_sb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), e(*)
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: rzero = 0.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sisev, sizetau, ldv, lhmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'SSYTRD_SB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'SSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'SSYTRD_SB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRD_SB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sisev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! real because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = ( ab( abdpos, i ) )
+               end do
+               if( upper ) then
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i+1 ) )
+                   end do
+               else
+                   do i = 1, n-1
+                      e( i ) = ( ab( abofdpos, i ) )
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the symmetric band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=sp)/real(kd,KIND=sp) )
+           stepercol = ceiling( real(shift,KIND=sp)/real(grsiz,KIND=sp) )
+           thgrnb    = ceiling( real(n-1,KIND=sp)/real(thgrsiz,KIND=sp) )
+           call stdlib_slacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_slaset( "A", kd,   n, zero, zero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_ssb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_ssb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                             indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           do i = 1, n
+               d( i ) = ( work( dpos+(i-1)*lda ) )
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be zero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+i*lda ) )
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = ( work( ofdpos+(i-1)*lda ) )
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ssytrd_sb2st
+
+     !> SSYTRF: computes the factorization of a real symmetric matrix A using
+     !> the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U**T*D*U  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_ssytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'SSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'SSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_slasyf( uplo, k, nb, kb, a, lda, ipiv, work, ldwork,iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_ssytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_slasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_slasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, ldwork, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_ssytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssytrf
+
+     !> STBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_stbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ab(ldab,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=sp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_slantb( norm, uplo, diag, n, kd, ab, ldab, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_slatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_slatbs( uplo, 'TRANSPOSE', diag, normin, n, kd, ab,ldab, work, &
+                              scale, work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_isamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_srscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_stbcon
+
+     !> STFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_stftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_strtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'L', 'N', diag, n2, n1, -one, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_strtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'U', 'T', diag, n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_strtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'L', 'T', diag, n1, n2, -one, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_strtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'U', 'N', diag, n1, n2, one, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_strtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'U', 'N', diag, n1, n2, -one, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_strtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'L', 'T', diag, n1, n2, one, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_strtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'U', 'T', diag, n2, n1, -one,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_strtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'L', 'N', diag, n2, n1, one,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_strtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'L', 'N', diag, k, k, -one, a( 1 ),n+1, a( k+1 ), n+1 &
+                              )
+                    call stdlib_strtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'U', 'T', diag, k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_strtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'L', 'T', diag, k, k, -one, a( k+1 ),n+1, a( 0 ), n+1 &
+                              )
+                    call stdlib_strtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'U', 'N', diag, k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_strtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'U', 'N', diag, k, k, -one, a( k ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_strtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'L', 'T', diag, k, k, one, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_strtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_strmm( 'R', 'U', 'T', diag, k, k, -one,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_strtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_strmm( 'L', 'L', 'N', diag, k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_stftri
+
+     !> STGSY2: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                (1)
+     !> D * R - L * E = scale * F,
+     !> using Level 1 and 2 BLAS. where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively, with real entries. (A, D) and (B, E)
+     !> must be in generalized Schur canonical form, i.e. A, B are upper
+     !> quasi triangular and D, E are upper triangular. The solution (R, L)
+     !> overwrites (C, F). 0 <= SCALE <= 1 is an output scaling factor
+     !> chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Z*x = scale*b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ],
+     !> Ik is the identity matrix of size k and X**T is the transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> In the process of solving (1), we solve a number of such systems
+     !> where Dim(In), Dim(In) = 1 or 2.
+     !> If TRANS = 'T', solve the transposed system Z**T*y = scale*b for y,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R  + D**T * L   = scale * C           (3)
+     !> R  * B**T + L  * E**T  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> sigma_min(Z) using reverse communication with SLACON.
+     !> STGSY2 also (IJOB >= 1) contributes to the computation in STGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of the matrix pair in
+     !> STGSYL. See STGSYL for details.
+
+     pure subroutine stdlib_stgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,iwork, pq, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info, pq
+           real(sp), intent(inout) :: rdscal, rdsum
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(sp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_scopy by calls to stdlib_slaset.
+        ! sven hammarling, 27/5/02.
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 8
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ie, ierr, ii, is, isp1, j, je, jj, js, jsp1, k, mb, nb, p, q, &
+                     zdim
+           real(sp) :: alpha, scaloc
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           real(sp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSY2', -info )
+              return
+           end if
+           ! determine block structure of a
+           pq = 0
+           p = 0
+           i = 1
+           10 continue
+           if( i>m )go to 20
+           p = p + 1
+           iwork( p ) = i
+           if( i==m )go to 20
+           if( a( i+1, i )/=zero ) then
+              i = i + 2
+           else
+              i = i + 1
+           end if
+           go to 10
+           20 continue
+           iwork( p+1 ) = m + 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           30 continue
+           if( j>n )go to 40
+           q = q + 1
+           iwork( q ) = j
+           if( j==n )go to 40
+           if( b( j+1, j )/=zero ) then
+              j = j + 2
+           else
+              j = j + 1
+           end if
+           go to 30
+           40 continue
+           iwork( q+1 ) = n + 1
+           pq = p*( q-p-1 )
+           if( notran ) then
+              ! solve (i, j) - subsystem
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+              scale = one
+              scaloc = one
+              loop_120: do j = p + 2, q
+                 js = iwork( j )
+                 jsp1 = js + 1
+                 je = iwork( j+1 ) - 1
+                 nb = je - js + 1
+                 loop_110: do i = p, 1, -1
+                    is = iwork( i )
+                    isp1 = is + 1
+                    ie = iwork( i+1 ) - 1
+                    mb = ie - is + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = d( is, is )
+                       z( 1, 2 ) = -b( js, js )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_slatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_saxpy( is-1, alpha, a( 1, is ), 1, c( 1, js ),1 )
+                          call stdlib_saxpy( is-1, alpha, d( 1, is ), 1, f( 1, js ),1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_saxpy( n-je, rhs( 2 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_saxpy( n-je, rhs( 2 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = d( is, is )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = -b( js, jsp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = -e( js, jsp1 )
+                       z( 1, 4 ) = -b( jsp1, js )
+                       z( 2, 4 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_slatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_sger( is-1, nb, -one, a( 1, is ), 1, rhs( 1 ),1, c( 1, js ),&
+                                     ldc )
+                          call stdlib_sger( is-1, nb, -one, d( 1, is ), 1, rhs( 1 ),1, f( 1, js ),&
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_saxpy( n-je, rhs( 3 ), b( js, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_saxpy( n-je, rhs( 3 ), e( js, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                          call stdlib_saxpy( n-je, rhs( 4 ), b( jsp1, je+1 ), ldb,c( is, je+1 ), &
+                                    ldc )
+                          call stdlib_saxpy( n-je, rhs( 4 ), e( jsp1, je+1 ), lde,f( is, je+1 ), &
+                                    ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 3, 1 ) = d( is, is )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = d( is, isp1 )
+                       z( 4, 2 ) = d( isp1, isp1 )
+                       z( 1, 3 ) = -b( js, js )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = -b( js, js )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_slatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_sgemv( 'N', is-1, mb, -one, a( 1, is ), lda,rhs( 1 ), 1, &
+                                    one, c( 1, js ), 1 )
+                          call stdlib_sgemv( 'N', is-1, mb, -one, d( 1, is ), ldd,rhs( 1 ), 1, &
+                                    one, f( 1, js ), 1 )
+                       end if
+                       if( j<q ) then
+                          call stdlib_sger( mb, n-je, one, rhs( 3 ), 1,b( js, je+1 ), ldb, c( is, &
+                                    je+1 ), ldc )
+                          call stdlib_sger( mb, n-je, one, rhs( 3 ), 1,e( js, je+1 ), lde, f( is, &
+                                    je+1 ), ldf )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z * x = rhs
+                       call stdlib_slaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( isp1, is )
+                       z( 5, 1 ) = d( is, is )
+                       z( 1, 2 ) = a( is, isp1 )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 5, 2 ) = d( is, isp1 )
+                       z( 6, 2 ) = d( isp1, isp1 )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( isp1, is )
+                       z( 7, 3 ) = d( is, is )
+                       z( 3, 4 ) = a( is, isp1 )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 7, 4 ) = d( is, isp1 )
+                       z( 8, 4 ) = d( isp1, isp1 )
+                       z( 1, 5 ) = -b( js, js )
+                       z( 3, 5 ) = -b( js, jsp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 7, 5 ) = -e( js, jsp1 )
+                       z( 2, 6 ) = -b( js, js )
+                       z( 4, 6 ) = -b( js, jsp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 8, 6 ) = -e( js, jsp1 )
+                       z( 1, 7 ) = -b( jsp1, js )
+                       z( 3, 7 ) = -b( jsp1, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 2, 8 ) = -b( jsp1, js )
+                       z( 4, 8 ) = -b( jsp1, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_scopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_scopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       if( ijob==0 ) then
+                          call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv,scaloc )
+                          if( scaloc/=one ) then
+                             do k = 1, n
+                                call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                                call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                             end do
+                             scale = scale*scaloc
+                          end if
+                       else
+                          call stdlib_slatdf( ijob, zdim, z, ldz, rhs, rdsum,rdscal, ipiv, jpiv )
+                                    
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_scopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_scopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_sgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, rhs( 1 &
+                                    ), mb, one,c( 1, js ), ldc )
+                          call stdlib_sgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, rhs( 1 &
+                                    ), mb, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          k = mb*nb + 1
+                          call stdlib_sgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, b( js, je+&
+                                    1 ), ldb, one,c( is, je+1 ), ldc )
+                          call stdlib_sgemm( 'N', 'N', mb, n-je, nb, one, rhs( k ),mb, e( js, je+&
+                                    1 ), lde, one,f( is, je+1 ), ldf )
+                       end if
+                    end if
+                 end do loop_110
+              end do loop_120
+           else
+              ! solve (i, j) - subsystem
+                   ! a(i, i)**t * r(i, j) + d(i, i)**t * l(j, j)  =  c(i, j)
+                   ! r(i, i)  * b(j, j) + l(i, j)  * e(j, j)  = -f(i, j)
+              ! for i = 1, 2, ..., p, j = q, q - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_200: do i = 1, p
+                 is = iwork( i )
+                 isp1 = is + 1
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_190: do j = q, p + 2, -1
+                    js = iwork( j )
+                    jsp1 = js + 1
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    zdim = mb*nb*2
+                    if( ( mb==1 ) .and. ( nb==1 ) ) then
+                       ! build a 2-by-2 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = -b( js, js )
+                       z( 1, 2 ) = d( is, is )
+                       z( 2, 2 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = f( is, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       f( is, js ) = rhs( 2 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          alpha = rhs( 1 )
+                          call stdlib_saxpy( js-1, alpha, b( 1, js ), 1, f( is, 1 ),ldf )
+                          alpha = rhs( 2 )
+                          call stdlib_saxpy( js-1, alpha, e( 1, js ), 1, f( is, 1 ),ldf )
+                       end if
+                       if( i<p ) then
+                          alpha = -rhs( 1 )
+                          call stdlib_saxpy( m-ie, alpha, a( is, ie+1 ), lda,c( ie+1, js ), 1 )
+                                    
+                          alpha = -rhs( 2 )
+                          call stdlib_saxpy( m-ie, alpha, d( is, ie+1 ), ldd,c( ie+1, js ), 1 )
+                                    
+                       end if
+                    else if( ( mb==1 ) .and. ( nb==2 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = zero
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = zero
+                       z( 2, 2 ) = a( is, is )
+                       z( 3, 2 ) = -b( js, jsp1 )
+                       z( 4, 2 ) = -b( jsp1, jsp1 )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = zero
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( is, is )
+                       z( 3, 4 ) = -e( js, jsp1 )
+                       z( 4, 4 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( is, jsp1 )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( is, jsp1 )
+                       ! solve z**t * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( is, jsp1 ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( is, jsp1 ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_saxpy( js-1, rhs( 1 ), b( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_saxpy( js-1, rhs( 2 ), b( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_saxpy( js-1, rhs( 3 ), e( 1, js ), 1,f( is, 1 ), ldf )
+                                    
+                          call stdlib_saxpy( js-1, rhs( 4 ), e( 1, jsp1 ), 1,f( is, 1 ), ldf )
+                                    
+                       end if
+                       if( i<p ) then
+                          call stdlib_sger( m-ie, nb, -one, a( is, ie+1 ), lda,rhs( 1 ), 1, c( ie+&
+                                    1, js ), ldc )
+                          call stdlib_sger( m-ie, nb, -one, d( is, ie+1 ), ldd,rhs( 3 ), 1, c( ie+&
+                                    1, js ), ldc )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==1 ) ) then
+                       ! build a 4-by-4 system z**t * x = rhs
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 3, 1 ) = -b( js, js )
+                       z( 4, 1 ) = zero
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 3, 2 ) = zero
+                       z( 4, 2 ) = -b( js, js )
+                       z( 1, 3 ) = d( is, is )
+                       z( 2, 3 ) = d( is, isp1 )
+                       z( 3, 3 ) = -e( js, js )
+                       z( 4, 3 ) = zero
+                       z( 1, 4 ) = zero
+                       z( 2, 4 ) = d( isp1, isp1 )
+                       z( 3, 4 ) = zero
+                       z( 4, 4 ) = -e( js, js )
+                       ! set up right hand side(s)
+                       rhs( 1 ) = c( is, js )
+                       rhs( 2 ) = c( isp1, js )
+                       rhs( 3 ) = f( is, js )
+                       rhs( 4 ) = f( isp1, js )
+                       ! solve z**t * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       c( is, js ) = rhs( 1 )
+                       c( isp1, js ) = rhs( 2 )
+                       f( is, js ) = rhs( 3 )
+                       f( isp1, js ) = rhs( 4 )
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_sger( mb, js-1, one, rhs( 1 ), 1, b( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                          call stdlib_sger( mb, js-1, one, rhs( 3 ), 1, e( 1, js ),1, f( is, 1 ), &
+                                    ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_sgemv( 'T', mb, m-ie, -one, a( is, ie+1 ),lda, rhs( 1 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                          call stdlib_sgemv( 'T', mb, m-ie, -one, d( is, ie+1 ),ldd, rhs( 3 ), 1, &
+                                    one, c( ie+1, js ),1 )
+                       end if
+                    else if( ( mb==2 ) .and. ( nb==2 ) ) then
+                       ! build an 8-by-8 system z**t * x = rhs
+                       call stdlib_slaset( 'F', ldz, ldz, zero, zero, z, ldz )
+                       z( 1, 1 ) = a( is, is )
+                       z( 2, 1 ) = a( is, isp1 )
+                       z( 5, 1 ) = -b( js, js )
+                       z( 7, 1 ) = -b( jsp1, js )
+                       z( 1, 2 ) = a( isp1, is )
+                       z( 2, 2 ) = a( isp1, isp1 )
+                       z( 6, 2 ) = -b( js, js )
+                       z( 8, 2 ) = -b( jsp1, js )
+                       z( 3, 3 ) = a( is, is )
+                       z( 4, 3 ) = a( is, isp1 )
+                       z( 5, 3 ) = -b( js, jsp1 )
+                       z( 7, 3 ) = -b( jsp1, jsp1 )
+                       z( 3, 4 ) = a( isp1, is )
+                       z( 4, 4 ) = a( isp1, isp1 )
+                       z( 6, 4 ) = -b( js, jsp1 )
+                       z( 8, 4 ) = -b( jsp1, jsp1 )
+                       z( 1, 5 ) = d( is, is )
+                       z( 2, 5 ) = d( is, isp1 )
+                       z( 5, 5 ) = -e( js, js )
+                       z( 2, 6 ) = d( isp1, isp1 )
+                       z( 6, 6 ) = -e( js, js )
+                       z( 3, 7 ) = d( is, is )
+                       z( 4, 7 ) = d( is, isp1 )
+                       z( 5, 7 ) = -e( js, jsp1 )
+                       z( 7, 7 ) = -e( jsp1, jsp1 )
+                       z( 4, 8 ) = d( isp1, isp1 )
+                       z( 6, 8 ) = -e( js, jsp1 )
+                       z( 8, 8 ) = -e( jsp1, jsp1 )
+                       ! set up right hand side(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_scopy( mb, c( is, js+jj ), 1, rhs( k ), 1 )
+                          call stdlib_scopy( mb, f( is, js+jj ), 1, rhs( ii ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! solve z**t * x = rhs
+                       call stdlib_sgetc2( zdim, z, ldz, ipiv, jpiv, ierr )
+                       if( ierr>0 )info = ierr
+                       call stdlib_sgesc2( zdim, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! unpack solution vector(s)
+                       k = 1
+                       ii = mb*nb + 1
+                       do jj = 0, nb - 1
+                          call stdlib_scopy( mb, rhs( k ), 1, c( is, js+jj ), 1 )
+                          call stdlib_scopy( mb, rhs( ii ), 1, f( is, js+jj ), 1 )
+                          k = k + mb
+                          ii = ii + mb
+                       end do
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( j>p+2 ) then
+                          call stdlib_sgemm( 'N', 'T', mb, js-1, nb, one,c( is, js ), ldc, b( 1, &
+                                    js ), ldb, one,f( is, 1 ), ldf )
+                          call stdlib_sgemm( 'N', 'T', mb, js-1, nb, one,f( is, js ), ldf, e( 1, &
+                                    js ), lde, one,f( is, 1 ), ldf )
+                       end if
+                       if( i<p ) then
+                          call stdlib_sgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( &
+                                    is, js ), ldc,one, c( ie+1, js ), ldc )
+                          call stdlib_sgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( &
+                                    is, js ), ldf,one, c( ie+1, js ), ldc )
+                       end if
+                    end if
+                 end do loop_190
+              end do loop_200
+           end if
+           return
+     end subroutine stdlib_stgsy2
+
+     !> STGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C                 (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with real entries. (A, D) and (B, E) must be in
+     !> generalized (real) Schur canonical form, i.e. A, B are upper quasi
+     !> triangular and D, E are upper triangular.
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve  Zx = scale b, where
+     !> Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**T, Im) ]         (2)
+     !> [ kron(In, D)  -kron(E**T, Im) ].
+     !> Here Ik is the identity matrix of size k and X**T is the transpose of
+     !> X. kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'T', STGSYL solves the transposed system Z**T*y = scale*b,
+     !> which is equivalent to solve for R and L in
+     !> A**T * R + D**T * L = scale * C           (3)
+     !> R * B**T + L * E**T = scale * -F
+     !> This case (TRANS = 'T') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using SLACON.
+     !> If IJOB >= 1, STGSYL computes a Frobenius norm-based estimate
+     !> of Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z. See [1-2] for more
+     !> information.
+     !> This is a level 3 BLAS algorithm.
+
+     pure subroutine stdlib_stgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           real(sp), intent(inout) :: c(ldc,*), f(ldf,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_scopy by calls to stdlib_slaset.
+        ! sven hammarling, 1/5/02.
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, ppqq, pq, q
+           real(sp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: max,real,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'STGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'STGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_slaset( 'F', m, n, zero, zero, c, ldc )
+                 call stdlib_slaset( 'F', m, n, zero, zero, f, ldf )
+              else if( ijob>=1 .and. notran ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              loop_30: do iround = 1, isolve
+                 ! use unblocked level 2 solver
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 call stdlib_stgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,iwork, pq, info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_slacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_slacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_slaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_slaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_slacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_slacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           if( a( i, i-1 )/=zero )i = i + 1
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           if( b( j, j-1 )/=zero )j = j + 1
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j)-subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1,..., 1; j = 1, 2,..., q
+                 dscale = zero
+                 dsum = one
+                 pq = 0
+                 scale = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       ppqq = 0
+                       call stdlib_stgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + ppqq
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_sscal( is-1, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( is-1, scaloc, f( 1, k ), 1 )
+                          end do
+                          do k = js, je
+                             call stdlib_sscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                             call stdlib_sscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                             call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i, j) and l(i, j) into remaining
+                       ! equation.
+                       if( i>1 ) then
+                          call stdlib_sgemm( 'N', 'N', is-1, nb, mb, -one,a( 1, is ), lda, c( is, &
+                                    js ), ldc, one,c( 1, js ), ldc )
+                          call stdlib_sgemm( 'N', 'N', is-1, nb, mb, -one,d( 1, is ), ldd, c( is, &
+                                    js ), ldc, one,f( 1, js ), ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_sgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, b( js, &
+                                    je+1 ), ldb,one, c( is, je+1 ), ldc )
+                          call stdlib_sgemm( 'N', 'N', mb, n-je, nb, one,f( is, js ), ldf, e( js, &
+                                    je+1 ), lde,one, f( is, je+1 ), ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=sp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_slacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_slacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_slaset( 'F', m, n, zero, zero, c, ldc )
+                    call stdlib_slaset( 'F', m, n, zero, zero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_slacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_slacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                   ! a(i, i)**t * r(i, j)  + d(i, i)**t * l(i, j)  =  c(i, j)
+                   ! r(i, j)  * b(j, j)**t + l(i, j)  * e(j, j)**t = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_stgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,iwork( q+2 ), ppqq, linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_sscal( is-1, scaloc, c( 1, k ), 1 )
+                          call stdlib_sscal( is-1, scaloc, f( 1, k ), 1 )
+                       end do
+                       do k = js, je
+                          call stdlib_sscal( m-ie, scaloc, c( ie+1, k ), 1 )
+                          call stdlib_sscal( m-ie, scaloc, f( ie+1, k ), 1 )
+                       end do
+                       do k = je + 1, n
+                          call stdlib_sscal( m, scaloc, c( 1, k ), 1 )
+                          call stdlib_sscal( m, scaloc, f( 1, k ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_sgemm( 'N', 'T', mb, js-1, nb, one, c( is, js ),ldc, b( 1, js )&
+                                 , ldb, one, f( is, 1 ),ldf )
+                       call stdlib_sgemm( 'N', 'T', mb, js-1, nb, one, f( is, js ),ldf, e( 1, js )&
+                                 , lde, one, f( is, 1 ),ldf )
+                    end if
+                    if( i<p ) then
+                       call stdlib_sgemm( 'T', 'N', m-ie, nb, mb, -one,a( is, ie+1 ), lda, c( is, &
+                                 js ), ldc, one,c( ie+1, js ), ldc )
+                       call stdlib_sgemm( 'T', 'N', m-ie, nb, mb, -one,d( is, ie+1 ), ldd, f( is, &
+                                 js ), ldf, one,c( ie+1, js ), ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_stgsyl
+
+     !> STPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_stpcon( norm, uplo, diag, n, ap, rcond, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: ap(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=sp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_slantp( norm, uplo, diag, n, ap, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_slatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_slatps( uplo, 'TRANSPOSE', diag, normin, n, ap,work, scale, work( &
+                              2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_isamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_srscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_stpcon
+
+     !> STPLQT2: computes a LQ a factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_stplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_slarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              if( i<m ) then
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_sgemv( 'N', m-i, p, one, b( i+1, 1 ), ldb,b( i, 1 ), ldb, one, t( m, &
+                           1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_sger( m-i, p, alpha,  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n) * (alpha * c(i,i:n)^h)
+              alpha = -t( 1, i )
+              do j = 1, i-1
+                 t( i, j ) = zero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = alpha*b( i, n-l+j )
+              end do
+              call stdlib_strmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_sgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, zero, t(&
+                         i,mp ), ldt )
+              ! b1
+              call stdlib_sgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,one, t( i, 1 ), ldt &
+                        )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+             call stdlib_strmv( 'L', 'T', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = zero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=t(j,i)
+                 t(j,i)= zero
+              end do
+           end do
+     end subroutine stdlib_stplqt2
+
+     !> STPMLQT: applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_stpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           real(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_stprfb( 'L', 'T', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_stprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_stprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_stprfb( 'R', 'T', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_stpmlqt
+
+     !> STPMQRT: applies a real orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" real block reflector H to a general
+     !> real matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_stpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           real(sp), intent(in) :: v(ldv,*), t(ldt,*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'T' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_stprfb( 'L', 'T', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_stprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_stprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_stprfb( 'R', 'T', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_stpmqrt
+
+     !> STPQRT2: computes a QR factorization of a real "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_stpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           real(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_slarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)^h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = (a( i, i+j ))
+                 end do
+                 call stdlib_sgemv( 'T', p, n-i, one, b( 1, i+1 ), ldb,b( 1, i ), 1, one, t( 1, n &
+                           ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*(t( j, n ))
+                 end do
+                 call stdlib_sger( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)^h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = zero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_strmv( 'U', 'T', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_sgemv( 'T', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, zero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_sgemv( 'T', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,one, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_strmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = zero
+           end do
+     end subroutine stdlib_stpqrt2
+
+     !> STRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_strcon( norm, uplo, diag, n, a, lda, rcond, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(sp) :: ainvnm, anorm, scale, smlnum, xnorm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_slamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=sp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_slantr( norm, uplo, diag, n, n, a, lda, work )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_slatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               work( 2*n+1 ), info )
+                 else
+                    ! multiply by inv(a**t).
+                    call stdlib_slatrs( uplo, 'TRANSPOSE', diag, normin, n, a, lda,work, scale, &
+                              work( 2*n+1 ), info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_isamax( n, work, 1 )
+                    xnorm = abs( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_srscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_strcon
+
+     !> STZRZF: reduces the M-by-N ( M<=N ) real upper trapezoidal matrix A
+     !> to upper triangular form by means of orthogonal transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N orthogonal matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_stzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'SGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = zero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_slatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_slarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_slarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_slatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_stzrzf
+
+     !> SGBSV: computes the solution to a real system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_sgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_sgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_sgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_sgbsv
+
+     !> SGBSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_sgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), c(*), r(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+        ! moved setting of info = n+1 so info does not subsequently get
+        ! overwritten.  sven, 17 mar 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(sp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_sgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_slaqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_scopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_sgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_slantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_slangb( norm, n, kl, ku, ab, ldab, work )
+           rpvgrw = stdlib_slantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_slangb( 'M', n, kl, ku, ab, ldab, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, iwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_sgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_sgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_sgbsvx
+
+     !> SGEBAL: balances a general real matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_sgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: scale(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: sclfac = 2.0e+0_sp
+           real(sp), parameter :: factor = 0.95e+0_sp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(sp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_sswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_sswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( a( j, i )/=zero )cycle loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( a( i, j )/=zero )cycle loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_snrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_snrm2( l-k+1, a( i, k ), lda )
+              ica = stdlib_isamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_isamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+                 if( stdlib_sisnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'SGEBAL', -info )
+                 return
+              end if
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_sscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_sscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_sgebal
+
+     !> SGEBD2: reduces a real general m by n matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_sgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'SGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 call stdlib_slarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_slarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tauq( i ),a( i, i+&
+                           1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_slarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_slarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = zero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_slarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 a( i, i ) = one
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_slarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    call stdlib_slarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! apply h(i) to a(i+1:m,i+1:n) from the left
+                    call stdlib_slarf( 'LEFT', m-i, n-i, a( i+1, i ), 1, tauq( i ),a( i+1, i+1 ), &
+                              lda, work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = zero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sgebd2
+
+     !> SGEHD2: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_sgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              call stdlib_slarfg( ihi-i, a( i+1, i ), a( min( i+2, n ), i ), 1,tau( i ) )
+              aii = a( i+1, i )
+              a( i+1, i ) = one
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_slarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i) to a(i+1:ihi,i+1:n) from the left
+              call stdlib_slarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1, tau( i ),a( i+1, i+1 ), lda, &
+                        work )
+              a( i+1, i ) = aii
+           end do
+           return
+     end subroutine stdlib_sgehd2
+
+     !> SGELQ2: computes an LQ factorization of a real m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_sgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_slarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_slarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_sgelq2
+
+     !> SGELQF: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_sgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_sgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_slarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_slarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_sgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgelqf
+
+     !> SGELQT3: recursively computes a LQ factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_sgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_slarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_sgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) = q1^h a(j1:m,1:n) [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', 'T', 'U', m2, m1, one,a, lda, t( i1, 1 ), ldt )
+              call stdlib_sgemm( 'N', 'T', m2, m1, n-m1, one, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        one, t( i1, 1 ), ldt)
+              call stdlib_strmm( 'R', 'U', 'N', 'N', m2, m1, one,t, ldt, t( i1, 1 ), ldt )
+              call stdlib_sgemm( 'N', 'N', m2, n-m1, m1, -one, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        one, a( i1, i1 ), lda )
+              call stdlib_strmm( 'R', 'U', 'N', 'U', m2, m1 , one,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )=0
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_sgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'U', 'T', 'U', m1, m2, one,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_sgemm( 'N', 'T', m1, m2, n-m, one, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, i1 ), ldt )
+              call stdlib_strmm( 'L', 'U', 'N', 'N', m1, m2, -one, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_strmm( 'R', 'U', 'N', 'N', m1, m2, one,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_sgelqt3
+
+     !> SGEQL2: computes a QL factorization of a real m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_sgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              call stdlib_slarfg( m-k+i, a( m-k+i, n-k+i ), a( 1, n-k+i ), 1,tau( i ) )
+              ! apply h(i) to a(1:m-k+i,1:n-k+i-1) from the left
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_slarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1, tau( i ),a, lda, work )
+                        
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_sgeql2
+
+     !> SGEQLF: computes a QL factorization of a real M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_sgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'SGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_sgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_slarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**t to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_slarfb( 'LEFT', 'TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-1, &
+                    n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_sgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgeqlf
+
+     !> SGEQR2: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_sgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_slarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_slarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_sgeqr2
+
+     !> SGEQR2P: computes a QR factorization of a real m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_sgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_slarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 call stdlib_slarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+                 a( i, i ) = aii
+              end if
+           end do
+           return
+     end subroutine stdlib_sgeqr2p
+
+     !> SGEQRF: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_sgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_sgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_slarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_slarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_sgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgeqrf
+
+     !> SGEQR2P computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_sgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_sgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_slarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**t to a(i:m,i+ib:n) from the left
+                    call stdlib_slarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-i-&
+                    ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), ldwork &
+                              )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_sgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgeqrfp
+
+     !> SGEQRT2: computes a QR factorization of a real M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_sgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_slarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = one
+                 ! w(1:n-i) := a(i:m,i+1:n)^h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_sgemv( 'T',m-i+1, n-i, one, a( i, i+1 ), lda,a( i, i ), 1, zero, t( &
+                           1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)^h
+                 alpha = -(t( i, 1 ))
+                 call stdlib_sger( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), lda &
+                           )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = one
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**t * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_sgemv( 'T', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, zero, t( 1, &
+                        i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_strmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = zero
+           end do
+     end subroutine stdlib_sgeqrt2
+
+     !> SGEQRT3: recursively computes a QR factorization of a real M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_sgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_slarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_sgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1^h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_strmm( 'L', 'L', 'T', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              call stdlib_sgemm( 'T', 'N', n1, n2, m-n1, one, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt)
+              call stdlib_strmm( 'L', 'U', 'T', 'N', n1, n2, one,t, ldt, t( 1, j1 ), ldt )
+              call stdlib_sgemm( 'N', 'N', m-n1, n2, n1, -one, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        one, a( j1, j1 ), lda )
+              call stdlib_strmm( 'L', 'L', 'N', 'U', n1, n2, one,a, lda, t( 1, j1 ), ldt )
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_sgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1^h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = (a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_strmm( 'R', 'L', 'N', 'U', n1, n2, one,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_sgemm( 'T', 'N', n1, n2, m-n, one, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        one, t( 1, j1 ), ldt )
+              call stdlib_strmm( 'L', 'U', 'N', 'N', n1, n2, -one, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_strmm( 'R', 'U', 'N', 'N', n1, n2, one,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_sgeqrt3
+
+     !> SGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_sgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transt = 'T'
+           else
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_sgemv( trans, n, n, -one, a, lda, x( 1, j ), 1, one,work( n+1 ), 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 work( i ) = abs( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = abs( x( k, j ) )
+                    do i = 1, n
+                       work( i ) = work( i ) + abs( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + abs( a( i, k ) )*abs( x( i, j ) )
+                    end do
+                    work( k ) = work( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_sgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_sgetrs( transt, n, 1, af, ldaf, ipiv, work( n+1 ),n, info )
+                              
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_sgetrs( trans, n, 1, af, ldaf, ipiv, work( n+1 ), n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_sgerfs
+
+     !> SGERQ2: computes an RQ factorization of a real m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_sgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_slarfg( n-k+i, a( m-k+i, n-k+i ), a( m-k+i, 1 ), lda,tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              aii = a( m-k+i, n-k+i )
+              a( m-k+i, n-k+i ) = one
+              call stdlib_slarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = aii
+           end do
+           return
+     end subroutine stdlib_sgerq2
+
+     !> SGERQF: computes an RQ factorization of a real M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_sgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'SGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'SGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_sgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_slarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_slarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_sgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgerqf
+
+     !> SGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_sgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'SGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_sgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_sgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_slaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_slaswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_strsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, one, &
+                              a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       one, a( j+jb, j ), lda,a( j, j+jb ), lda, one, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sgetrf
+
+     !> SGGHD3: reduces a pair of real matrices (A,B) to generalized upper
+     !> Hessenberg form using orthogonal transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the orthogonal matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**T*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**T*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**T*x.
+     !> The orthogonal matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**T = (Q1*Q) * H * (Z1*Z)**T
+     !> Q1 * B * Z1**T = (Q1*Q) * T * (Z1*Z)**T
+     !> If Q1 is the orthogonal matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then SGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of SGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_sgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(sp) :: c, c1, c2, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'SGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = real( lwkopt,KIND=sp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_slaset( 'ALL', n, n, zero, one, q, ldq )
+           if( initz )call stdlib_slaset( 'ALL', n, n, zero, one, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_slaset( 'LOWER', n-1, n-1, zero, zero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = one
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'SGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'SGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'SGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small orthogonal factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_slaset( 'ALL', nblst, nblst, zero, one, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_slaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_slartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = c
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       c = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = c*temp - s*work( jj )
+                          work( jj ) = s*temp + c*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          c = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = c*temp - s*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + c*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_slartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = zero
+                          call stdlib_srot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = c
+                          b( jj+1, j ) = -s
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    ! explicit loop unrolling provides better performance
+                    ! compared to stdlib_slasr.
+                     ! call stdlib_slasr( 'right', 'variable', 'backward', ihi-top,
+           ! $                     ihi-j, a( j+2, j ), b( j+2, j ),
+           ! $                     a( top+1, j+1 ), lda )
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       c = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + s2*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + s1*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = c*temp1 + s*temp
+                          a( k, j+i ) = -s*temp1 + c*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          call stdlib_srot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, a( &
+                                    j+1+i, j ),-b( j+1+i, j ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated orthogonal
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_sgemv( 'TRANSPOSE', nblst, len, one, work,nblst, a( jrow, j+1 )&
+                                 , 1, zero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_strmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_sgemv( 'TRANSPOSE', len, nblst-len, one,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, one,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated orthogonal
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_strmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_strmv( 'LOWER', 'TRANSPOSE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_sgemv( 'TRANSPOSE', nnb, len, one,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,one, work( pw ), 1 )
+                          call stdlib_sgemv( 'TRANSPOSE', len, nnb, one,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, one,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated orthogonal matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', nblst,cola, nblst, one, work, &
+                           nblst,a( j, jcol+nnb ), lda, zero, work( pw ),nblst )
+                 call stdlib_slacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_sorm22( 'LEFT', 'TRANSPOSE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, one, &
+                                 work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, zero, work( pw ),2*nnb )
+                       call stdlib_slacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated orthogonal matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, q( &
+                              topq, j ), ldq,work, nblst, zero, work( pw ), nh )
+                    call stdlib_slacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_sorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     q( topq, j ), ldq,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_slacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small orthogonal factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_slaset( 'ALL', nblst, nblst, zero, one, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_slaset( 'ALL', 2*nnb, 2*nnb, zero, one,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          c = a( i, j )
+                          a( i, j ) = zero
+                          s = b( i, j )
+                          b( i, j ) = zero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = c*temp - s*work( jj )
+                             work( jj ) = s*temp + c*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             c = a( i, j )
+                             a( i, j ) = zero
+                             s = b( i, j )
+                             b( i, j ) = zero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = c*temp - s*work( jj )
+                                work( jj ) = s*temp + c*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_slaset( 'LOWER', ihi - jcol - 1, nnb, zero, zero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated orthogonal matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, a( &
+                              1, j ), lda,work, nblst, zero, work( pw ), top )
+                    call stdlib_slacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_sorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, a( 1, j ), lda,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_slacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, one, b( &
+                              1, j ), ldb,work, nblst, zero, work( pw ), top )
+                    call stdlib_slacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_sorm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                                    one, b( 1, j ), ldb,work( ppwo ), 2*nnb, zero,work( pw ), top )
+                          call stdlib_slacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated orthogonal matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, one, z( &
+                              topq, j ), ldz,work, nblst, zero, work( pw ), nh )
+                    call stdlib_slacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_sorm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, one,&
+                                     z( topq, j ), ldz,work( ppwo ), 2*nnb, zero, work( pw ),nh )
+                          call stdlib_slacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_sgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = real( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_sgghd3
+
+     !> SGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**T*(inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_sggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'SGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'SGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'SORMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_sgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := q**t*b.
+           call stdlib_sormqr( 'LEFT', 'TRANSPOSE', n, p, min( n, m ), a, lda, taua,b, ldb, work, &
+                     lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_sgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_sggqrf
+
+     !> SGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N orthogonal matrix, Z is a P-by-P orthogonal
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**T
+     !> where inv(B) denotes the inverse of the matrix B, and Z**T denotes the
+     !> transpose of the matrix Z.
+
+     pure subroutine stdlib_sggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'SGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'SGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'SORMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p)*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_sgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = work( 1 )
+           ! update b := b*q**t
+           call stdlib_sormrq( 'RIGHT', 'TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 ), 1 ), &
+                     lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_sgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_sggrqf
+
+     !> SGTCON: estimates the reciprocal of the condition number of a real
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> SGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_sgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(sp), intent(in) :: anorm
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(sp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==zero )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_slacn2( n, work( n+1 ), work, iwork, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_sgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**t)*inv(u**t).
+                 call stdlib_sgttrs( 'TRANSPOSE', n, 1, dl, d, du, du2, ipiv, work,n, info )
+                           
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_sgtcon
+
+     !> SGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_sgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(sp) :: eps, lstres, s, safe1, safe2, safmin
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'T'
+           else
+              transn = 'T'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_scopy( n, b( 1, j ), 1, work( n+1 ), 1 )
+              call stdlib_slagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work( n+1 ), &
+                        n )
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( du( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( dl( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( du( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( dl( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) )
+                 else
+                    work( 1 ) = abs( b( 1, j ) ) + abs( d( 1 )*x( 1, j ) ) +abs( dl( 1 )*x( 2, j )&
+                               )
+                    do i = 2, n - 1
+                       work( i ) = abs( b( i, j ) ) +abs( du( i-1 )*x( i-1, j ) ) +abs( d( i )*x( &
+                                 i, j ) ) +abs( dl( i )*x( i+1, j ) )
+                    end do
+                    work( n ) = abs( b( n, j ) ) +abs( du( n-1 )*x( n-1, j ) ) +abs( d( n )*x( n, &
+                              j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_sgttrs( trans, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, info )
+                           
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_slacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_slacn2( n, work( 2*n+1 ), work( n+1 ), iwork, ferr( j ),kase, isave )
+                        
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**t).
+                    call stdlib_sgttrs( transt, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( n+i ) = work( i )*work( n+i )
+                    end do
+                    call stdlib_sgttrs( transn, n, 1, dlf, df, duf, du2, ipiv,work( n+1 ), n, &
+                              info )
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_sgtrfs
+
+     !> SGTSVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations A * X = B or A**T * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_sgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(sp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_scopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_scopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_scopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_sgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_slangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,iwork, info )
+                     
+           ! compute the solution vectors x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_sgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_sgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_sgtsvx
+
+     !> SHGEQZ: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by SGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from SGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_shgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alphar, alphai, &
+               beta, q, ldq, z, ldz, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), work(*)
+           real(sp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: safety = 1.0e+2_sp
+          ! $                     safety = one )
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilpivt, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(sp) :: a11, a12, a1i, a1r, a21, a22, a2i, a2r, ad11, ad11l, ad12, ad12l, ad21, &
+           ad21l, ad22, ad22l, ad32l, an, anorm, ascale, atol, b11, b1a, b1i, b1r, b22, b2a, b2i, &
+           b2r, bn, bnorm, bscale, btol, c, c11i, c11r, c12, c21, c22i, c22r, cl, cq, cr, cz, &
+           eshift, s, s1, s1inv, s2, safmax, safmin, scale, sl, sqi, sqr, sr, szi, szr, t1, tau, &
+           temp, temp2, tempi, tempr, u1, u12, u12l, u2, ulp, vs, w11, w12, w21, w22, wabs, wi, &
+                     wr, wr2
+           ! Local Arrays 
+           real(sp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=sp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_slaset( 'FULL', n, n, zero, one, q, ldq )
+           if( icompz==3 )call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           ulp = stdlib_slamch( 'E' )*stdlib_slamch( 'B' )
+           anorm = stdlib_slanhs( 'F', in, h( ilo, ilo ), ldh, work )
+           bnorm = stdlib_slanhs( 'F', in, t( ilo, ilo ), ldt, work )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 380
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever.
+              ! row operations modify columns whatever:ilastm.
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = zero
+           maxit = 30*( ihi-ilo+1 )
+           loop_360: do jiter = 1, maxit
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              if( ilast==ilo ) then
+                 ! special case: j=ilast
+                 go to 80
+              else
+                 if( abs( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs( h( ilast, ilast ) ) + abs(&
+                            h( ilast-1, ilast-1 ) )) ) ) then
+                    h( ilast, ilast-1 ) = zero
+                    go to 80
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = zero
+                 go to 70
+              end if
+              ! general case: j<ilast
+              loop_60: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs( h( j, j-1 ) )<=max( safmin, ulp*(abs( h( j, j ) ) + abs( h( j-1, j-1 &
+                              ) )) ) ) then
+                       h( j, j-1 ) = zero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = zero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       temp = abs( h( j, j-1 ) )
+                       temp2 = abs( h( j, j ) )
+                       tempr = max( temp, temp2 )
+                       if( tempr<one .and. tempr/=zero ) then
+                          temp = temp / tempr
+                          temp2 = temp2 / tempr
+                       end if
+                       if( temp*( ascale*abs( h( j+1, j ) ) )<=temp2*( ascale*atol ) )ilazr2 = &
+                                 .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          temp = h( jch, jch )
+                          call stdlib_slartg( temp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = zero
+                          call stdlib_srot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_srot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_srot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 80
+                             else
+                                ifirst = jch + 1
+                                go to 110
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = zero
+                       end do
+                       go to 70
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          temp = t( jch, jch+1 )
+                          call stdlib_slartg( temp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = zero
+                          if( jch<ilastm-1 )call stdlib_srot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_srot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_srot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, s )
+                                    
+                          temp = h( jch+1, jch )
+                          call stdlib_slartg( temp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = zero
+                          call stdlib_srot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_srot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_srot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 70
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 110
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_60
+              ! (drop-through is "impossible")
+              info = n + 1
+              go to 420
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              70 continue
+              temp = h( ilast, ilast )
+              call stdlib_slartg( temp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = zero
+              call stdlib_srot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_srot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_srot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alphar, alphai,
+                                    ! and beta
+                                    80 continue
+              if( t( ilast, ilast )<zero ) then
+                 if( ilschr ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                 else
+                    h( ilast, ilast ) = -h( ilast, ilast )
+                    t( ilast, ilast ) = -t( ilast, ilast )
+                 end if
+                 if( ilz ) then
+                    do j = 1, n
+                       z( j, ilast ) = -z( j, ilast )
+                    end do
+                 end if
+              end if
+              alphar( ilast ) = h( ilast, ilast )
+              alphai( ilast ) = zero
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 380
+              ! reset counters
+              iiter = 0
+              eshift = zero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 350
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast. we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              110 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute single shifts.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10==iiter ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 ! (single shift only.)
+                 if( ( real( maxit,KIND=sp)*safmin )*abs( h( ilast, ilast-1 ) )<abs( t( ilast-1, &
+                           ilast-1 ) ) ) then
+                    eshift = h( ilast, ilast-1 ) /t( ilast-1, ilast-1 )
+                 else
+                    eshift = eshift + one / ( safmin*real( maxit,KIND=sp) )
+                 end if
+                 s1 = one
+                 wr = eshift
+              else
+                 ! shifts based on the generalized eigenvalues of the
+                 ! bottom-right 2x2 block of a and b. the first eigenvalue
+                 ! returned by stdlib_slag2 is the wilkinson shift (aep p.512_sp),
+                 call stdlib_slag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,s2, wr, wr2, wi )
+                 if ( abs( (wr/s1)*t( ilast, ilast ) - h( ilast, ilast ) )> abs( (wr2/s2)*t( &
+                           ilast, ilast )- h( ilast, ilast ) ) ) then
+                    temp = wr
+                    wr = wr2
+                    wr2 = temp
+                    temp = s1
+                    s1 = s2
+                    s2 = temp
+                 end if
+                 temp = max( s1, safmin*max( one, abs( wr ), abs( wi ) ) )
+                 if( wi/=zero )go to 200
+              end if
+              ! fiddle with shift to avoid overflow
+              temp = min( ascale, one )*( half*safmax )
+              if( s1>temp ) then
+                 scale = temp / s1
+              else
+                 scale = one
+              end if
+              temp = min( bscale, one )*( half*safmax )
+              if( abs( wr )>temp )scale = min( scale, temp / abs( wr ) )
+              s1 = scale*s1
+              wr = scale*wr
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 temp = abs( s1*h( j, j-1 ) )
+                 temp2 = abs( s1*h( j, j )-wr*t( j, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs( ( ascale*h( j+1, j ) )*temp )<=( ascale*atol )*temp2 )go to 130
+              end do
+              istart = ifirst
+              130 continue
+              ! do an implicit single-shift qz sweep.
+              ! initial q
+              temp = s1*h( istart, istart ) - wr*t( istart, istart )
+              temp2 = s1*h( istart+1, istart )
+              call stdlib_slartg( temp, temp2, c, s, tempr )
+              ! sweep
+              loop_190: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    temp = h( j, j-1 )
+                    call stdlib_slartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = zero
+                 end if
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 temp = t( j+1, j+1 )
+                 call stdlib_slartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, min( j+2, ilast )
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, j
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+              end do loop_190
+              go to 350
+              ! use francis double-shift
+              ! note: the francis double-shift should work with real shifts,
+                    ! but only if the block is at least 3x3.
+                    ! this code may break if this point is reached with
+                    ! a 2x2 block with real eigenvalues.
+                    200 continue
+              if( ifirst+1==ilast ) then
+                 ! special case -- 2x2 block with complex eigenvectors
+                 ! step 1: standardize, that is, rotate so that
+                             ! ( b11  0  )
+                         ! b = (         )  with b11 non-negative.
+                             ! (  0  b22 )
+                 call stdlib_slasv2( t( ilast-1, ilast-1 ), t( ilast-1, ilast ),t( ilast, ilast ),&
+                            b22, b11, sr, cr, sl, cl )
+                 if( b11<zero ) then
+                    cr = -cr
+                    sr = -sr
+                    b11 = -b11
+                    b22 = -b22
+                 end if
+                 call stdlib_srot( ilastm+1-ifirst, h( ilast-1, ilast-1 ), ldh,h( ilast, ilast-1 )&
+                           , ldh, cl, sl )
+                 call stdlib_srot( ilast+1-ifrstm, h( ifrstm, ilast-1 ), 1,h( ifrstm, ilast ), 1, &
+                           cr, sr )
+                 if( ilast<ilastm )call stdlib_srot( ilastm-ilast, t( ilast-1, ilast+1 ), ldt,t( &
+                           ilast, ilast+1 ), ldt, cl, sl )
+                 if( ifrstm<ilast-1 )call stdlib_srot( ifirst-ifrstm, t( ifrstm, ilast-1 ), 1,t( &
+                           ifrstm, ilast ), 1, cr, sr )
+                 if( ilq )call stdlib_srot( n, q( 1, ilast-1 ), 1, q( 1, ilast ), 1, cl,sl )
+                           
+                 if( ilz )call stdlib_srot( n, z( 1, ilast-1 ), 1, z( 1, ilast ), 1, cr,sr )
+                           
+                 t( ilast-1, ilast-1 ) = b11
+                 t( ilast-1, ilast ) = zero
+                 t( ilast, ilast-1 ) = zero
+                 t( ilast, ilast ) = b22
+                 ! if b22 is negative, negate column ilast
+                 if( b22<zero ) then
+                    do j = ifrstm, ilast
+                       h( j, ilast ) = -h( j, ilast )
+                       t( j, ilast ) = -t( j, ilast )
+                    end do
+                    if( ilz ) then
+                       do j = 1, n
+                          z( j, ilast ) = -z( j, ilast )
+                       end do
+                    end if
+                    b22 = -b22
+                 end if
+                 ! step 2: compute alphar, alphai, and beta (see refs.)
+                 ! recompute shift
+                 call stdlib_slag2( h( ilast-1, ilast-1 ), ldh,t( ilast-1, ilast-1 ), ldt, &
+                           safmin*safety, s1,temp, wr, temp2, wi )
+                 ! if standardization has perturbed the shift onto real line,
+                 ! do another (real single-shift) qr step.
+                 if( wi==zero )go to 350
+                 s1inv = one / s1
+                 ! do eispack (qzval) computation of alpha and beta
+                 a11 = h( ilast-1, ilast-1 )
+                 a21 = h( ilast, ilast-1 )
+                 a12 = h( ilast-1, ilast )
+                 a22 = h( ilast, ilast )
+                 ! compute complex givens rotation on right
+                 ! (assume some element of c = (sa - wb) > unfl )
+                                  ! __
+                 ! (sa - wb) ( cz   -sz )
+                           ! ( sz    cz )
+                 c11r = s1*a11 - wr*b11
+                 c11i = -wi*b11
+                 c12 = s1*a12
+                 c21 = s1*a21
+                 c22r = s1*a22 - wr*b22
+                 c22i = -wi*b22
+                 if( abs( c11r )+abs( c11i )+abs( c12 )>abs( c21 )+abs( c22r )+abs( c22i ) ) &
+                           then
+                    t1 = stdlib_slapy3( c12, c11r, c11i )
+                    cz = c12 / t1
+                    szr = -c11r / t1
+                    szi = -c11i / t1
+                 else
+                    cz = stdlib_slapy2( c22r, c22i )
+                    if( cz<=safmin ) then
+                       cz = zero
+                       szr = one
+                       szi = zero
+                    else
+                       tempr = c22r / cz
+                       tempi = c22i / cz
+                       t1 = stdlib_slapy2( cz, c21 )
+                       cz = cz / t1
+                       szr = -c21*tempr / t1
+                       szi = c21*tempi / t1
+                    end if
+                 end if
+                 ! compute givens rotation on left
+                 ! (  cq   sq )
+                 ! (  __      )  a or b
+                 ! ( -sq   cq )
+                 an = abs( a11 ) + abs( a12 ) + abs( a21 ) + abs( a22 )
+                 bn = abs( b11 ) + abs( b22 )
+                 wabs = abs( wr ) + abs( wi )
+                 if( s1*an>wabs*bn ) then
+                    cq = cz*b11
+                    sqr = szr*b22
+                    sqi = -szi*b22
+                 else
+                    a1r = cz*a11 + szr*a12
+                    a1i = szi*a12
+                    a2r = cz*a21 + szr*a22
+                    a2i = szi*a22
+                    cq = stdlib_slapy2( a1r, a1i )
+                    if( cq<=safmin ) then
+                       cq = zero
+                       sqr = one
+                       sqi = zero
+                    else
+                       tempr = a1r / cq
+                       tempi = a1i / cq
+                       sqr = tempr*a2r + tempi*a2i
+                       sqi = tempi*a2r - tempr*a2i
+                    end if
+                 end if
+                 t1 = stdlib_slapy3( cq, sqr, sqi )
+                 cq = cq / t1
+                 sqr = sqr / t1
+                 sqi = sqi / t1
+                 ! compute diagonal elements of qbz
+                 tempr = sqr*szr - sqi*szi
+                 tempi = sqr*szi + sqi*szr
+                 b1r = cq*cz*b11 + tempr*b22
+                 b1i = tempi*b22
+                 b1a = stdlib_slapy2( b1r, b1i )
+                 b2r = cq*cz*b22 + tempr*b11
+                 b2i = -tempi*b11
+                 b2a = stdlib_slapy2( b2r, b2i )
+                 ! normalize so beta > 0, and im( alpha1 ) > 0
+                 beta( ilast-1 ) = b1a
+                 beta( ilast ) = b2a
+                 alphar( ilast-1 ) = ( wr*b1a )*s1inv
+                 alphai( ilast-1 ) = ( wi*b1a )*s1inv
+                 alphar( ilast ) = ( wr*b2a )*s1inv
+                 alphai( ilast ) = -( wi*b2a )*s1inv
+                 ! step 3: go to next block -- exit if finished.
+                 ilast = ifirst - 1
+                 if( ilast<ilo )go to 380
+                 ! reset counters
+                 iiter = 0
+                 eshift = zero
+                 if( .not.ilschr ) then
+                    ilastm = ilast
+                    if( ifrstm>ilast )ifrstm = ilo
+                 end if
+                 go to 350
+              else
+                 ! usual case: 3x3 or larger block, using francis implicit
+                             ! double-shift
+                                          ! 2
+                 ! eigenvalue equation is  w  - c w + d = 0,
+                                               ! -1 2        -1
+                 ! so compute 1st column of  (a b  )  - c a b   + d
+                 ! using the formula in qzit (from eispack)
+                 ! we assume that the block is at least 3x3
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 u12 = t( ilast-1, ilast ) / t( ilast, ilast )
+                 ad11l = ( ascale*h( ifirst, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad21l = ( ascale*h( ifirst+1, ifirst ) ) /( bscale*t( ifirst, ifirst ) )
+                 ad12l = ( ascale*h( ifirst, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad22l = ( ascale*h( ifirst+1, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 ad32l = ( ascale*h( ifirst+2, ifirst+1 ) ) /( bscale*t( ifirst+1, ifirst+1 ) )
+                           
+                 u12l = t( ifirst, ifirst+1 ) / t( ifirst+1, ifirst+1 )
+                 v( 1 ) = ( ad11-ad11l )*( ad22-ad11l ) - ad12*ad21 +ad21*u12*ad11l + ( ad12l-&
+                           ad11l*u12l )*ad21l
+                 v( 2 ) = ( ( ad22l-ad11l )-ad21l*u12l-( ad11-ad11l )-( ad22-ad11l )+ad21*u12 )&
+                           *ad21l
+                 v( 3 ) = ad32l*ad21l
+                 istart = ifirst
+                 call stdlib_slarfg( 3, v( 1 ), v( 2 ), 1, tau )
+                 v( 1 ) = one
+                 ! sweep
+                 loop_290: do j = istart, ilast - 2
+                    ! all but last elements: use 3x3 householder transforms.
+                    ! zero (j-1)st column of a
+                    if( j>istart ) then
+                       v( 1 ) = h( j, j-1 )
+                       v( 2 ) = h( j+1, j-1 )
+                       v( 3 ) = h( j+2, j-1 )
+                       call stdlib_slarfg( 3, h( j, j-1 ), v( 2 ), 1, tau )
+                       v( 1 ) = one
+                       h( j+1, j-1 ) = zero
+                       h( j+2, j-1 ) = zero
+                    end if
+                    do jc = j, ilastm
+                       temp = tau*( h( j, jc )+v( 2 )*h( j+1, jc )+v( 3 )*h( j+2, jc ) )
+                       h( j, jc ) = h( j, jc ) - temp
+                       h( j+1, jc ) = h( j+1, jc ) - temp*v( 2 )
+                       h( j+2, jc ) = h( j+2, jc ) - temp*v( 3 )
+                       temp2 = tau*( t( j, jc )+v( 2 )*t( j+1, jc )+v( 3 )*t( j+2, jc ) )
+                       t( j, jc ) = t( j, jc ) - temp2
+                       t( j+1, jc ) = t( j+1, jc ) - temp2*v( 2 )
+                       t( j+2, jc ) = t( j+2, jc ) - temp2*v( 3 )
+                    end do
+                    if( ilq ) then
+                       do jr = 1, n
+                          temp = tau*( q( jr, j )+v( 2 )*q( jr, j+1 )+v( 3 )*q( jr, j+2 ) )
+                                    
+                          q( jr, j ) = q( jr, j ) - temp
+                          q( jr, j+1 ) = q( jr, j+1 ) - temp*v( 2 )
+                          q( jr, j+2 ) = q( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    ! zero j-th column of b (see slagbc for details)
+                    ! swap rows to pivot
+                    ilpivt = .false.
+                    temp = max( abs( t( j+1, j+1 ) ), abs( t( j+1, j+2 ) ) )
+                    temp2 = max( abs( t( j+2, j+1 ) ), abs( t( j+2, j+2 ) ) )
+                    if( max( temp, temp2 )<safmin ) then
+                       scale = zero
+                       u1 = one
+                       u2 = zero
+                       go to 250
+                    else if( temp>=temp2 ) then
+                       w11 = t( j+1, j+1 )
+                       w21 = t( j+2, j+1 )
+                       w12 = t( j+1, j+2 )
+                       w22 = t( j+2, j+2 )
+                       u1 = t( j+1, j )
+                       u2 = t( j+2, j )
+                    else
+                       w21 = t( j+1, j+1 )
+                       w11 = t( j+2, j+1 )
+                       w22 = t( j+1, j+2 )
+                       w12 = t( j+2, j+2 )
+                       u2 = t( j+1, j )
+                       u1 = t( j+2, j )
+                    end if
+                    ! swap columns if nec.
+                    if( abs( w12 )>abs( w11 ) ) then
+                       ilpivt = .true.
+                       temp = w12
+                       temp2 = w22
+                       w12 = w11
+                       w22 = w21
+                       w11 = temp
+                       w21 = temp2
+                    end if
+                    ! lu-factor
+                    temp = w21 / w11
+                    u2 = u2 - temp*u1
+                    w22 = w22 - temp*w12
+                    w21 = zero
+                    ! compute scale
+                    scale = one
+                    if( abs( w22 )<safmin ) then
+                       scale = zero
+                       u2 = one
+                       u1 = -w12 / w11
+                       go to 250
+                    end if
+                    if( abs( w22 )<abs( u2 ) )scale = abs( w22 / u2 )
+                    if( abs( w11 )<abs( u1 ) )scale = min( scale, abs( w11 / u1 ) )
+                    ! solve
+                    u2 = ( scale*u2 ) / w22
+                    u1 = ( scale*u1-w12*u2 ) / w11
+                    250 continue
+                    if( ilpivt ) then
+                       temp = u2
+                       u2 = u1
+                       u1 = temp
+                    end if
+                    ! compute householder vector
+                    t1 = sqrt( scale**2+u1**2+u2**2 )
+                    tau = one + scale / t1
+                    vs = -one / ( scale+t1 )
+                    v( 1 ) = one
+                    v( 2 ) = vs*u1
+                    v( 3 ) = vs*u2
+                    ! apply transformations from the right.
+                    do jr = ifrstm, min( j+3, ilast )
+                       temp = tau*( h( jr, j )+v( 2 )*h( jr, j+1 )+v( 3 )*h( jr, j+2 ) )
+                       h( jr, j ) = h( jr, j ) - temp
+                       h( jr, j+1 ) = h( jr, j+1 ) - temp*v( 2 )
+                       h( jr, j+2 ) = h( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    do jr = ifrstm, j + 2
+                       temp = tau*( t( jr, j )+v( 2 )*t( jr, j+1 )+v( 3 )*t( jr, j+2 ) )
+                       t( jr, j ) = t( jr, j ) - temp
+                       t( jr, j+1 ) = t( jr, j+1 ) - temp*v( 2 )
+                       t( jr, j+2 ) = t( jr, j+2 ) - temp*v( 3 )
+                    end do
+                    if( ilz ) then
+                       do jr = 1, n
+                          temp = tau*( z( jr, j )+v( 2 )*z( jr, j+1 )+v( 3 )*z( jr, j+2 ) )
+                                    
+                          z( jr, j ) = z( jr, j ) - temp
+                          z( jr, j+1 ) = z( jr, j+1 ) - temp*v( 2 )
+                          z( jr, j+2 ) = z( jr, j+2 ) - temp*v( 3 )
+                       end do
+                    end if
+                    t( j+1, j ) = zero
+                    t( j+2, j ) = zero
+                 end do loop_290
+                 ! last elements: use givens rotations
+                 ! rotations from the left
+                 j = ilast - 1
+                 temp = h( j, j-1 )
+                 call stdlib_slartg( temp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                 h( j+1, j-1 ) = zero
+                 do jc = j, ilastm
+                    temp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -s*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = temp
+                    temp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -s*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = temp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       temp = c*q( jr, j ) + s*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = temp
+                    end do
+                 end if
+                 ! rotations from the right.
+                 temp = t( j+1, j+1 )
+                 call stdlib_slartg( temp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = zero
+                 do jr = ifrstm, ilast
+                    temp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -s*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = temp
+                 end do
+                 do jr = ifrstm, ilast - 1
+                    temp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -s*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = temp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       temp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -s*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = temp
+                    end do
+                 end if
+                 ! end of double-shift code
+              end if
+              go to 350
+              ! end of iteration loop
+              350 continue
+           end do loop_360
+           ! drop-through = non-convergence
+           info = ilast
+           go to 420
+           ! successful completion of all qz steps
+           380 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              if( t( j, j )<zero ) then
+                 if( ilschr ) then
+                    do jr = 1, j
+                       h( jr, j ) = -h( jr, j )
+                       t( jr, j ) = -t( jr, j )
+                    end do
+                 else
+                    h( j, j ) = -h( j, j )
+                    t( j, j ) = -t( j, j )
+                 end if
+                 if( ilz ) then
+                    do jr = 1, n
+                       z( jr, j ) = -z( jr, j )
+                    end do
+                 end if
+              end if
+              alphar( j ) = h( j, j )
+              alphai( j ) = zero
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           420 continue
+           work( 1 ) = real( n,KIND=sp)
+           return
+     end subroutine stdlib_shgeqz
+
+     !> SLABRD: reduces the first NB rows and columns of a real general
+     !> m by n matrix A to upper or lower bidiagonal form by an orthogonal
+     !> transformation Q**T * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by SGEBRD
+
+     pure subroutine stdlib_slabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), e(*), taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_sgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, one, a( i, i ), 1 )
+                 call stdlib_sgemv( 'NO TRANSPOSE', m-i+1, i-1, -one, x( i, 1 ),ldx, a( 1, i ), 1,&
+                            one, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 call stdlib_slarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tauq( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<n ) then
+                    a( i, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_sgemv( 'TRANSPOSE', m-i+1, n-i, one, a( i, i+1 ),lda, a( i, i ), &
+                              1, zero, y( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', m-i+1, i-1, one, a( i, 1 ), lda,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', m-i+1, i-1, one, x( i, 1 ), ldx,a( i, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, y( 1, i ), 1,&
+                               one, y( i+1, i ), 1 )
+                    call stdlib_sscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-i, i, -one, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, one, a( i, i+1 ), lda )
+                    call stdlib_sgemv( 'TRANSPOSE', i-1, n-i, -one, a( 1, i+1 ),lda, x( i, 1 ), &
+                              ldx, one, a( i, i+1 ), lda )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    call stdlib_slarfg( n-i, a( i, i+1 ), a( i, min( i+2, n ) ),lda, taup( i ) )
+                              
+                    e( i ) = a( i, i+1 )
+                    a( i, i+1 ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, zero, x( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', n-i, i, one, y( i+1, 1 ), ldy,a( i, i+1 ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i, -one, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, one, x( i+1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', i-1, n-i, one, a( 1, i+1 ),lda, a( i, i+1 )&
+                              , lda, zero, x( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_sscal( m-i, taup( i ), x( i+1, i ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-i+1, i-1, -one, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, one, a( i, i ), lda )
+                 call stdlib_sgemv( 'TRANSPOSE', i-1, n-i+1, -one, a( 1, i ), lda,x( i, 1 ), ldx, &
+                           one, a( i, i ), lda )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 call stdlib_slarfg( n-i+1, a( i, i ), a( i, min( i+1, n ) ), lda,taup( i ) )
+                           
+                 d( i ) = a( i, i )
+                 if( i<m ) then
+                    a( i, i ) = one
+                    ! compute x(i+1:m,i)
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, n-i+1, one, a( i+1, i ),lda, a( i, i )&
+                              , lda, zero, x( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', n-i+1, i-1, one, y( i, 1 ), ldy,a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', i-1, n-i+1, one, a( 1, i ),lda, a( i, i ), &
+                              lda, zero, x( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i-1, -one, x( i+1, 1 ),ldx, x( 1, i ),&
+                               1, one, x( i+1, i ), 1 )
+                    call stdlib_sscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    ! update a(i+1:m,i)
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i-1, -one, a( i+1, 1 ),lda, y( i, 1 ),&
+                               ldy, one, a( i+1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', m-i, i, -one, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, one, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    call stdlib_slarfg( m-i, a( i+1, i ), a( min( i+2, m ), i ), 1,tauq( i ) )
+                              
+                    e( i ) = a( i+1, i )
+                    a( i+1, i ) = one
+                    ! compute y(i+1:n,i)
+                    call stdlib_sgemv( 'TRANSPOSE', m-i, n-i, one, a( i+1, i+1 ),lda, a( i+1, i ),&
+                               1, zero, y( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', m-i, i-1, one, a( i+1, 1 ), lda,a( i+1, i ), &
+                              1, zero, y( 1, i ), 1 )
+                    call stdlib_sgemv( 'NO TRANSPOSE', n-i, i-1, -one, y( i+1, 1 ),ldy, y( 1, i ),&
+                               1, one, y( i+1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', m-i, i, one, x( i+1, 1 ), ldx,a( i+1, i ), 1, &
+                              zero, y( 1, i ), 1 )
+                    call stdlib_sgemv( 'TRANSPOSE', i, n-i, -one, a( 1, i+1 ), lda,y( 1, i ), 1, &
+                              one, y( i+1, i ), 1 )
+                    call stdlib_sscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_slabrd
+
+     !> SLADIV: performs complex division in  real arithmetic
+     !> a + i*b
+     !> p + i*q = ---------
+     !> c + i*d
+     !> The algorithm is due to Michael Baudin and Robert L. Smith
+     !> and can be found in the paper
+     !> "A Robust Complex Division in Scilab"
+
+     pure subroutine stdlib_sladiv( a, b, c, d, p, q )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: a, b, c, d
+           real(sp), intent(out) :: p, q
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: bs = 2.0_sp
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, cc, dd, ab, cd, s, ov, un, be, eps
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           aa = a
+           bb = b
+           cc = c
+           dd = d
+           ab = max( abs(a), abs(b) )
+           cd = max( abs(c), abs(d) )
+           s = one
+           ov = stdlib_slamch( 'OVERFLOW THRESHOLD' )
+           un = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'EPSILON' )
+           be = bs / (eps*eps)
+           if( ab >= half*ov ) then
+              aa = half * aa
+              bb = half * bb
+              s  = two * s
+           end if
+           if( cd >= half*ov ) then
+              cc = half * cc
+              dd = half * dd
+              s  = half * s
+           end if
+           if( ab <= un*bs/eps ) then
+              aa = aa * be
+              bb = bb * be
+              s  = s / be
+           end if
+           if( cd <= un*bs/eps ) then
+              cc = cc * be
+              dd = dd * be
+              s  = s * be
+           end if
+           if( abs( d )<=abs( c ) ) then
+              call stdlib_sladiv1(aa, bb, cc, dd, p, q)
+           else
+              call stdlib_sladiv1(bb, aa, dd, cc, p, q)
+              q = -q
+           end if
+           p = p * s
+           q = q * s
+           return
+     end subroutine stdlib_sladiv
+
+     !> This subroutine computes the I-th updated eigenvalue of a symmetric
+     !> rank-one modification to a diagonal matrix whose elements are
+     !> given in the array d, and that
+     !> D(i) < D(j)  for  i < j
+     !> and that RHO > 0.  This is arranged by the calling routine, and is
+     !> no loss in generality.  The rank-one modified system is thus
+     !> diag( D )  +  RHO * Z * Z_transpose.
+     !> where we assume the Euclidean norm of Z is 1.
+     !> The method consists of approximating the rational functions in the
+     !> secular equation by simpler interpolating rational functions.
+
+     pure subroutine stdlib_slaed4( n, i, d, z, delta, rho, dlam, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: i, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(out) :: dlam
+           real(sp), intent(in) :: rho
+           ! Array Arguments 
+           real(sp), intent(in) :: d(*), z(*)
+           real(sp), intent(out) :: delta(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           ! Local Scalars 
+           logical(lk) :: orgati, swtch, swtch3
+           integer(ilp) :: ii, iim1, iip1, ip1, iter, j, niter
+           real(sp) :: a, b, c, del, dltlb, dltub, dphi, dpsi, dw, eps, erretm, eta, midpt, phi, &
+                     prew, psi, rhoinv, tau, temp, temp1, w
+           ! Local Arrays 
+           real(sp) :: zz(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! since this routine is called in an inner loop, we do no argument
+           ! checking.
+           ! quick return for n=1 and 2.
+           info = 0
+           if( n==1 ) then
+               ! presumably, i=1 upon entry
+              dlam = d( 1 ) + rho*z( 1 )*z( 1 )
+              delta( 1 ) = one
+              return
+           end if
+           if( n==2 ) then
+              call stdlib_slaed5( i, d, z, delta, rho, dlam )
+              return
+           end if
+           ! compute machine epsilon
+           eps = stdlib_slamch( 'EPSILON' )
+           rhoinv = one / rho
+           ! the case i = n
+           if( i==n ) then
+              ! initialize some basic variables
+              ii = n - 1
+              niter = 1
+              ! calculate initial guess
+              midpt = rho / two
+              ! if ||z||_2 is not one, then temp should be set to
+              ! rho * ||z||_2^2 / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, n - 2
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi
+              w = c + z( ii )*z( ii ) / delta( ii ) +z( n )*z( n ) / delta( n )
+              if( w<=zero ) then
+                 temp = z( n-1 )*z( n-1 ) / ( d( n )-d( n-1 )+rho ) +z( n )*z( n ) / rho
+                 if( c<=temp ) then
+                    tau = rho
+                 else
+                    del = d( n ) - d( n-1 )
+                    a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                    b = z( n )*z( n )*del
+                    if( a<zero ) then
+                       tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                    else
+                       tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                    end if
+                 end if
+                 ! it can be proved that
+                     ! d(n)+rho/2 <= lambda(n) < d(n)+tau <= d(n)+rho
+                 dltlb = midpt
+                 dltub = rho
+              else
+                 del = d( n ) - d( n-1 )
+                 a = -c*del + z( n-1 )*z( n-1 ) + z( n )*z( n )
+                 b = z( n )*z( n )*del
+                 if( a<zero ) then
+                    tau = two*b / ( sqrt( a*a+four*b*c )-a )
+                 else
+                    tau = ( a+sqrt( a*a+four*b*c ) ) / ( two*c )
+                 end if
+                 ! it can be proved that
+                     ! d(n) < d(n)+tau < lambda(n) < d(n)+rho/2
+                 dltlb = zero
+                 dltub = midpt
+              end if
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - tau
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 dlam = d( i ) + tau
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              c = w - delta( n-1 )*dpsi - delta( n )*dphi
+              a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+              b = delta( n-1 )*delta( n )*w
+              if( c<zero )c = abs( c )
+              if( c==zero ) then
+                ! eta = b/a
+                 ! eta = rho - tau
+                 eta = dltub - tau
+              else if( a>=zero ) then
+                 eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+              else
+                 eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>zero )eta = -w / ( dpsi+dphi )
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              tau = tau + eta
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, ii
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              temp = z( n ) / delta( n )
+              phi = z( n )*temp
+              dphi = temp*temp
+              erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                        
+              w = rhoinv + phi + psi
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_90: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    dlam = d( i ) + tau
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 c = w - delta( n-1 )*dpsi - delta( n )*dphi
+                 a = ( delta( n-1 )+delta( n ) )*w -delta( n-1 )*delta( n )*( dpsi+dphi )
+                 b = delta( n-1 )*delta( n )*w
+                 if( a>=zero ) then
+                    eta = ( a+sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a-sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>zero )eta = -w / ( dpsi+dphi )
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, ii
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 temp = z( n ) / delta( n )
+                 phi = z( n )*temp
+                 dphi = temp*temp
+                 erretm = eight*( -phi-psi ) + erretm - phi + rhoinv +abs( tau )*( dpsi+dphi )
+                           
+                 w = rhoinv + phi + psi
+              end do loop_90
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              dlam = d( i ) + tau
+              go to 250
+              ! end for the case i = n
+           else
+              ! the case for i < n
+              niter = 1
+              ip1 = i + 1
+              ! calculate initial guess
+              del = d( ip1 ) - d( i )
+              midpt = del / two
+              do j = 1, n
+                 delta( j ) = ( d( j )-d( i ) ) - midpt
+              end do
+              psi = zero
+              do j = 1, i - 1
+                 psi = psi + z( j )*z( j ) / delta( j )
+              end do
+              phi = zero
+              do j = n, i + 2, -1
+                 phi = phi + z( j )*z( j ) / delta( j )
+              end do
+              c = rhoinv + psi + phi
+              w = c + z( i )*z( i ) / delta( i ) +z( ip1 )*z( ip1 ) / delta( ip1 )
+              if( w>zero ) then
+                 ! d(i)< the ith eigenvalue < (d(i)+d(i+1))/2
+                 ! we choose d(i) as origin.
+                 orgati = .true.
+                 a = c*del + z( i )*z( i ) + z( ip1 )*z( ip1 )
+                 b = z( i )*z( i )*del
+                 if( a>zero ) then
+                    tau = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 else
+                    tau = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = zero
+                 dltub = midpt
+              else
+                 ! (d(i)+d(i+1))/2 <= the ith eigenvalue < d(i+1)
+                 ! we choose d(i+1) as origin.
+                 orgati = .false.
+                 a = c*del - z( i )*z( i ) - z( ip1 )*z( ip1 )
+                 b = z( ip1 )*z( ip1 )*del
+                 if( a<zero ) then
+                    tau = two*b / ( a-sqrt( abs( a*a+four*b*c ) ) )
+                 else
+                    tau = -( a+sqrt( abs( a*a+four*b*c ) ) ) / ( two*c )
+                 end if
+                 dltlb = -midpt
+                 dltub = zero
+              end if
+              if( orgati ) then
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( i ) ) - tau
+                 end do
+              else
+                 do j = 1, n
+                    delta( j ) = ( d( j )-d( ip1 ) ) - tau
+                 end do
+              end if
+              if( orgati ) then
+                 ii = i
+              else
+                 ii = i + 1
+              end if
+              iim1 = ii - 1
+              iip1 = ii + 1
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              w = rhoinv + phi + psi
+              ! w is the value of the secular function with
+              ! its ii-th element removed.
+              swtch3 = .false.
+              if( orgati ) then
+                 if( w<zero )swtch3 = .true.
+              else
+                 if( w>zero )swtch3 = .true.
+              end if
+              if( ii==1 .or. ii==n )swtch3 = .false.
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = w + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                        *dw
+              ! test for convergence
+              if( abs( w )<=eps*erretm ) then
+                 if( orgati ) then
+                    dlam = d( i ) + tau
+                 else
+                    dlam = d( ip1 ) + tau
+                 end if
+                 go to 250
+              end if
+              if( w<=zero ) then
+                 dltlb = max( dltlb, tau )
+              else
+                 dltub = min( dltub, tau )
+              end if
+              ! calculate the new step
+              niter = niter + 1
+              if( .not.swtch3 ) then
+                 if( orgati ) then
+                    c = w - delta( ip1 )*dw - ( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                              **2
+                 else
+                    c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                              **2
+                 end if
+                 a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                 b = delta( i )*delta( ip1 )*w
+                 if( c==zero ) then
+                    if( a==zero ) then
+                       if( orgati ) then
+                          a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                       else
+                          a = z( ip1 )*z( ip1 ) + delta( i )*delta( i )*( dpsi+dphi )
+                       end if
+                    end if
+                    eta = b / a
+                 else if( a<=zero ) then
+                    eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                 else
+                    eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                 end if
+              else
+                 ! interpolation using three most relevant poles
+                 temp = rhoinv + psi + phi
+                 if( orgati ) then
+                    temp1 = z( iim1 ) / delta( iim1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )*temp1
+                    zz( 1 ) = z( iim1 )*z( iim1 )
+                    zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                 else
+                    temp1 = z( iip1 ) / delta( iip1 )
+                    temp1 = temp1*temp1
+                    c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )*temp1
+                    zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                    zz( 3 ) = z( iip1 )*z( iip1 )
+                 end if
+                 zz( 2 ) = z( ii )*z( ii )
+                 call stdlib_slaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                 if( info/=0 )go to 250
+              end if
+              ! note, eta should be positive if w is negative, and
+              ! eta should be negative otherwise. however,
+              ! if for some reason caused by roundoff, eta*w > 0,
+              ! we simply use one newton step instead. this way
+              ! will guarantee eta*w < 0.
+              if( w*eta>=zero )eta = -w / dw
+              temp = tau + eta
+              if( temp>dltub .or. temp<dltlb ) then
+                 if( w<zero ) then
+                    eta = ( dltub-tau ) / two
+                 else
+                    eta = ( dltlb-tau ) / two
+                 end if
+              end if
+              prew = w
+              do j = 1, n
+                 delta( j ) = delta( j ) - eta
+              end do
+              ! evaluate psi and the derivative dpsi
+              dpsi = zero
+              psi = zero
+              erretm = zero
+              do j = 1, iim1
+                 temp = z( j ) / delta( j )
+                 psi = psi + z( j )*temp
+                 dpsi = dpsi + temp*temp
+                 erretm = erretm + psi
+              end do
+              erretm = abs( erretm )
+              ! evaluate phi and the derivative dphi
+              dphi = zero
+              phi = zero
+              do j = n, iip1, -1
+                 temp = z( j ) / delta( j )
+                 phi = phi + z( j )*temp
+                 dphi = dphi + temp*temp
+                 erretm = erretm + phi
+              end do
+              temp = z( ii ) / delta( ii )
+              dw = dpsi + dphi + temp*temp
+              temp = z( ii )*temp
+              w = rhoinv + phi + psi + temp
+              erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau+eta )&
+                        *dw
+              swtch = .false.
+              if( orgati ) then
+                 if( -w>abs( prew ) / ten )swtch = .true.
+              else
+                 if( w>abs( prew ) / ten )swtch = .true.
+              end if
+              tau = tau + eta
+              ! main loop to update the values of the array   delta
+              iter = niter + 1
+              loop_240: do niter = iter, maxit
+                 ! test for convergence
+                 if( abs( w )<=eps*erretm ) then
+                    if( orgati ) then
+                       dlam = d( i ) + tau
+                    else
+                       dlam = d( ip1 ) + tau
+                    end if
+                    go to 250
+                 end if
+                 if( w<=zero ) then
+                    dltlb = max( dltlb, tau )
+                 else
+                    dltub = min( dltub, tau )
+                 end if
+                 ! calculate the new step
+                 if( .not.swtch3 ) then
+                    if( .not.swtch ) then
+                       if( orgati ) then
+                          c = w - delta( ip1 )*dw -( d( i )-d( ip1 ) )*( z( i ) / delta( i ) )&
+                                    **2
+                       else
+                          c = w - delta( i )*dw - ( d( ip1 )-d( i ) )*( z( ip1 ) / delta( ip1 ) )&
+                                    **2
+                       end if
+                    else
+                       temp = z( ii ) / delta( ii )
+                       if( orgati ) then
+                          dpsi = dpsi + temp*temp
+                       else
+                          dphi = dphi + temp*temp
+                       end if
+                       c = w - delta( i )*dpsi - delta( ip1 )*dphi
+                    end if
+                    a = ( delta( i )+delta( ip1 ) )*w -delta( i )*delta( ip1 )*dw
+                    b = delta( i )*delta( ip1 )*w
+                    if( c==zero ) then
+                       if( a==zero ) then
+                          if( .not.swtch ) then
+                             if( orgati ) then
+                                a = z( i )*z( i ) + delta( ip1 )*delta( ip1 )*( dpsi+dphi )
+                                          
+                             else
+                                a = z( ip1 )*z( ip1 ) +delta( i )*delta( i )*( dpsi+dphi )
+                             end if
+                          else
+                             a = delta( i )*delta( i )*dpsi +delta( ip1 )*delta( ip1 )&
+                                       *dphi
+                          end if
+                       end if
+                       eta = b / a
+                    else if( a<=zero ) then
+                       eta = ( a-sqrt( abs( a*a-four*b*c ) ) ) / ( two*c )
+                    else
+                       eta = two*b / ( a+sqrt( abs( a*a-four*b*c ) ) )
+                    end if
+                 else
+                    ! interpolation using three most relevant poles
+                    temp = rhoinv + psi + phi
+                    if( swtch ) then
+                       c = temp - delta( iim1 )*dpsi - delta( iip1 )*dphi
+                       zz( 1 ) = delta( iim1 )*delta( iim1 )*dpsi
+                       zz( 3 ) = delta( iip1 )*delta( iip1 )*dphi
+                    else
+                       if( orgati ) then
+                          temp1 = z( iim1 ) / delta( iim1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iip1 )*( dpsi+dphi ) -( d( iim1 )-d( iip1 ) )&
+                                    *temp1
+                          zz( 1 ) = z( iim1 )*z( iim1 )
+                          zz( 3 ) = delta( iip1 )*delta( iip1 )*( ( dpsi-temp1 )+dphi )
+                       else
+                          temp1 = z( iip1 ) / delta( iip1 )
+                          temp1 = temp1*temp1
+                          c = temp - delta( iim1 )*( dpsi+dphi ) -( d( iip1 )-d( iim1 ) )&
+                                    *temp1
+                          zz( 1 ) = delta( iim1 )*delta( iim1 )*( dpsi+( dphi-temp1 ) )
+                          zz( 3 ) = z( iip1 )*z( iip1 )
+                       end if
+                    end if
+                    call stdlib_slaed6( niter, orgati, c, delta( iim1 ), zz, w, eta,info )
+                    if( info/=0 )go to 250
+                 end if
+                 ! note, eta should be positive if w is negative, and
+                 ! eta should be negative otherwise. however,
+                 ! if for some reason caused by roundoff, eta*w > 0,
+                 ! we simply use one newton step instead. this way
+                 ! will guarantee eta*w < 0.
+                 if( w*eta>=zero )eta = -w / dw
+                 temp = tau + eta
+                 if( temp>dltub .or. temp<dltlb ) then
+                    if( w<zero ) then
+                       eta = ( dltub-tau ) / two
+                    else
+                       eta = ( dltlb-tau ) / two
+                    end if
+                 end if
+                 do j = 1, n
+                    delta( j ) = delta( j ) - eta
+                 end do
+                 tau = tau + eta
+                 prew = w
+                 ! evaluate psi and the derivative dpsi
+                 dpsi = zero
+                 psi = zero
+                 erretm = zero
+                 do j = 1, iim1
+                    temp = z( j ) / delta( j )
+                    psi = psi + z( j )*temp
+                    dpsi = dpsi + temp*temp
+                    erretm = erretm + psi
+                 end do
+                 erretm = abs( erretm )
+                 ! evaluate phi and the derivative dphi
+                 dphi = zero
+                 phi = zero
+                 do j = n, iip1, -1
+                    temp = z( j ) / delta( j )
+                    phi = phi + z( j )*temp
+                    dphi = dphi + temp*temp
+                    erretm = erretm + phi
+                 end do
+                 temp = z( ii ) / delta( ii )
+                 dw = dpsi + dphi + temp*temp
+                 temp = z( ii )*temp
+                 w = rhoinv + phi + psi + temp
+                 erretm = eight*( phi-psi ) + erretm + two*rhoinv +three*abs( temp ) + abs( tau )&
+                           *dw
+                 if( w*prew>zero .and. abs( w )>abs( prew ) / ten )swtch = .not.swtch
+              end do loop_240
+              ! return with info = 1, niter = maxit and not converged
+              info = 1
+              if( orgati ) then
+                 dlam = d( i ) + tau
+              else
+                 dlam = d( ip1 ) + tau
+              end if
+           end if
+           250 continue
+           return
+     end subroutine stdlib_slaed4
+
+     !> SLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_slaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho,cutpnt, z, dlamda, &
+               q2, ldq2, w, perm, givptr,givcol, givnum, indxp, indx, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, icompq, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(sp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(sp), intent(out) :: dlamda(*), givnum(2,*), q2(ldq2,*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: mone = -1.0_sp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(sp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -3
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -4
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -10
+           else if( ldq2<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_sscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_sscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_slamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_isamax( n, z, 1 )
+           jmax = stdlib_isamax( n, d, 1 )
+           eps = stdlib_slamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              if( icompq==0 ) then
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                 end do
+              else
+                 do j = 1, n
+                    perm( j ) = indxq( indx( j ) )
+                    call stdlib_scopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+                 end do
+                 call stdlib_slacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ),ldq )
+              end if
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 110
+              else
+                 jlam = j
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           if( j>n )go to 100
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_slapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 if( icompq==1 ) then
+                    call stdlib_srot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j &
+                              ) ) ), 1, c, s )
+                 end if
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           110 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           if( icompq==0 ) then
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+              end do
+           else
+              do j = 1, n
+                 jp = indxp( j )
+                 dlamda( j ) = d( jp )
+                 perm( j ) = indxq( indx( jp ) )
+                 call stdlib_scopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+           end if
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              if( icompq==0 ) then
+                 call stdlib_scopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              else
+                 call stdlib_scopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+                 call stdlib_slacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2,q( 1, k+1 ), ldq )
+              end if
+           end if
+           return
+     end subroutine stdlib_slaed8
+
+     !> SLAED9: finds the roots of the secular equation, as defined by the
+     !> values in D, Z, and RHO, between KSTART and KSTOP.  It makes the
+     !> appropriate calls to SLAED4 and then stores the new matrix of
+     !> eigenvectors for use in calculating the next level of Z vectors.
+
+     pure subroutine stdlib_slaed9( k, kstart, kstop, n, d, q, ldq, rho, dlamda, w,s, lds, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, kstart, kstop, ldq, lds, n
+           real(sp), intent(in) :: rho
+           ! Array Arguments 
+           real(sp), intent(out) :: d(*), q(ldq,*), s(lds,*)
+           real(sp), intent(inout) :: dlamda(*), w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( kstart<1 .or. kstart>max( 1, k ) ) then
+              info = -2
+           else if( max( 1, kstop )<kstart .or. kstop>max( 1, k ) )then
+              info = -3
+           else if( n<k ) then
+              info = -4
+           else if( ldq<max( 1, k ) ) then
+              info = -7
+           else if( lds<max( 1, k ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED9', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, n
+              dlamda( i ) = stdlib_slamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = kstart, kstop
+              call stdlib_slaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 .or. k==2 ) then
+              do i = 1, k
+                 do j = 1, k
+                    s( j, i ) = q( j, i )
+                 end do
+              end do
+              go to 120
+           end if
+           ! compute updated w.
+           call stdlib_scopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_scopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i, 1 ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 q( i, j ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_snrm2( k, q( 1, j ), 1 )
+              do i = 1, k
+                 s( i, j ) = q( i, j ) / temp
+              end do
+           end do
+           120 continue
+           return
+     end subroutine stdlib_slaed9
+
+     !> SLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue (WR,WI) of a real upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_slaein( rightv, noinit, n, h, ldh, wr, wi, vr, vi, b,ldb, work, eps3, &
+               smlnum, bignum, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(sp), intent(in) :: bignum, eps3, smlnum, wi, wr
+           ! Array Arguments 
+           real(sp), intent(out) :: b(ldb,*), work(*)
+           real(sp), intent(in) :: h(ldh,*)
+           real(sp), intent(inout) :: vi(*), vr(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: tenth = 1.0e-1_sp
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, i1, i2, i3, ierr, its, j
+           real(sp) :: absbii, absbjj, ei, ej, growto, norm, nrmsml, rec, rootn, scale, temp, &
+                     vcrit, vmax, vnorm, w, w1, x, xi, xr, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,sqrt
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=sp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - (wr,wi)*i (except that the subdiagonal elements and
+           ! the imaginary parts of the diagonal elements are not stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - wr
+           end do
+           if( wi==zero ) then
+              ! real eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 vnorm = stdlib_snrm2( n, vr, 1 )
+                 call stdlib_sscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), vr,1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do i = 1, n - 1
+                    ei = h( i+1, i )
+                    if( abs( b( i, i ) )<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       x = b( i, i ) / ei
+                       b( i, i ) = ei
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( i, i )==zero )b( i, i ) = eps3
+                       x = ei / b( i, i )
+                       if( x/=zero ) then
+                          do j = i + 1, n
+                             b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( n, n )==zero )b( n, n ) = eps3
+                 trans = 'N'
+              else
+                 ! ul decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    if( abs( b( j, j ) )<abs( ej ) ) then
+                       ! interchange columns and eliminate.
+                       x = b( j, j ) / ej
+                       b( j, j ) = ej
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - x*temp
+                          b( i, j ) = temp
+                       end do
+                    else
+                       ! eliminate without interchange.
+                       if( b( j, j )==zero )b( j, j ) = eps3
+                       x = ej / b( j, j )
+                       if( x/=zero ) then
+                          do i = 1, j - 1
+                             b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                          end do
+                       end if
+                    end if
+                 end do
+                 if( b( 1, 1 )==zero )b( 1, 1 ) = eps3
+                 trans = 'T'
+              end if
+              normin = 'N'
+              do its = 1, n
+                 ! solve u*x = scale*v for a right eigenvector
+                   ! or u**t*x = scale*v for a left eigenvector,
+                 ! overwriting x on v.
+                 call stdlib_slatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb,vr, scale, work,&
+                            ierr )
+                 normin = 'Y'
+                 ! test for sufficient growth in the norm of v.
+                 vnorm = stdlib_sasum( n, vr, 1 )
+                 if( vnorm>=growto*scale )go to 120
+                 ! choose new orthogonal starting vector and try again.
+                 temp = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 do i = 2, n
+                    vr( i ) = temp
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do
+              ! failure to find eigenvector in n iterations.
+              info = 1
+              120 continue
+              ! normalize eigenvector.
+              i = stdlib_isamax( n, vr, 1 )
+              call stdlib_sscal( n, one / abs( vr( i ) ), vr, 1 )
+           else
+              ! complex eigenvalue.
+              if( noinit ) then
+                 ! set initial vector.
+                 do i = 1, n
+                    vr( i ) = eps3
+                    vi( i ) = zero
+                 end do
+              else
+                 ! scale supplied initial vector.
+                 norm = stdlib_slapy2( stdlib_snrm2( n, vr, 1 ), stdlib_snrm2( n, vi, 1 ) )
+                           
+                 rec = ( eps3*rootn ) / max( norm, nrmsml )
+                 call stdlib_sscal( n, rec, vr, 1 )
+                 call stdlib_sscal( n, rec, vi, 1 )
+              end if
+              if( rightv ) then
+                 ! lu decomposition with partial pivoting of b, replacing zero
+                 ! pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( 2, 1 ) = -wi
+                 do i = 2, n
+                    b( i+1, 1 ) = zero
+                 end do
+                 loop_170: do i = 1, n - 1
+                    absbii = stdlib_slapy2( b( i, i ), b( i+1, i ) )
+                    ei = h( i+1, i )
+                    if( absbii<abs( ei ) ) then
+                       ! interchange rows and eliminate.
+                       xr = b( i, i ) / ei
+                       xi = b( i+1, i ) / ei
+                       b( i, i ) = ei
+                       b( i+1, i ) = zero
+                       do j = i + 1, n
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - xr*temp
+                          b( j+1, i+1 ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( i+2, i ) = -wi
+                       b( i+1, i+1 ) = b( i+1, i+1 ) - xi*wi
+                       b( i+2, i+1 ) = b( i+2, i+1 ) + xr*wi
+                    else
+                       ! eliminate without interchanging rows.
+                       if( absbii==zero ) then
+                          b( i, i ) = eps3
+                          b( i+1, i ) = zero
+                          absbii = eps3
+                       end if
+                       ei = ( ei / absbii ) / absbii
+                       xr = b( i, i )*ei
+                       xi = -b( i+1, i )*ei
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j+1, i+1 ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( i+2, i+1 ) = b( i+2, i+1 ) - wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of i-th row.
+                    work( i ) = stdlib_sasum( n-i, b( i, i+1 ), ldb ) +stdlib_sasum( n-i, b( i+2, &
+                              i ), 1 )
+                 end do loop_170
+                 if( b( n, n )==zero .and. b( n+1, n )==zero )b( n, n ) = eps3
+                 work( n ) = zero
+                 i1 = n
+                 i2 = 1
+                 i3 = -1
+              else
+                 ! ul decomposition with partial pivoting of conjg(b),
+                 ! replacing zero pivots by eps3.
+                 ! the imaginary part of the (i,j)-th element of u is stored in
+                 ! b(j+1,i).
+                 b( n+1, n ) = wi
+                 do j = 1, n - 1
+                    b( n+1, j ) = zero
+                 end do
+                 loop_210: do j = n, 2, -1
+                    ej = h( j, j-1 )
+                    absbjj = stdlib_slapy2( b( j, j ), b( j+1, j ) )
+                    if( absbjj<abs( ej ) ) then
+                       ! interchange columns and eliminate
+                       xr = b( j, j ) / ej
+                       xi = b( j+1, j ) / ej
+                       b( j, j ) = ej
+                       b( j+1, j ) = zero
+                       do i = 1, j - 1
+                          temp = b( i, j-1 )
+                          b( i, j-1 ) = b( i, j ) - xr*temp
+                          b( j, i ) = b( j+1, i ) - xi*temp
+                          b( i, j ) = temp
+                          b( j+1, i ) = zero
+                       end do
+                       b( j+1, j-1 ) = wi
+                       b( j-1, j-1 ) = b( j-1, j-1 ) + xi*wi
+                       b( j, j-1 ) = b( j, j-1 ) - xr*wi
+                    else
+                       ! eliminate without interchange.
+                       if( absbjj==zero ) then
+                          b( j, j ) = eps3
+                          b( j+1, j ) = zero
+                          absbjj = eps3
+                       end if
+                       ej = ( ej / absbjj ) / absbjj
+                       xr = b( j, j )*ej
+                       xi = -b( j+1, j )*ej
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - xr*b( i, j ) +xi*b( j+1, i )
+                          b( j, i ) = -xr*b( j+1, i ) - xi*b( i, j )
+                       end do
+                       b( j, j-1 ) = b( j, j-1 ) + wi
+                    end if
+                    ! compute 1-norm of offdiagonal elements of j-th column.
+                    work( j ) = stdlib_sasum( j-1, b( 1, j ), 1 ) +stdlib_sasum( j-1, b( j+1, 1 ),&
+                               ldb )
+                 end do loop_210
+                 if( b( 1, 1 )==zero .and. b( 2, 1 )==zero )b( 1, 1 ) = eps3
+                 work( 1 ) = zero
+                 i1 = 1
+                 i2 = n
+                 i3 = 1
+              end if
+              loop_270: do its = 1, n
+                 scale = one
+                 vmax = one
+                 vcrit = bignum
+                 ! solve u*(xr,xi) = scale*(vr,vi) for a right eigenvector,
+                   ! or u**t*(xr,xi) = scale*(vr,vi) for a left eigenvector,
+                 ! overwriting (xr,xi) on (vr,vi).
+                 loop_250: do i = i1, i2, i3
+                    if( work( i )>vcrit ) then
+                       rec = one / vmax
+                       call stdlib_sscal( n, rec, vr, 1 )
+                       call stdlib_sscal( n, rec, vi, 1 )
+                       scale = scale*rec
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                    xr = vr( i )
+                    xi = vi( i )
+                    if( rightv ) then
+                       do j = i + 1, n
+                          xr = xr - b( i, j )*vr( j ) + b( j+1, i )*vi( j )
+                          xi = xi - b( i, j )*vi( j ) - b( j+1, i )*vr( j )
+                       end do
+                    else
+                       do j = 1, i - 1
+                          xr = xr - b( j, i )*vr( j ) + b( i+1, j )*vi( j )
+                          xi = xi - b( j, i )*vi( j ) - b( i+1, j )*vr( j )
+                       end do
+                    end if
+                    w = abs( b( i, i ) ) + abs( b( i+1, i ) )
+                    if( w>smlnum ) then
+                       if( w<one ) then
+                          w1 = abs( xr ) + abs( xi )
+                          if( w1>w*bignum ) then
+                             rec = one / w1
+                             call stdlib_sscal( n, rec, vr, 1 )
+                             call stdlib_sscal( n, rec, vi, 1 )
+                             xr = vr( i )
+                             xi = vi( i )
+                             scale = scale*rec
+                             vmax = vmax*rec
+                          end if
+                       end if
+                       ! divide by diagonal element of b.
+                       call stdlib_sladiv( xr, xi, b( i, i ), b( i+1, i ), vr( i ),vi( i ) )
+                                 
+                       vmax = max( abs( vr( i ) )+abs( vi( i ) ), vmax )
+                       vcrit = bignum / vmax
+                    else
+                       do j = 1, n
+                          vr( j ) = zero
+                          vi( j ) = zero
+                       end do
+                       vr( i ) = one
+                       vi( i ) = one
+                       scale = zero
+                       vmax = one
+                       vcrit = bignum
+                    end if
+                 end do loop_250
+                 ! test for sufficient growth in the norm of (vr,vi).
+                 vnorm = stdlib_sasum( n, vr, 1 ) + stdlib_sasum( n, vi, 1 )
+                 if( vnorm>=growto*scale )go to 280
+                 ! choose a new orthogonal starting vector and try again.
+                 y = eps3 / ( rootn+one )
+                 vr( 1 ) = eps3
+                 vi( 1 ) = zero
+                 do i = 2, n
+                    vr( i ) = y
+                    vi( i ) = zero
+                 end do
+                 vr( n-its+1 ) = vr( n-its+1 ) - eps3*rootn
+              end do loop_270
+              ! failure to find eigenvector in n iterations
+              info = 1
+              280 continue
+              ! normalize eigenvector.
+              vnorm = zero
+              do i = 1, n
+                 vnorm = max( vnorm, abs( vr( i ) )+abs( vi( i ) ) )
+              end do
+              call stdlib_sscal( n, one / vnorm, vr, 1 )
+              call stdlib_sscal( n, one / vnorm, vi, 1 )
+           end if
+           return
+     end subroutine stdlib_slaein
+
+     !> SLAGV2: computes the Generalized Schur factorization of a real 2-by-2
+     !> matrix pencil (A,B) where B is upper triangular. This routine
+     !> computes orthogonal (rotation) matrices given by CSL, SNL and CSR,
+     !> SNR such that
+     !> 1) if the pencil (A,B) has two real eigenvalues (include 0/0 or 1/0
+     !> types), then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [  0  a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11 b12 ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ],
+     !> 2) if the pencil (A,B) has a pair of complex conjugate eigenvalues,
+     !> then
+     !> [ a11 a12 ] := [  CSL  SNL ] [ a11 a12 ] [  CSR -SNR ]
+     !> [ a21 a22 ]    [ -SNL  CSL ] [ a21 a22 ] [  SNR  CSR ]
+     !> [ b11  0  ] := [  CSL  SNL ] [ b11 b12 ] [  CSR -SNR ]
+     !> [  0  b22 ]    [ -SNL  CSL ] [  0  b22 ] [  SNR  CSR ]
+     !> where b11 >= b22 > 0.
+
+     pure subroutine stdlib_slagv2( a, lda, b, ldb, alphar, alphai, beta, csl, snl,csr, snr )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb
+           real(sp), intent(out) :: csl, csr, snl, snr
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(2), alphar(2), beta(2)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: anorm, ascale, bnorm, bscale, h1, h2, h3, qq, r, rr, safmin, scale1, &
+                     scale2, t, ulp, wi, wr1, wr2
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           safmin = stdlib_slamch( 'S' )
+           ulp = stdlib_slamch( 'P' )
+           ! scale a
+           anorm = max( abs( a( 1, 1 ) )+abs( a( 2, 1 ) ),abs( a( 1, 2 ) )+abs( a( 2, 2 ) ), &
+                     safmin )
+           ascale = one / anorm
+           a( 1, 1 ) = ascale*a( 1, 1 )
+           a( 1, 2 ) = ascale*a( 1, 2 )
+           a( 2, 1 ) = ascale*a( 2, 1 )
+           a( 2, 2 ) = ascale*a( 2, 2 )
+           ! scale b
+           bnorm = max( abs( b( 1, 1 ) ), abs( b( 1, 2 ) )+abs( b( 2, 2 ) ),safmin )
+           bscale = one / bnorm
+           b( 1, 1 ) = bscale*b( 1, 1 )
+           b( 1, 2 ) = bscale*b( 1, 2 )
+           b( 2, 2 ) = bscale*b( 2, 2 )
+           ! check if a can be deflated
+           if( abs( a( 2, 1 ) )<=ulp ) then
+              csl = one
+              snl = zero
+              csr = one
+              snr = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           ! check if b is singular
+           else if( abs( b( 1, 1 ) )<=ulp ) then
+              call stdlib_slartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+              csr = one
+              snr = zero
+              call stdlib_srot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+              call stdlib_srot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+              a( 2, 1 ) = zero
+              b( 1, 1 ) = zero
+              b( 2, 1 ) = zero
+              wi = zero
+           else if( abs( b( 2, 2 ) )<=ulp ) then
+              call stdlib_slartg( a( 2, 2 ), a( 2, 1 ), csr, snr, t )
+              snr = -snr
+              call stdlib_srot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+              call stdlib_srot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+              csl = one
+              snl = zero
+              a( 2, 1 ) = zero
+              b( 2, 1 ) = zero
+              b( 2, 2 ) = zero
+              wi = zero
+           else
+              ! b is nonsingular, first compute the eigenvalues of (a,b)
+              call stdlib_slag2( a, lda, b, ldb, safmin, scale1, scale2, wr1, wr2,wi )
+              if( wi==zero ) then
+                 ! two real eigenvalues, compute s*a-w*b
+                 h1 = scale1*a( 1, 1 ) - wr1*b( 1, 1 )
+                 h2 = scale1*a( 1, 2 ) - wr1*b( 1, 2 )
+                 h3 = scale1*a( 2, 2 ) - wr1*b( 2, 2 )
+                 rr = stdlib_slapy2( h1, h2 )
+                 qq = stdlib_slapy2( scale1*a( 2, 1 ), h3 )
+                 if( rr>qq ) then
+                    ! find right rotation matrix to zero 1,1 element of
+                    ! (sa - wb)
+                    call stdlib_slartg( h2, h1, csr, snr, t )
+                 else
+                    ! find right rotation matrix to zero 2,1 element of
+                    ! (sa - wb)
+                    call stdlib_slartg( h3, scale1*a( 2, 1 ), csr, snr, t )
+                 end if
+                 snr = -snr
+                 call stdlib_srot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_srot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 ! compute inf norms of a and b
+                 h1 = max( abs( a( 1, 1 ) )+abs( a( 1, 2 ) ),abs( a( 2, 1 ) )+abs( a( 2, 2 ) ) )
+                           
+                 h2 = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 ) ) )
+                           
+                 if( ( scale1*h1 )>=abs( wr1 )*h2 ) then
+                    ! find left rotation matrix q to zero out b(2,1)
+                    call stdlib_slartg( b( 1, 1 ), b( 2, 1 ), csl, snl, r )
+                 else
+                    ! find left rotation matrix q to zero out a(2,1)
+                    call stdlib_slartg( a( 1, 1 ), a( 2, 1 ), csl, snl, r )
+                 end if
+                 call stdlib_srot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_srot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 a( 2, 1 ) = zero
+                 b( 2, 1 ) = zero
+              else
+                 ! a pair of complex conjugate eigenvalues
+                 ! first compute the svd of the matrix b
+                 call stdlib_slasv2( b( 1, 1 ), b( 1, 2 ), b( 2, 2 ), r, t, snr,csr, snl, csl )
+                           
+                 ! form (a,b) := q(a,b)z**t where q is left rotation matrix and
+                 ! z is right rotation matrix computed from stdlib_slasv2
+                 call stdlib_srot( 2, a( 1, 1 ), lda, a( 2, 1 ), lda, csl, snl )
+                 call stdlib_srot( 2, b( 1, 1 ), ldb, b( 2, 1 ), ldb, csl, snl )
+                 call stdlib_srot( 2, a( 1, 1 ), 1, a( 1, 2 ), 1, csr, snr )
+                 call stdlib_srot( 2, b( 1, 1 ), 1, b( 1, 2 ), 1, csr, snr )
+                 b( 2, 1 ) = zero
+                 b( 1, 2 ) = zero
+              end if
+           end if
+           ! unscaling
+           a( 1, 1 ) = anorm*a( 1, 1 )
+           a( 2, 1 ) = anorm*a( 2, 1 )
+           a( 1, 2 ) = anorm*a( 1, 2 )
+           a( 2, 2 ) = anorm*a( 2, 2 )
+           b( 1, 1 ) = bnorm*b( 1, 1 )
+           b( 2, 1 ) = bnorm*b( 2, 1 )
+           b( 1, 2 ) = bnorm*b( 1, 2 )
+           b( 2, 2 ) = bnorm*b( 2, 2 )
+           if( wi==zero ) then
+              alphar( 1 ) = a( 1, 1 )
+              alphar( 2 ) = a( 2, 2 )
+              alphai( 1 ) = zero
+              alphai( 2 ) = zero
+              beta( 1 ) = b( 1, 1 )
+              beta( 2 ) = b( 2, 2 )
+           else
+              alphar( 1 ) = anorm*wr1 / scale1 / bnorm
+              alphai( 1 ) = anorm*wi / scale1 / bnorm
+              alphar( 2 ) = alphar( 1 )
+              alphai( 2 ) = -alphai( 1 )
+              beta( 1 ) = one
+              beta( 2 ) = one
+           end if
+           return
+     end subroutine stdlib_slagv2
+
+     !> SLAHR2: reduces the first NB columns of A real general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an orthogonal similarity transformation
+     !> Q**T * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*V**T, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by SGEHRD.
+
+     pure subroutine stdlib_slahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(sp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**t
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-k, i-1, -one, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, one, a( k+1, i ), 1 )
+                 ! apply i - v * t**t * v**t to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**t * b1
+                 call stdlib_scopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_strmv( 'LOWER', 'TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, t( 1, nb ),&
+                            1 )
+                 ! w := w + v2**t * b2
+                 call stdlib_sgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ),lda, a( k+i, i ), &
+                           1, one, t( 1, nb ), 1 )
+                 ! w := t**t * w
+                 call stdlib_strmv( 'UPPER', 'TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, nb ), 1 )
+                           
+                 ! b2 := b2 - v2*w
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -one,a( k+i, 1 ),lda, t( 1, nb )&
+                           , 1, one, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_strmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_saxpy( i-1, -one, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_slarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = one
+              ! compute  y(k+1:n,i)
+              call stdlib_sgemv( 'NO TRANSPOSE', n-k, n-k-i+1,one, a( k+1, i+1 ),lda, a( k+i, i ),&
+                         1, zero, y( k+1, i ), 1 )
+              call stdlib_sgemv( 'TRANSPOSE', n-k-i+1, i-1,one, a( k+i, 1 ), lda,a( k+i, i ), 1, &
+                        zero, t( 1, i ), 1 )
+              call stdlib_sgemv( 'NO TRANSPOSE', n-k, i-1, -one,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        one, y( k+1, i ), 1 )
+              call stdlib_sscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_sscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_strmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_slacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_strmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,one, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, one,a( 1, &
+                     2+nb ), lda, a( k+1+nb, 1 ), lda, one, y,ldy )
+           call stdlib_strmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,one, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_slahr2
+
+     !> SLALN2: solves a system of the form  (ca A - w D ) X = s B
+     !> or (ca A**T - w D) X = s B   with possible scaling ("s") and
+     !> perturbation of A.  (A**T means A-transpose.)
+     !> A is an NA x NA real matrix, ca is a real scalar, D is an NA x NA
+     !> real diagonal matrix, w is a real or complex value, and X and B are
+     !> NA x 1 matrices -- real if w is real, complex if w is complex.  NA
+     !> may be 1 or 2.
+     !> If w is complex, X and B are represented as NA x 2 matrices,
+     !> the first column of each being the real part and the second
+     !> being the imaginary part.
+     !> "s" is a scaling factor (<= 1), computed by SLALN2, which is
+     !> so chosen that X can be computed without overflow.  X is further
+     !> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
+     !> than overflow.
+     !> If both singular values of (ca A - w D) are less than SMIN,
+     !> SMIN*identity will be used instead of (ca A - w D).  If only one
+     !> singular value is less than SMIN, one element of (ca A - w D) will be
+     !> perturbed enough to make the smallest singular value roughly SMIN.
+     !> If both singular values are at least SMIN, (ca A - w D) will not be
+     !> perturbed.  In any case, the perturbation will be at most some small
+     !> multiple of max( SMIN, ulp*norm(ca A - w D) ).  The singular values
+     !> are computed by infinity-norm approximations, and thus will only be
+     !> correct to a factor of 2 or so.
+     !> Note: all input quantities are assumed to be smaller than overflow
+     !> by a reasonable factor.  (See BIGNUM.)
+
+     pure subroutine stdlib_slaln2( ltrans, na, nw, smin, ca, a, lda, d1, d2, b,ldb, wr, wi, x, &
+               ldx, scale, xnorm, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: ltrans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, na, nw
+           real(sp), intent(in) :: ca, d1, d2, smin, wi, wr
+           real(sp), intent(out) :: scale, xnorm
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: x(ldx,*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: icmax, j
+           real(sp) :: bbnd, bi1, bi2, bignum, bnorm, br1, br2, ci21, ci22, cmax, cnorm, cr21, &
+           cr22, csi, csr, li21, lr21, smini, smlnum, temp, u22abs, ui11, ui11r, ui12, ui12s, &
+                     ui22, ur11, ur11r, ur12, ur12s, ur22, xi1, xi2, xr1, xr2
+           ! Local Arrays 
+           logical(lk) :: cswap(4), rswap(4)
+           integer(ilp) :: ipivot(4,4)
+           real(sp) :: ci(2,2), civ(4), cr(2,2), crv(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Equivalences 
+           equivalence        ( ci( 1, 1 ), civ( 1 ) ),( cr( 1, 1 ), crv( 1 ) )
+           ! Data Statements 
+           cswap = [.false.,.false.,.true.,.true.]
+           rswap = [.false.,.true.,.false.,.true.]
+           ipivot = reshape([1,2,3,4,2,1,4,3,3,4,1,2,4,3,2,1],[4,4])
+           ! Executable Statements 
+           ! compute bignum
+           smlnum = two*stdlib_slamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           smini = max( smin, smlnum )
+           ! don't check for input errors
+           info = 0
+           ! standard initializations
+           scale = one
+           if( na==1 ) then
+              ! 1 x 1  (i.e., scalar) system   c x = b
+              if( nw==1 ) then
+                 ! real 1x1 system.
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 cnorm = abs( csr )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 x( 1, 1 ) = ( b( 1, 1 )*scale ) / csr
+                 xnorm = abs( x( 1, 1 ) )
+              else
+                 ! complex 1x1 system (w is complex)
+                 ! c = ca a - w d
+                 csr = ca*a( 1, 1 ) - wr*d1
+                 csi = -wi*d1
+                 cnorm = abs( csr ) + abs( csi )
+                 ! if | c | < smini, use c = smini
+                 if( cnorm<smini ) then
+                    csr = smini
+                    csi = zero
+                    cnorm = smini
+                    info = 1
+                 end if
+                 ! check scaling for  x = b / c
+                 bnorm = abs( b( 1, 1 ) ) + abs( b( 1, 2 ) )
+                 if( cnorm<one .and. bnorm>one ) then
+                    if( bnorm>bignum*cnorm )scale = one / bnorm
+                 end if
+                 ! compute x
+                 call stdlib_sladiv( scale*b( 1, 1 ), scale*b( 1, 2 ), csr, csi,x( 1, 1 ), x( 1, &
+                           2 ) )
+                 xnorm = abs( x( 1, 1 ) ) + abs( x( 1, 2 ) )
+              end if
+           else
+              ! 2x2 system
+              ! compute the realpart of  c = ca a - w d  (or  ca a**t - w d,KIND=sp)
+              cr( 1, 1 ) = ca*a( 1, 1 ) - wr*d1
+              cr( 2, 2 ) = ca*a( 2, 2 ) - wr*d2
+              if( ltrans ) then
+                 cr( 1, 2 ) = ca*a( 2, 1 )
+                 cr( 2, 1 ) = ca*a( 1, 2 )
+              else
+                 cr( 2, 1 ) = ca*a( 2, 1 )
+                 cr( 1, 2 ) = ca*a( 1, 2 )
+              end if
+              if( nw==1 ) then
+                 ! real2x2 system  (w is real,KIND=sp)
+                 ! find the largest element in c
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )>cmax ) then
+                       cmax = abs( crv( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) ), abs( b( 2, 1 ) ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ur11r = one / ur11
+                 lr21 = ur11r*cr21
+                 ur22 = cr22 - ur12*lr21
+                 ! if smaller pivot < smini, use smini
+                 if( abs( ur22 )<smini ) then
+                    ur22 = smini
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br1 = b( 2, 1 )
+                    br2 = b( 1, 1 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                 end if
+                 br2 = br2 - lr21*br1
+                 bbnd = max( abs( br1*( ur22*ur11r ) ), abs( br2 ) )
+                 if( bbnd>one .and. abs( ur22 )<one ) then
+                    if( bbnd>=bignum*abs( ur22 ) )scale = one / bbnd
+                 end if
+                 xr2 = ( br2*scale ) / ur22
+                 xr1 = ( scale*br1 )*ur11r - xr2*( ur11r*ur12 )
+                 if( cswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                 end if
+                 xnorm = max( abs( xr1 ), abs( xr2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              else
+                 ! complex 2x2 system  (w is complex)
+                 ! find the largest element in c
+                 ci( 1, 1 ) = -wi*d1
+                 ci( 2, 1 ) = zero
+                 ci( 1, 2 ) = zero
+                 ci( 2, 2 ) = -wi*d2
+                 cmax = zero
+                 icmax = 0
+                 do j = 1, 4
+                    if( abs( crv( j ) )+abs( civ( j ) )>cmax ) then
+                       cmax = abs( crv( j ) ) + abs( civ( j ) )
+                       icmax = j
+                    end if
+                 end do
+                 ! if norm(c) < smini, use smini*identity.
+                 if( cmax<smini ) then
+                    bnorm = max( abs( b( 1, 1 ) )+abs( b( 1, 2 ) ),abs( b( 2, 1 ) )+abs( b( 2, 2 )&
+                               ) )
+                    if( smini<one .and. bnorm>one ) then
+                       if( bnorm>bignum*smini )scale = one / bnorm
+                    end if
+                    temp = scale / smini
+                    x( 1, 1 ) = temp*b( 1, 1 )
+                    x( 2, 1 ) = temp*b( 2, 1 )
+                    x( 1, 2 ) = temp*b( 1, 2 )
+                    x( 2, 2 ) = temp*b( 2, 2 )
+                    xnorm = temp*bnorm
+                    info = 1
+                    return
+                 end if
+                 ! gaussian elimination with complete pivoting.
+                 ur11 = crv( icmax )
+                 ui11 = civ( icmax )
+                 cr21 = crv( ipivot( 2, icmax ) )
+                 ci21 = civ( ipivot( 2, icmax ) )
+                 ur12 = crv( ipivot( 3, icmax ) )
+                 ui12 = civ( ipivot( 3, icmax ) )
+                 cr22 = crv( ipivot( 4, icmax ) )
+                 ci22 = civ( ipivot( 4, icmax ) )
+                 if( icmax==1 .or. icmax==4 ) then
+                    ! code when off-diagonals of pivoted c are real
+                    if( abs( ur11 )>abs( ui11 ) ) then
+                       temp = ui11 / ur11
+                       ur11r = one / ( ur11*( one+temp**2 ) )
+                       ui11r = -temp*ur11r
+                    else
+                       temp = ur11 / ui11
+                       ui11r = -one / ( ui11*( one+temp**2 ) )
+                       ur11r = -temp*ui11r
+                    end if
+                    lr21 = cr21*ur11r
+                    li21 = cr21*ui11r
+                    ur12s = ur12*ur11r
+                    ui12s = ur12*ui11r
+                    ur22 = cr22 - ur12*lr21
+                    ui22 = ci22 - ur12*li21
+                 else
+                    ! code when diagonals of pivoted c are real
+                    ur11r = one / ur11
+                    ui11r = zero
+                    lr21 = cr21*ur11r
+                    li21 = ci21*ur11r
+                    ur12s = ur12*ur11r
+                    ui12s = ui12*ur11r
+                    ur22 = cr22 - ur12*lr21 + ui12*li21
+                    ui22 = -ur12*li21 - ui12*lr21
+                 end if
+                 u22abs = abs( ur22 ) + abs( ui22 )
+                 ! if smaller pivot < smini, use smini
+                 if( u22abs<smini ) then
+                    ur22 = smini
+                    ui22 = zero
+                    info = 1
+                 end if
+                 if( rswap( icmax ) ) then
+                    br2 = b( 1, 1 )
+                    br1 = b( 2, 1 )
+                    bi2 = b( 1, 2 )
+                    bi1 = b( 2, 2 )
+                 else
+                    br1 = b( 1, 1 )
+                    br2 = b( 2, 1 )
+                    bi1 = b( 1, 2 )
+                    bi2 = b( 2, 2 )
+                 end if
+                 br2 = br2 - lr21*br1 + li21*bi1
+                 bi2 = bi2 - li21*br1 - lr21*bi1
+                 bbnd = max( ( abs( br1 )+abs( bi1 ) )*( u22abs*( abs( ur11r )+abs( ui11r ) ) ),&
+                           abs( br2 )+abs( bi2 ) )
+                 if( bbnd>one .and. u22abs<one ) then
+                    if( bbnd>=bignum*u22abs ) then
+                       scale = one / bbnd
+                       br1 = scale*br1
+                       bi1 = scale*bi1
+                       br2 = scale*br2
+                       bi2 = scale*bi2
+                    end if
+                 end if
+                 call stdlib_sladiv( br2, bi2, ur22, ui22, xr2, xi2 )
+                 xr1 = ur11r*br1 - ui11r*bi1 - ur12s*xr2 + ui12s*xi2
+                 xi1 = ui11r*br1 + ur11r*bi1 - ui12s*xr2 - ur12s*xi2
+                 if( cswap( icmax ) ) then
+                    x( 1, 1 ) = xr2
+                    x( 2, 1 ) = xr1
+                    x( 1, 2 ) = xi2
+                    x( 2, 2 ) = xi1
+                 else
+                    x( 1, 1 ) = xr1
+                    x( 2, 1 ) = xr2
+                    x( 1, 2 ) = xi1
+                    x( 2, 2 ) = xi2
+                 end if
+                 xnorm = max( abs( xr1 )+abs( xi1 ), abs( xr2 )+abs( xi2 ) )
+                 ! further scaling if  norm(a) norm(x) > overflow
+                 if( xnorm>one .and. cmax>one ) then
+                    if( xnorm>bignum / cmax ) then
+                       temp = cmax / bignum
+                       x( 1, 1 ) = temp*x( 1, 1 )
+                       x( 2, 1 ) = temp*x( 2, 1 )
+                       x( 1, 2 ) = temp*x( 1, 2 )
+                       x( 2, 2 ) = temp*x( 2, 2 )
+                       xnorm = temp*xnorm
+                       scale = temp*scale
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_slaln2
+
+     !> SLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_slals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(sp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(out) :: bx(ldbx,*), work(*)
+           real(sp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, m, n, nlp1
+           real(sp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_srot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_scopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_scopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_scopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_sscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_50: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       work( j ) = zero
+                    else
+                       work( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_slamc3( poles( i, 2 ), dsigj &
+                                    )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = poles( i, 2 )*z( i ) /( stdlib_slamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    work( 1 ) = negone
+                    temp = stdlib_snrm2( k, work, 1 )
+                    call stdlib_sgemv( 'T', k, nrhs, one, bx, ldbx, work, 1, zero,b( j, 1 ), ldb )
+                              
+                    call stdlib_slascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_50
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_slacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_scopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       work( j ) = zero
+                    else
+                       work( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_slamc3( dsigj, -poles( i+1,2 ) )-difr( i, &
+                                    1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          work( i ) = zero
+                       else
+                          work( i ) = z( j ) / ( stdlib_slamc3( dsigj, -poles( i,2 ) )-difl( i ) )&
+                                     /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    call stdlib_sgemv( 'T', k, nrhs, one, b, ldb, work, 1, zero,bx( j, 1 ), ldbx )
+                              
+                 end do
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_scopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_srot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_slacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb, bx( k+1, 1 &
+                        ),ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_scopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_scopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_scopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_srot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb, &
+                           givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_slals0
+
+     !> SLAMSWLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (SLASWLQ)
+
+     pure subroutine stdlib_slamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), t(ldt,*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_sgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_stpmlqt('L','T',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_stpmlqt('L','T',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1),ldt, c(1,&
+                           1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_sgemlqt('L','T',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(nb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_sgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_stpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr * k+1), ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_stpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(1,&
+                         1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_stpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                  ctr = ctr - 1
+                  call stdlib_stpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1,ctr*k+1), ldt, &
+                            c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_sgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_sgemlqt('R','T',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_stpmlqt('R','T',m , nb-k, k, 0,mb, a(1,i), lda,t(1, ctr*k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_stpmlqt('R','T',m , kk, k, 0,mb, a(1,ii), lda,t(1,ctr*k+1),ldt, c(1,1),&
+                          ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_slamswlq
+
+     !> SLAMTSQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (SLATSQR)
+
+     pure subroutine stdlib_slamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), t(ldt,*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = mb * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_sgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_stpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1,ctr*k+1),ldt , c(1,&
+                          1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_stpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1), ldt,&
+                           c(1,1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_sgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_sgemqrt('L','T',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_stpmqrt('L','T',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_stpmqrt('L','T',kk , n, k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_stpmqrt('R','T',m , kk, k, 0, nb, a(ii,1), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_stpmqrt('R','T',m , mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_sgemqrt('R','T',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_sgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_stpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_stpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_slamtsqr
+
+     !> SLANV2: computes the Schur factorization of a real 2-by-2 nonsymmetric
+     !> matrix in standard form:
+     !> [ A  B ] = [ CS -SN ] [ AA  BB ] [ CS  SN ]
+     !> [ C  D ]   [ SN  CS ] [ CC  DD ] [-SN  CS ]
+     !> where either
+     !> 1) CC = 0 so that AA and DD are real eigenvalues of the matrix, or
+     !> 2) AA = DD and BB*CC < 0, so that AA + or - sqrt(BB*CC) are complex
+     !> conjugate eigenvalues.
+
+     pure subroutine stdlib_slanv2( a, b, c, d, rt1r, rt1i, rt2r, rt2i, cs, sn )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(inout) :: a, b, c, d
+           real(sp), intent(out) :: cs, rt1i, rt1r, rt2i, rt2r, sn
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: multpl = 4.0e+0_sp
+           
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, bcmax, bcmis, cc, cs1, dd, eps, p, sab, sac, scale, sigma, sn1, &
+                     tau, temp, z, safmin, safmn2, safmx2
+           integer(ilp) :: count
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sign,sqrt
+           ! Executable Statements 
+           safmin = stdlib_slamch( 'S' )
+           eps = stdlib_slamch( 'P' )
+           safmn2 = stdlib_slamch( 'B' )**int( log( safmin / eps ) /log( stdlib_slamch( 'B' ) ) / &
+                     two,KIND=ilp)
+           safmx2 = one / safmn2
+           if( c==zero ) then
+              cs = one
+              sn = zero
+           else if( b==zero ) then
+              ! swap rows and columns
+              cs = zero
+              sn = one
+              temp = d
+              d = a
+              a = temp
+              b = -c
+              c = zero
+           else if( (a-d)==zero .and. sign( one, b )/=sign( one, c ) ) then
+              cs = one
+              sn = zero
+           else
+              temp = a - d
+              p = half*temp
+              bcmax = max( abs( b ), abs( c ) )
+              bcmis = min( abs( b ), abs( c ) )*sign( one, b )*sign( one, c )
+              scale = max( abs( p ), bcmax )
+              z = ( p / scale )*p + ( bcmax / scale )*bcmis
+              ! if z is of the order of the machine accuracy, postpone the
+              ! decision on the nature of eigenvalues
+              if( z>=multpl*eps ) then
+                 ! real eigenvalues. compute a and d.
+                 z = p + sign( sqrt( scale )*sqrt( z ), p )
+                 a = d + z
+                 d = d - ( bcmax / z )*bcmis
+                 ! compute b and the rotation matrix
+                 tau = stdlib_slapy2( c, z )
+                 cs = z / tau
+                 sn = c / tau
+                 b = b - c
+                 c = zero
+              else
+                 ! complex eigenvalues, or real(almost,KIND=sp) equal eigenvalues.
+                 ! make diagonal elements equal.
+                 count = 0
+                 sigma = b + c
+                 10 continue
+                 count = count + 1
+                 scale = max( abs(temp), abs(sigma) )
+                 if( scale>=safmx2 ) then
+                    sigma = sigma * safmn2
+                    temp = temp * safmn2
+                    if (count <= 20)goto 10
+                 end if
+                 if( scale<=safmn2 ) then
+                    sigma = sigma * safmx2
+                    temp = temp * safmx2
+                    if (count <= 20)goto 10
+                 end if
+                 p = half*temp
+                 tau = stdlib_slapy2( sigma, temp )
+                 cs = sqrt( half*( one+abs( sigma ) / tau ) )
+                 sn = -( p / ( tau*cs ) )*sign( one, sigma )
+                 ! compute [ aa  bb ] = [ a  b ] [ cs -sn ]
+                         ! [ cc  dd ]   [ c  d ] [ sn  cs ]
+                 aa = a*cs + b*sn
+                 bb = -a*sn + b*cs
+                 cc = c*cs + d*sn
+                 dd = -c*sn + d*cs
+                 ! compute [ a  b ] = [ cs  sn ] [ aa  bb ]
+                         ! [ c  d ]   [-sn  cs ] [ cc  dd ]
+                 a = aa*cs + cc*sn
+                 b = bb*cs + dd*sn
+                 c = -aa*sn + cc*cs
+                 d = -bb*sn + dd*cs
+                 temp = half*( a+d )
+                 a = temp
+                 d = temp
+                 if( c/=zero ) then
+                    if( b/=zero ) then
+                       if( sign( one, b )==sign( one, c ) ) then
+                          ! real eigenvalues: reduce to upper triangular form
+                          sab = sqrt( abs( b ) )
+                          sac = sqrt( abs( c ) )
+                          p = sign( sab*sac, c )
+                          tau = one / sqrt( abs( b+c ) )
+                          a = temp + p
+                          d = temp - p
+                          b = b - c
+                          c = zero
+                          cs1 = sab*tau
+                          sn1 = sac*tau
+                          temp = cs*cs1 - sn*sn1
+                          sn = cs*sn1 + sn*cs1
+                          cs = temp
+                       end if
+                    else
+                       b = -c
+                       c = zero
+                       temp = cs
+                       cs = -sn
+                       sn = temp
+                    end if
+                 end if
+              end if
+           end if
+           ! store eigenvalues in (rt1r,rt1i) and (rt2r,rt2i).
+           rt1r = a
+           rt2r = d
+           if( c==zero ) then
+              rt1i = zero
+              rt2i = zero
+           else
+              rt1i = sqrt( abs( b ) )*sqrt( abs( c ) )
+              rt2i = -rt1i
+           end if
+           return
+     end subroutine stdlib_slanv2
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_slapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(sp), intent(out) :: ssmin
+           ! Array Arguments 
+           real(sp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: a11, a12, a22, c, ssmax, tau
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_slarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = one
+           c = -tau*stdlib_sdot( n, x, incx, y, incy )
+           call stdlib_saxpy( n, c, x, incx, y, incy )
+           call stdlib_slarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_slas2( a11, a12, a22, ssmin, ssmax )
+           return
+     end subroutine stdlib_slapll
+
+     !> SLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_slaqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: a(lda,*), vn1(*), vn2(*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(sp) :: aii, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_slamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_isamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_sswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_slarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_slarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**t to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = one
+                 call stdlib_slarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,tau( i ), a( offpi, &
+                           i+1 ), lda, work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_snrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_slaqp2
+
+     !> SLAQPS: computes a step of QR factorization with column pivoting
+     !> of a real M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_slaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*), vn1(*), vn2(*)
+           real(sp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(sp) :: akk, temp, temp2, tol3z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,nint,real,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_slamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_isamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_sswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_sswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**t.
+              if( k>1 ) then
+                 call stdlib_sgemv( 'NO TRANSPOSE', m-rk+1, k-1, -one, a( rk, 1 ),lda, f( k, 1 ), &
+                           ldf, one, a( rk, k ), 1 )
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_slarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_slarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = one
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**t*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_sgemv( 'TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), lda, a( rk, &
+                           k ), 1, zero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = zero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**t
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_sgemv( 'TRANSPOSE', m-rk+1, k-1, -tau( k ), a( rk, 1 ),lda, a( rk, k &
+                           ), 1, zero, auxv( 1 ), 1 )
+                 call stdlib_sgemv( 'NO TRANSPOSE', n, k-1, one, f( 1, 1 ), ldf,auxv( 1 ), 1, one,&
+                            f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**t.
+              if( k<n ) then
+                 call stdlib_sgemv( 'NO TRANSPOSE', n-k, k, -one, f( k+1, 1 ), ldf,a( rk, 1 ), &
+                           lda, one, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=sp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**t.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-rk, n-kb, kb, -one,a( rk+1, 1 ), &
+                        lda, f( kb+1, 1 ), ldf, one,a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           40 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_snrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_snrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_dlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 40
+           end if
+           return
+     end subroutine stdlib_slaqps
+
+     !> SLAQR5:, called by SLAQR0, performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_slaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts,sr, si, h, ldh, &
+               iloz, ihiz, z, ldz, v, ldv, u,ldu, nv, wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), si(*), sr(*), z(ldz,*)
+           real(sp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(sp) :: alpha, beta, h11, h12, h21, h22, refsum, safmax, safmin, scl, smlnum, swap,&
+                      tst1, tst2, ulp
+           integer(ilp) :: i, i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, &
+                     krcol, m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,mod,real
+           ! Local Arrays 
+           real(sp) :: vt(3)
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== shuffle shifts into pairs of real shifts and pairs
+           ! .    of complex conjugate shifts assuming complex
+           ! .    conjugate shifts are already adjacent to one
+           ! .    another. ====
+           do i = 1, nshfts - 2, 2
+              if( si( i )/=-si( i+1 ) ) then
+                 swap = sr( i )
+                 sr( i ) = sr( i+1 )
+                 sr( i+1 ) = sr( i+2 )
+                 sr( i+2 ) = swap
+                 swap = si( i )
+                 si( i ) = si( i+1 )
+                 si( i+1 ) = si( i+2 )
+                 si( i+2 ) = swap
+              end if
+           end do
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by one.  the shuffle above
+           ! .    ensures that the dropped shift is real and that
+           ! .    the remaining shifts are paired. ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = zero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_slaset( 'ALL', kdu, kdu, zero, one, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps-1 columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    one column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_slaqr1( 2, h( k+1, k+1 ), ldh, sr( 2*m22-1 ),si( 2*m22-1 ), sr(&
+                                  2*m22 ), si( 2*m22 ),v( 1, m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_slarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_slarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = zero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = v( 1, m22 )*( h( k+1, j )+v( 2, m22 )*h( k+2, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop ) then
+                       if( h( k+1, k )/=zero ) then
+                          tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                          if( tst1==zero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                          end if
+                          if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                             h12 = max( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h21 = min( abs( h( k+1, k ) ),abs( h( k, k+1 ) ) )
+                             h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                       then
+                                h( k+1, k ) = zero
+                             end if
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m22 )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m22 )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_slaqr1( 3, h( ktop, ktop ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                 2*m ), si( 2*m ),v( 1, m ) )
+                       alpha = v( 1, m )
+                       call stdlib_slarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually zero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*v( 2, m )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) - refsum*v( 3, m )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge one step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_slarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=zero .or. h( k+3, k+1 )/=zero .or. h( k+3, k+2 )==zero ) &
+                                 then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = zero
+                          h( k+3, k ) = zero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new one. ====
+                          call stdlib_slaqr1( 3, h( k+1, k+1 ), ldh, sr( 2*m-1 ),si( 2*m-1 ), sr( &
+                                    2*m ), si( 2*m ),vt )
+                          alpha = vt( 1 )
+                          call stdlib_slarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = vt( 1 )*( h( k+1, k )+vt( 2 )*h( k+2, k ) )
+                          if( abs( h( k+2, k )-refsum*vt( 2 ) )+abs( refsum*vt( 3 ) )>ulp*( abs( &
+                                    h( k, k ) )+abs( h( k+1,k+1 ) )+abs( h( k+2, k+2 ) ) ) ) then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old one with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new one. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = zero
+                             h( k+3, k ) = zero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) - refsum*v( 2, m )
+                       h( j, k+3 ) = h( j, k+3 ) - refsum*v( 3, m )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum = v( 1, m )*( h( k+1, k+1 )+v( 2, m )*h( k+2, k+1 )+v( 3, m )*h( k+3, &
+                              k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is zero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=zero ) then
+                       tst1 = abs( h( k, k ) ) + abs( h( k+1, k+1 ) )
+                       if( tst1==zero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + abs( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + abs( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + abs( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + abs( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + abs( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + abs( h( k+4, k+1 ) )
+                       end if
+                       if( abs( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h21 = min( abs( h( k+1, k ) ), abs( h( k, k+1 ) ) )
+                          h11 = max( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          h22 = min( abs( h( k+1, k+1 ) ),abs( h( k, k )-h( k+1, k+1 ) ) )
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==zero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) ) &
+                                    then
+                             h( k+1, k ) = zero
+                          end if
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = v( 1, m )*( h( k+1, j )+v( 2, m )*h( k+2, j )+v( 3, m )*h( k+3, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) - refsum*v( 2, m )
+                          u( j, kms+3 ) = u( j, kms+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) - refsum*v( 2, m )
+                          z( j, k+3 ) = z( j, k+3 ) - refsum*v( 3, m )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_sgemm( 'C', 'N', nu, jlen, nu, one, u( k1, k1 ),ldu, h( incol+k1, &
+                              jcol ), ldh, zero, wh,ldwh )
+                    call stdlib_slacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_sgemm( 'N', 'N', jlen, nu, nu, one,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, zero, wv, ldwv )
+                    call stdlib_slacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_sgemm( 'N', 'N', jlen, nu, nu, one,z( jrow, incol+k1 ), ldz, u(&
+                                  k1, k1 ),ldu, zero, wv, ldwv )
+                       call stdlib_slacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_slaqr5
+
+     !> SLAQTR: solves the real quasi-triangular system
+     !> op(T)*p = scale*c,               if LREAL = .TRUE.
+     !> or the complex quasi-triangular systems
+     !> op(T + iB)*(p+iq) = scale*(c+id),  if LREAL = .FALSE.
+     !> in real arithmetic, where T is upper quasi-triangular.
+     !> If LREAL = .FALSE., then the first diagonal block of T must be
+     !> 1 by 1, B is the specially structured matrix
+     !> B = [ b(1) b(2) ... b(n) ]
+     !> [       w            ]
+     !> [           w        ]
+     !> [              .     ]
+     !> [                 w  ]
+     !> op(A) = A or A**T, A**T denotes the transpose of
+     !> matrix A.
+     !> On input, X = [ c ].  On output, X = [ p ].
+     !> [ d ]                  [ q ]
+     !> This subroutine is designed for the condition number estimation
+     !> in routine STRSNA.
+
+     subroutine stdlib_slaqtr( ltran, lreal, n, t, ldt, b, w, scale, x, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: lreal, ltran
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldt, n
+           real(sp), intent(out) :: scale
+           real(sp), intent(in) :: w
+           ! Array Arguments 
+           real(sp), intent(in) :: b(*), t(ldt,*)
+           real(sp), intent(out) :: work(*)
+           real(sp), intent(inout) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, j1, j2, jnext, k, n1, n2
+           real(sp) :: bignum, eps, rec, scaloc, si, smin, sminw, smlnum, sr, tjj, tmp, xj, xmax, &
+                     xnorm, z
+           ! Local Arrays 
+           real(sp) :: d(2,2), v(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! do not test the input parameters for errors
+           notran = .not.ltran
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           xnorm = stdlib_slange( 'M', n, n, t, ldt, d )
+           if( .not.lreal )xnorm = max( xnorm, abs( w ), stdlib_slange( 'M', n, 1, b, n, d ) )
+                     
+           smin = max( smlnum, eps*xnorm )
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = stdlib_sasum( j-1, t( 1, j ), 1 )
+           end do
+           if( .not.lreal ) then
+              do i = 2, n
+                 work( i ) = work( i ) + abs( b( i ) )
+              end do
+           end if
+           n2 = 2*n
+           n1 = n
+           if( .not.lreal )n1 = n2
+           k = stdlib_isamax( n1, x, 1 )
+           xmax = abs( x( k ) )
+           scale = one
+           if( xmax>bignum ) then
+              scale = bignum / xmax
+              call stdlib_sscal( n1, scale, x, 1 )
+              xmax = bignum
+           end if
+           if( lreal ) then
+              if( notran ) then
+                 ! solve t*p = scale*c
+                 jnext = n
+                 loop_30: do j = n, 1, -1
+                    if( j>jnext )cycle loop_30
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! meet 1 by 1 diagonal block
+                       ! scale to avoid overflow when computing
+                           ! x(j) = b(j)/t(j,j)
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_30
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xj = abs( x( j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_saxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          k = stdlib_isamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       ! call 2 by 2 linear system solve, to take
+                       ! care of possible overflow by scaling factor.
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       call stdlib_slaln2( .false., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_sscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       ! scale v(1,1) (= x(j1)) and/or v(2,1) (=x(j2))
+                       ! to avoid overflow in updating right-hand side.
+                       xj = max( abs( v( 1, 1 ) ), abs( v( 2, 1 ) ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update right-hand side
+                       if( j1>1 ) then
+                          call stdlib_saxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_saxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          k = stdlib_isamax( j1-1, x, 1 )
+                          xmax = abs( x( k ) )
+                       end if
+                    end if
+                 end do loop_30
+              else
+                 ! solve t**t*p = scale*c
+                 jnext = 1
+                 loop_40: do j = 1, n
+                    if( j<jnext )cycle loop_40
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       xj = abs( x( j1 ) )
+                       tjj = abs( t( j1, j1 ) )
+                       tmp = t( j1, j1 )
+                       if( tjj<smin ) then
+                          tmp = smin
+                          tjj = smin
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) / tmp
+                       xmax = max( xmax, abs( x( j1 ) ) )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side elements by inner product.
+                       xj = max( abs( x( j1 ) ), abs( x( j2 ) ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j2 ), work( j1 ) )>( bignum-xj )*rec ) then
+                             call stdlib_sscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_sdot( j1-1, t( 1, j2 ), 1, x,1 )
+                       call stdlib_slaln2( .true., 2, 1, smin, one, t( j1, j1 ),ldt, one, one, d, &
+                                 2, zero, zero, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_sscal( n, scaloc, x, 1 )
+                          scale = scale*scaloc
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       xmax = max( abs( x( j1 ) ), abs( x( j2 ) ), xmax )
+                    end if
+                 end do loop_40
+              end if
+           else
+              sminw = max( eps*abs( w ), smin )
+              if( notran ) then
+                 ! solve (t + ib)*(p+iq) = c+id
+                 jnext = n
+                 loop_70: do j = n, 1, -1
+                    if( j>jnext )cycle loop_70
+                    j1 = j
+                    j2 = j
+                    jnext = j - 1
+                    if( j>1 ) then
+                       if( t( j, j-1 )/=zero ) then
+                          j1 = j - 1
+                          jnext = j - 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in division
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( xj==zero )cycle loop_70
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_sladiv( x( j1 ), x( n+j1 ), tmp, z, sr, si )
+                       x( j1 ) = sr
+                       x( n+j1 ) = si
+                       xj = abs( x( j1 ) ) + abs( x( n+j1 ) )
+                       ! scale x if necessary to avoid overflow when adding a
+                       ! multiple of column j1 of t.
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( work( j1 )>( bignum-xmax )*rec ) then
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       if( j1>1 ) then
+                          call stdlib_saxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_saxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( xmax, abs( x( k ) )+abs( x( k+n ) ) )
+                          end do
+                       end if
+                    else
+                       ! meet 2 by 2 diagonal block
+                       d( 1, 1 ) = x( j1 )
+                       d( 2, 1 ) = x( j2 )
+                       d( 1, 2 ) = x( n+j1 )
+                       d( 2, 2 ) = x( n+j2 )
+                       call stdlib_slaln2( .false., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, &
+                                 d, 2, zero, -w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_sscal( 2*n, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       ! scale x(j1), .... to avoid overflow in
+                       ! updating right hand side.
+                       xj = max( abs( v( 1, 1 ) )+abs( v( 1, 2 ) ),abs( v( 2, 1 ) )+abs( v( 2, 2 )&
+                                  ) )
+                       if( xj>one ) then
+                          rec = one / xj
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xmax )*rec ) then
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                          end if
+                       end if
+                       ! update the right-hand side.
+                       if( j1>1 ) then
+                          call stdlib_saxpy( j1-1, -x( j1 ), t( 1, j1 ), 1, x, 1 )
+                          call stdlib_saxpy( j1-1, -x( j2 ), t( 1, j2 ), 1, x, 1 )
+                          call stdlib_saxpy( j1-1, -x( n+j1 ), t( 1, j1 ), 1,x( n+1 ), 1 )
+                          call stdlib_saxpy( j1-1, -x( n+j2 ), t( 1, j2 ), 1,x( n+1 ), 1 )
+                          x( 1 ) = x( 1 ) + b( j1 )*x( n+j1 ) +b( j2 )*x( n+j2 )
+                          x( n+1 ) = x( n+1 ) - b( j1 )*x( j1 ) -b( j2 )*x( j2 )
+                          xmax = zero
+                          do k = 1, j1 - 1
+                             xmax = max( abs( x( k ) )+abs( x( k+n ) ),xmax )
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! solve (t + ib)**t*(p+iq) = c+id
+                 jnext = 1
+                 loop_80: do j = 1, n
+                    if( j<jnext )cycle loop_80
+                    j1 = j
+                    j2 = j
+                    jnext = j + 1
+                    if( j<n ) then
+                       if( t( j+1, j )/=zero ) then
+                          j2 = j + 1
+                          jnext = j + 2
+                       end if
+                    end if
+                    if( j1==j2 ) then
+                       ! 1 by 1 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( work( j1 )>( bignum-xj )*rec ) then
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j1 ) = x( j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1, x, 1 )
+                       x( n+j1 ) = x( n+j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       if( j1>1 ) then
+                          x( j1 ) = x( j1 ) - b( j1 )*x( n+1 )
+                          x( n+j1 ) = x( n+j1 ) + b( j1 )*x( 1 )
+                       end if
+                       xj = abs( x( j1 ) ) + abs( x( j1+n ) )
+                       z = w
+                       if( j1==1 )z = b( 1 )
+                       ! scale if necessary to avoid overflow in
+                       ! complex division
+                       tjj = abs( t( j1, j1 ) ) + abs( z )
+                       tmp = t( j1, j1 )
+                       if( tjj<sminw ) then
+                          tmp = sminw
+                          tjj = sminw
+                          info = 1
+                       end if
+                       if( tjj<one ) then
+                          if( xj>bignum*tjj ) then
+                             rec = one / xj
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       call stdlib_sladiv( x( j1 ), x( n+j1 ), tmp, -z, sr, si )
+                       x( j1 ) = sr
+                       x( j1+n ) = si
+                       xmax = max( abs( x( j1 ) )+abs( x( j1+n ) ), xmax )
+                    else
+                       ! 2 by 2 diagonal block
+                       ! scale if necessary to avoid overflow in forming the
+                       ! right-hand side element by inner product.
+                       xj = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) ) )
+                                 
+                       if( xmax>one ) then
+                          rec = one / xmax
+                          if( max( work( j1 ), work( j2 ) )>( bignum-xj ) / xmax ) then
+                             call stdlib_sscal( n2, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       d( 1, 1 ) = x( j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1, x,1 )
+                       d( 2, 1 ) = x( j2 ) - stdlib_sdot( j1-1, t( 1, j2 ), 1, x,1 )
+                       d( 1, 2 ) = x( n+j1 ) - stdlib_sdot( j1-1, t( 1, j1 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 2, 2 ) = x( n+j2 ) - stdlib_sdot( j1-1, t( 1, j2 ), 1,x( n+1 ), 1 )
+                                 
+                       d( 1, 1 ) = d( 1, 1 ) - b( j1 )*x( n+1 )
+                       d( 2, 1 ) = d( 2, 1 ) - b( j2 )*x( n+1 )
+                       d( 1, 2 ) = d( 1, 2 ) + b( j1 )*x( 1 )
+                       d( 2, 2 ) = d( 2, 2 ) + b( j2 )*x( 1 )
+                       call stdlib_slaln2( .true., 2, 2, sminw, one, t( j1, j1 ),ldt, one, one, d,&
+                                  2, zero, w, v, 2,scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 2
+                       if( scaloc/=one ) then
+                          call stdlib_sscal( n2, scaloc, x, 1 )
+                          scale = scaloc*scale
+                       end if
+                       x( j1 ) = v( 1, 1 )
+                       x( j2 ) = v( 2, 1 )
+                       x( n+j1 ) = v( 1, 2 )
+                       x( n+j2 ) = v( 2, 2 )
+                       xmax = max( abs( x( j1 ) )+abs( x( n+j1 ) ),abs( x( j2 ) )+abs( x( n+j2 ) )&
+                                 , xmax )
+                    end if
+                 end do loop_80
+              end if
+           end if
+           return
+     end subroutine stdlib_slaqtr
+
+     !> SLASD3: finds all the square roots of the roots of the secular
+     !> equation, as defined by the values in D and Z.  It makes the
+     !> appropriate calls to SLASD4 and then updates the singular
+     !> vectors by matrix multiplication.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> SLASD3 is called from SLASD1.
+
+     pure subroutine stdlib_slasd3( nl, nr, sqre, k, d, q, ldq, dsigma, u, ldu, u2,ldu2, vt, ldvt,&
+                vt2, ldvt2, idxc, ctot, z,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), idxc(*)
+           real(sp), intent(out) :: d(*), q(ldq,*), u(ldu,*), vt(ldvt,*)
+           real(sp), intent(inout) :: dsigma(*), vt2(ldvt2,*), z(*)
+           real(sp), intent(in) :: u2(ldu2,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ctemp, i, j, jc, ktemp, m, n, nlp1, nlp2, nrp1
+           real(sp) :: rho, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           if( ( k<1 ) .or. ( k>n ) ) then
+              info = -4
+           else if( ldq<k ) then
+              info = -7
+           else if( ldu<n ) then
+              info = -10
+           else if( ldu2<n ) then
+              info = -12
+           else if( ldvt<m ) then
+              info = -14
+           else if( ldvt2<m ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==1 ) then
+              d( 1 ) = abs( z( 1 ) )
+              call stdlib_scopy( m, vt2( 1, 1 ), ldvt2, vt( 1, 1 ), ldvt )
+              if( z( 1 )>zero ) then
+                 call stdlib_scopy( n, u2( 1, 1 ), 1, u( 1, 1 ), 1 )
+              else
+                 do i = 1, n
+                    u( i, 1 ) = -u2( i, 1 )
+                 end do
+              end if
+              return
+           end if
+           ! modify values dsigma(i) to make sure all dsigma(i)-dsigma(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dsigma(i) by 2*dsigma(i)-dsigma(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dsigma(i) if it is 1; this makes the subsequent
+           ! subtractions dsigma(i)-dsigma(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dsigma(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dsigma(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dsigma(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dsigma( i ) = stdlib_slamc3( dsigma( i ), dsigma( i ) ) - dsigma( i )
+           end do
+           ! keep a copy of z.
+           call stdlib_scopy( k, z, 1, q, 1 )
+           ! normalize z.
+           rho = stdlib_snrm2( k, z, 1 )
+           call stdlib_slascl( 'G', 0, 0, rho, one, k, 1, z, k, info )
+           rho = rho*rho
+           ! find the new singular values.
+           do j = 1, k
+              call stdlib_slasd4( k, j, dsigma, z, u( 1, j ), rho, d( j ),vt( 1, j ), info )
+                        
+              ! if the zero finder fails, report the convergence failure.
+              if( info/=0 ) then
+                 return
+              end if
+           end do
+           ! compute updated z.
+           do i = 1, k
+              z( i ) = u( i, k )*vt( i, k )
+              do j = 1, i - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j ) ) /( dsigma( i &
+                           )+dsigma( j ) ) )
+              end do
+              do j = i, k - 1
+                 z( i ) = z( i )*( u( i, j )*vt( i, j ) /( dsigma( i )-dsigma( j+1 ) ) /( dsigma( &
+                           i )+dsigma( j+1 ) ) )
+              end do
+              z( i ) = sign( sqrt( abs( z( i ) ) ), q( i, 1 ) )
+           end do
+           ! compute left singular vectors of the modified diagonal matrix,
+           ! and store related information for the right singular vectors.
+           do i = 1, k
+              vt( 1, i ) = z( 1 ) / u( 1, i ) / vt( 1, i )
+              u( 1, i ) = negone
+              do j = 2, k
+                 vt( j, i ) = z( j ) / u( j, i ) / vt( j, i )
+                 u( j, i ) = dsigma( j )*vt( j, i )
+              end do
+              temp = stdlib_snrm2( k, u( 1, i ), 1 )
+              q( 1, i ) = u( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( j, i ) = u( jc, i ) / temp
+              end do
+           end do
+           ! update the left singular vector matrix.
+           if( k==2 ) then
+              call stdlib_sgemm( 'N', 'N', n, k, k, one, u2, ldu2, q, ldq, zero, u,ldu )
+              go to 100
+           end if
+           if( ctot( 1 )>0 ) then
+              call stdlib_sgemm( 'N', 'N', nl, k, ctot( 1 ), one, u2( 1, 2 ), ldu2,q( 2, 1 ), ldq,&
+                         zero, u( 1, 1 ), ldu )
+              if( ctot( 3 )>0 ) then
+                 ktemp = 2 + ctot( 1 ) + ctot( 2 )
+                 call stdlib_sgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( &
+                           ktemp, 1 ), ldq, one, u( 1, 1 ), ldu )
+              end if
+           else if( ctot( 3 )>0 ) then
+              ktemp = 2 + ctot( 1 ) + ctot( 2 )
+              call stdlib_sgemm( 'N', 'N', nl, k, ctot( 3 ), one, u2( 1, ktemp ),ldu2, q( ktemp, &
+                        1 ), ldq, zero, u( 1, 1 ), ldu )
+           else
+              call stdlib_slacpy( 'F', nl, k, u2, ldu2, u, ldu )
+           end if
+           call stdlib_scopy( k, q( 1, 1 ), ldq, u( nlp1, 1 ), ldu )
+           ktemp = 2 + ctot( 1 )
+           ctemp = ctot( 2 ) + ctot( 3 )
+           call stdlib_sgemm( 'N', 'N', nr, k, ctemp, one, u2( nlp2, ktemp ), ldu2,q( ktemp, 1 ), &
+                     ldq, zero, u( nlp2, 1 ), ldu )
+           ! generate the right singular vectors.
+           100 continue
+           do i = 1, k
+              temp = stdlib_snrm2( k, vt( 1, i ), 1 )
+              q( i, 1 ) = vt( 1, i ) / temp
+              do j = 2, k
+                 jc = idxc( j )
+                 q( i, j ) = vt( jc, i ) / temp
+              end do
+           end do
+           ! update the right singular vector matrix.
+           if( k==2 ) then
+              call stdlib_sgemm( 'N', 'N', k, m, k, one, q, ldq, vt2, ldvt2, zero,vt, ldvt )
+                        
+              return
+           end if
+           ktemp = 1 + ctot( 1 )
+           call stdlib_sgemm( 'N', 'N', k, nlp1, ktemp, one, q( 1, 1 ), ldq,vt2( 1, 1 ), ldvt2, &
+                     zero, vt( 1, 1 ), ldvt )
+           ktemp = 2 + ctot( 1 ) + ctot( 2 )
+           if( ktemp<=ldvt2 )call stdlib_sgemm( 'N', 'N', k, nlp1, ctot( 3 ), one, q( 1, ktemp ),&
+                     ldq, vt2( ktemp, 1 ), ldvt2, one, vt( 1, 1 ),ldvt )
+           ktemp = ctot( 1 ) + 1
+           nrp1 = nr + sqre
+           if( ktemp>1 ) then
+              do i = 1, k
+                 q( i, ktemp ) = q( i, 1 )
+              end do
+              do i = nlp2, m
+                 vt2( ktemp, i ) = vt2( 1, i )
+              end do
+           end if
+           ctemp = 1 + ctot( 2 ) + ctot( 3 )
+           call stdlib_sgemm( 'N', 'N', k, nrp1, ctemp, one, q( 1, ktemp ), ldq,vt2( ktemp, nlp2 )&
+                     , ldvt2, zero, vt( 1, nlp2 ), ldvt )
+           return
+     end subroutine stdlib_slasd3
+
+     !> SLASD6: computes the SVD of an updated upper bidiagonal matrix B
+     !> obtained by merging two smaller ones by appending a row. This
+     !> routine is used only for the problem which requires all singular
+     !> values and optionally singular vector matrices in factored form.
+     !> B is an N-by-M matrix with N = NL + NR + 1 and M = N + SQRE.
+     !> A related subroutine, SLASD1, handles the case in which all singular
+     !> values and singular vectors of the bidiagonal matrix are desired.
+     !> SLASD6 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The singular values of B can be computed using D1, D2, the first
+     !> components of all the right singular vectors of the lower block, and
+     !> the last components of all the right singular vectors of the upper
+     !> block. These components are stored and updated in VF and VL,
+     !> respectively, in SLASD6. Hence U and VT are not explicitly
+     !> referenced.
+     !> The singular values are stored in D. The algorithm consists of two
+     !> stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or if there is a zero
+     !> in the Z vector. For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one. This stage is
+     !> performed by the routine SLASD7.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the roots of the
+     !> secular equation via the routine SLASD4 (as called by SLASD8).
+     !> This routine also updates VF and VL and computes the distances
+     !> between the updated singular values and the old singular
+     !> values.
+     !> SLASD6 is called from SLASDA.
+
+     pure subroutine stdlib_slasd6( icompq, nl, nr, sqre, d, vf, vl, alpha, beta,idxq, perm, &
+     givptr, givcol, ldgcol, givnum,ldgnum, poles, difl, difr, z, k, c, s, work,iwork, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: givptr, info, k
+           integer(ilp), intent(in) :: icompq, ldgcol, ldgnum, nl, nr, sqre
+           real(sp), intent(inout) :: alpha, beta
+           real(sp), intent(out) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), iwork(*), perm(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(sp), intent(inout) :: d(*), vf(*), vl(*)
+           real(sp), intent(out) :: difl(*), difr(*), givnum(ldgnum,*), poles(ldgnum,*), work(*), &
+                     z(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, idx, idxc, idxp, isigma, ivfw, ivlw, iw, m, n, n1, n2
+           real(sp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           m = n + sqre
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldgcol<n ) then
+              info = -14
+           else if( ldgnum<n ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD6', -info )
+              return
+           end if
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_slasd7 and stdlib_slasd8.
+           isigma = 1
+           iw = isigma + n
+           ivfw = iw + m
+           ivlw = ivfw + m
+           idx = 1
+           idxc = idx + n
+           idxp = idxc + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! sort and deflate singular values.
+           call stdlib_slasd7( icompq, nl, nr, sqre, k, d, z, work( iw ), vf,work( ivfw ), vl, &
+           work( ivlw ), alpha, beta,work( isigma ), iwork( idx ), iwork( idxp ), idxq,perm, &
+                     givptr, givcol, ldgcol, givnum, ldgnum, c, s,info )
+           ! solve secular equation, compute difl, difr, and update vf, vl.
+           call stdlib_slasd8( icompq, k, d, z, vf, vl, difl, difr, ldgnum,work( isigma ), work( &
+                     iw ), info )
+           ! report the possible convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! save the poles if icompq = 1.
+           if( icompq==1 ) then
+              call stdlib_scopy( k, d, 1, poles( 1, 1 ), 1 )
+              call stdlib_scopy( k, work( isigma ), 1, poles( 1, 2 ), 1 )
+           end if
+           ! unscale.
+           call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_slamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_slasd6
+
+     !> SOPGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> SSPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_sopgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           real(sp), intent(in) :: ap(*), tau(*)
+           real(sp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SOPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_ssptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = zero
+              end do
+              q( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_sorg2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_ssptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = one
+              do i = 2, n
+                 q( i, 1 ) = zero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = zero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_sorg2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_sopgtr
+
+     !> SOPMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by SSPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_sopmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           real(sp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SOPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_ssptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i)
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 call stdlib_slarf( side, mi, ni, ap( ii-i+1 ), 1, tau( i ), c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_ssptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = one
+                 if( left ) then
+                    ! h(i) is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i)
+                 call stdlib_slarf( side, mi, ni, ap( ii ), 1, tau( i ),c( ic, jc ), ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sopmtr
+
+     !> SORBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines SORBDB2, SORBDB3, and SORBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           real(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_slarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( x21(i,i), x11(i,i) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = one
+              x21(i,i) = one
+              call stdlib_slarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+              call stdlib_slarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+              if( i < q ) then
+                 call stdlib_srot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c, s )
+                 call stdlib_slarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = x21(i,i+1)
+                 x21(i,i+1) = one
+                 call stdlib_slarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_slarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 c = sqrt( stdlib_snrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_snrm2( m-p-i, x21(i+1,&
+                           i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_sorbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_sorbdb1
+
+     !> SORBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines SORBDB1, SORBDB3, and SORBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           real(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_srot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c, s )
+              end if
+              call stdlib_slarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = x11(i,i)
+              x11(i,i) = one
+              call stdlib_slarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_slarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              s = sqrt( stdlib_snrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_snrm2( m-p-i+1, x21(i,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_sorbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_sscal( p-i, negone, x11(i+1,i), 1 )
+              call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_slarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( x11(i+1,i), x21(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = one
+                 call stdlib_slarf( 'L', p-i, q-i, x11(i+1,i), 1, taup1(i),x11(i+1,i+1), ldx11, &
+                           work(ilarf) )
+              end if
+              x21(i,i) = one
+              call stdlib_slarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_slarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = one
+              call stdlib_slarf( 'L', m-p-i+1, q-i, x21(i,i), 1, taup2(i),x21(i,i+1), ldx21, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_sorbdb2
+
+     !> SORBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines SORBDB1, SORBDB2, and SORBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           real(sp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_srot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c, s )
+              end if
+              call stdlib_slarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = x21(i,i)
+              x21(i,i) = one
+              call stdlib_slarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_slarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              c = sqrt( stdlib_snrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_snrm2( m-p-i, x21(i+1,i), 1 &
+                        )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_sorbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_slarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_slarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( x21(i+1,i), x11(i,i) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = one
+                 call stdlib_slarf( 'L', m-p-i, q-i, x21(i+1,i), 1, taup2(i),x21(i+1,i+1), ldx21, &
+                           work(ilarf) )
+              end if
+              x11(i,i) = one
+              call stdlib_slarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_slarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = one
+              call stdlib_slarf( 'L', p-i+1, q-i, x11(i,i), 1, taup1(i), x11(i,i+1),ldx11, work(&
+                        ilarf) )
+           end do
+           return
+     end subroutine stdlib_sorbdb3
+
+     !> SORBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines SORBDB1, SORBDB2, and SORBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The orthogonal matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(sp), intent(out) :: phi(*), theta(*)
+           real(sp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = zero
+                 end do
+                 call stdlib_sorbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_sscal( p, negone, phantom(1), 1 )
+                 call stdlib_slarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_slarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( phantom(1), phantom(p+1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = one
+                 phantom(p+1) = one
+                 call stdlib_slarf( 'L', p, q, phantom(1), 1, taup1(1), x11, ldx11,work(ilarf) )
+                           
+                 call stdlib_slarf( 'L', m-p, q, phantom(p+1), 1, taup2(1), x21,ldx21, work(ilarf)&
+                            )
+              else
+                 call stdlib_sorbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_sscal( p-i+1, negone, x11(i,i-1), 1 )
+                 call stdlib_slarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_slarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( x11(i,i-1), x21(i,i-1) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = one
+                 x21(i,i-1) = one
+                 call stdlib_slarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1, taup1(i),x11(i,i), ldx11, &
+                           work(ilarf) )
+                 call stdlib_slarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1, taup2(i),x21(i,i), ldx21, &
+                           work(ilarf) )
+              end if
+              call stdlib_srot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_slarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = x21(i,i)
+              x21(i,i) = one
+              call stdlib_slarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_slarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_snrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_snrm2( m-p-i, x21(i+1,i),&
+                            1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_slarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = one
+              call stdlib_slarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_slarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_slarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = one
+              call stdlib_slarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_sorbdb4
+
+     !> SORCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The orthogonal matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_sorcsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           ! Array Arguments 
+           real(sp), intent(out) :: theta(*)
+           real(sp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           real(sp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(sp) :: dum1(1), dum2(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-------------------------------------------------------|
+           ! | lworkopt (1)                                          |
+           ! |-------------------------------------------------------|
+           ! | phi (max(1,r-1))                                      |
+           ! |-------------------------------------------------------|
+           ! | taup1 (max(1,p))                        | b11d (r)    |
+           ! | taup2 (max(1,m-p))                      | b11e (r-1)  |
+           ! | tauq1 (max(1,q))                        | b12d (r)    |
+           ! |-----------------------------------------| b12e (r-1)  |
+           ! | stdlib_sorbdb work | stdlib_sorgqr work | stdlib_sorglq work | b21d (r)    |
+           ! |             |             |             | b21e (r-1)  |
+           ! |             |             |             | b22d (r)    |
+           ! |             |             |             | b22e (r-1)  |
+           ! |             |             |             | stdlib_sbbcsd work |
+           ! |-------------------------------------------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = iphi + max( 1, r-1 )
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, work, -1,childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_sorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_sorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_sorglq( q-1, q-1, q-1, v1t, ldv1t,dum1, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_sbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum1, u1, &
+                 ldu1, u2, ldu2, v1t, ldv1t, dum2,1, dum1, dum1, dum1, dum1, dum1,dum1, dum1, &
+                           dum1, work(1), -1, childinfo)
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, work(1), -1,childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_sorgqr( p-1, p-1, p-1, u1(2,2), ldu1, dum1,work(1), -1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_sorgqr( m-p, m-p, q, u2, ldu2, dum1, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_sorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_sbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum1, v1t, &
+                 ldv1t, dum2, 1, u1, ldu1, u2,ldu2, dum1, dum1, dum1, dum1, dum1,dum1, dum1, dum1,&
+                            work(1), -1, childinfo)
+                 lbbcsd = int( work(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, work(1), -1,childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_sorgqr( p, p, q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_sorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2, dum1,work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_sorglq( q, q, r, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_sbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum1, &
+                 dum2, 1, v1t, ldv1t, u2, ldu2,u1, ldu1, dum1, dum1, dum1, dum1,dum1, dum1, dum1, &
+                           dum1, work(1), -1,childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              else
+                 call stdlib_sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,dum1, dum1, dum1, &
+                           dum1, dum1,work(1), -1, childinfo )
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_sorgqr( p, p, m-q, u1, ldu1, dum1, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_sorgqr( m-p, m-p, m-q, u2, ldu2, dum1, work(1),-1, childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_sorglq( q, q, q, v1t, ldv1t, dum1, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_sbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum1, u2, &
+                 ldu2, u1, ldu1, dum2, 1,v1t, ldv1t, dum1, dum1, dum1, dum1,dum1, dum1, dum1, &
+                           dum1, work(1), -1,childinfo )
+                 lbbcsd = int( work(1),KIND=ilp)
+              end if
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1,ibbcsd+lbbcsd-&
+                        1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1,ibbcsd+lbbcsd-&
+                        1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'SORCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_sorbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = one
+                 do j = 2, q
+                    v1t(1,j) = zero
+                    v1t(j,1) = zero
+                 end do
+                 call stdlib_slacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_sorglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_sbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,work(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,dum2, 1, work(ib11d), work(ib11e), work(ib12d),work(&
+              ib12e), work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,&
+                        childinfo )
+              ! permute rows and columns to place zero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_sorbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = one
+                 do j = 2, p
+                    u1(1,j) = zero
+                    u1(j,1) = zero
+                 end do
+                 call stdlib_slacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_sorgqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_slacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_sorgqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_slacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_sorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_sbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,work(iphi), v1t, &
+              ldv1t, dum1, 1, u1, ldu1, u2,ldu2, work(ib11d), work(ib11e), work(ib12d),work(ib12e)&
+              , work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,childinfo &
+                        )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_slapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_sorbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+                        , work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_slacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_sorgqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = one
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                    u2(j,1) = zero
+                 end do
+                 call stdlib_slacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_sorgqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_slacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_sorglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_sbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, work(iphi), &
+              dum1, 1, v1t, ldv1t, u2,ldu2, u1, ldu1, work(ib11d), work(ib11e),work(ib12d), work(&
+              ib12e), work(ib21d),work(ib21e), work(ib22d), work(ib22e),work(ibbcsd), lbbcsd, &
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_slapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_slapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_sorbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,work(iphi), work(itaup1)&
+              , work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, childinfo )
+                        
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_scopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_scopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = zero
+                 end do
+                 call stdlib_slacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_sorgqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = zero
+                 end do
+                 call stdlib_slacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_sorgqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_slacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_slacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_slacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_sorglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_sbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, work(iphi), &
+              u2, ldu2, u1, ldu1, dum1, 1,v1t, ldv1t, work(ib11d), work(ib11e), work(ib12d),work(&
+              ib12e), work(ib21d), work(ib21e),work(ib22d), work(ib22e), work(ibbcsd), lbbcsd,&
+                        childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_slapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_slapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_sorcsd2by1
+
+     !> SORGTR: generates a real orthogonal matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> SSYTRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_sorgtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if ( upper ) then
+                nb = stdlib_ilaenv( 1, 'SORGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                nb = stdlib_ilaenv( 1, 'SORGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_ssytrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = zero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = zero
+              end do
+              a( n, n ) = one
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_sorgql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_ssytrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors one
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = zero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = one
+              do i = 2, n
+                 a( i, 1 ) = zero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_sorgqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sorgtr
+
+     !> SORGTSQR: generates an M-by-N real matrix Q_out with orthonormal columns,
+     !> which are the first N columns of a product of real orthogonal
+     !> matrices of order M which are returned by SLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for SLATSQR.
+
+     pure subroutine stdlib_sorgtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: t(ldt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_slamtsqr. see the documentation for stdlib_slamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_slamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_slamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_slamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_slaset( 'F', m, n, zero, one, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_slamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_scopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = real( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_sorgtsqr
+
+     !> SORMTR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by SSYTRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_sormtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, ni, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'SORMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'SORMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_ssytrd with uplo = 'u'
+              call stdlib_sormql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_ssytrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_sormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormtr
+
+     !> SPBTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_spbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           real(sp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'SPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_spbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_spotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_strsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i2, one,&
+                                     ab( kd+1, i ),ldab-1, ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_ssyrk( 'UPPER', 'TRANSPOSE', i2, ib, -one,ab( kd+1-ib, i+ib &
+                                    ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_strsm( 'LEFT', 'UPPER', 'TRANSPOSE','NON-UNIT', ib, i3, one,&
+                                     ab( kd+1, i ),ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', i2, i3,ib, -&
+                          one, ab( kd+1-ib, i+ib ),ldab-1, work, ldwork, one,ab( 1+ib, i+kd ), &
+                                    ldab-1 )
+                          ! update a33
+                          call stdlib_ssyrk( 'UPPER', 'TRANSPOSE', i3, ib, -one,work, ldwork, one,&
+                                     ab( kd+1, i+kd ),ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a symmetric band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_spotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_strsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i2, ib, &
+                                    one, ab( 1, i ),ldab-1, ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_ssyrk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_strsm( 'RIGHT', 'LOWER', 'TRANSPOSE','NON-UNIT', i3, ib, &
+                                    one, ab( 1, i ),ldab-1, work, ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', i3, i2,ib, -&
+                          one, work, ldwork,ab( 1+ib, i ), ldab-1, one,ab( 1+kd-ib, i+ib ), ldab-&
+                                    1 )
+                          ! update a33
+                          call stdlib_ssyrk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_spbtrf
+
+     !> SPFTRI: computes the inverse of a real (symmetric) positive definite
+     !> matrix A using the Cholesky factorization A = U**T*U or A = L*L**T
+     !> computed by SPFTRF.
+
+     pure subroutine stdlib_spftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_stftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_slauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_ssyrk( 'L', 'T', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_strmm( 'L', 'U', 'N', 'N', n2, n1, one, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_slauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_slauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_ssyrk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_strmm( 'R', 'U', 'T', 'N', n1, n2, one, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_slauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_slauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_ssyrk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_strmm( 'R', 'L', 'N', 'N', n1, n2, one, a( 1 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_slauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_slauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_ssyrk( 'U', 'T', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_strmm( 'L', 'L', 'T', 'N', n2, n1, one, a( n1*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_slauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_slauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_ssyrk( 'L', 'T', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_strmm( 'L', 'U', 'N', 'N', k, k, one, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_slauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_slauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_ssyrk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_strmm( 'R', 'U', 'T', 'N', k, k, one, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_slauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_slauum( 'U', k, a( k ), k, info )
+                    call stdlib_ssyrk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_strmm( 'R', 'L', 'N', 'N', k, k, one, a( 0 ), k,a( k*( k+1 ) ), k &
+                              )
+                    call stdlib_slauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_slauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_ssyrk( 'U', 'T', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_strmm( 'L', 'L', 'T', 'N', k, k, one, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_slauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_spftri
+
+     !> SPOTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_spotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'SPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_spotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**t*u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_ssyrk( 'UPPER', 'TRANSPOSE', jb, j-1, -one,a( 1, j ), lda, one, a(&
+                               j, j ), lda )
+                    call stdlib_spotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_sgemm( 'TRANSPOSE', 'NO TRANSPOSE', jb, n-j-jb+1,j-1, -one, a( &
+                                 1, j ), lda, a( 1, j+jb ),lda, one, a( j, j+jb ), lda )
+                       call stdlib_strsm( 'LEFT', 'UPPER', 'TRANSPOSE', 'NON-UNIT',jb, n-j-jb+1, &
+                                 one, a( j, j ), lda,a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**t.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_ssyrk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_spotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,j-1, -one, a( &
+                                 j+jb, 1 ), lda, a( j, 1 ),lda, one, a( j+jb, j ), lda )
+                       call stdlib_strsm( 'RIGHT', 'LOWER', 'TRANSPOSE', 'NON-UNIT',n-j-jb+1, jb, &
+                                 one, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_spotrf
+
+     !> SPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_sptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(in) :: b(ldb,*), d(*), df(*), e(*), ef(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*)
+           real(sp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: count, i, ix, j, nz
+           real(sp) :: bi, cx, dx, eps, ex, lstres, s, safe1, safe2, safmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_slamch( 'EPSILON' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_90: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( n==1 ) then
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 work( n+1 ) = bi - dx
+                 work( 1 ) = abs( bi ) + abs( dx )
+              else
+                 bi = b( 1, j )
+                 dx = d( 1 )*x( 1, j )
+                 ex = e( 1 )*x( 2, j )
+                 work( n+1 ) = bi - dx - ex
+                 work( 1 ) = abs( bi ) + abs( dx ) + abs( ex )
+                 do i = 2, n - 1
+                    bi = b( i, j )
+                    cx = e( i-1 )*x( i-1, j )
+                    dx = d( i )*x( i, j )
+                    ex = e( i )*x( i+1, j )
+                    work( n+i ) = bi - cx - dx - ex
+                    work( i ) = abs( bi ) + abs( cx ) + abs( dx ) + abs( ex )
+                 end do
+                 bi = b( n, j )
+                 cx = e( n-1 )*x( n-1, j )
+                 dx = d( n )*x( n, j )
+                 work( n+n ) = bi - cx - dx
+                 work( n ) = abs( bi ) + abs( cx ) + abs( dx )
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    s = max( s, abs( work( n+i ) ) / work( i ) )
+                 else
+                    s = max( s, ( abs( work( n+i ) )+safe1 ) /( work( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_spttrs( n, 1, df, ef, work( n+1 ), n, info )
+                 call stdlib_saxpy( n, one, work( n+1 ), 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( work( i )>safe2 ) then
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i )
+                 else
+                    work( i ) = abs( work( n+i ) ) + nz*eps*work( i ) + safe1
+                 end if
+              end do
+              ix = stdlib_isamax( n, work, 1 )
+              ferr( j ) = work( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**t.
+              ! solve m(l) * x = e.
+              work( 1 ) = one
+              do i = 2, n
+                 work( i ) = one + work( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**t * x = b.
+              work( n ) = work( n ) / df( n )
+              do i = n - 1, 1, -1
+                 work( i ) = work( i ) / df( i ) + work( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_isamax( n, work, 1 )
+              ferr( j ) = ferr( j )*abs( work( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_90
+           return
+     end subroutine stdlib_sptrfs
+
+     !> SPTSV: computes the solution to a real system of linear equations
+     !> A*X = B, where A is an N-by-N symmetric positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**T, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_sptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: b(ldb,*), d(*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+           call stdlib_spttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_spttrs( n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_sptsv
+
+     !> SPTSVX: uses the factorization A = L*D*L**T to compute the solution
+     !> to a real system of linear equations A*X = B, where A is an N-by-N
+     !> symmetric positive definite tridiagonal matrix and X and B are
+     !> N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_sptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           real(sp), intent(in) :: b(ldb,*), d(*), e(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+           real(sp), intent(inout) :: df(*), ef(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**t (or u**t*d*u) factorization of a.
+              call stdlib_scopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_scopy( n-1, e, 1, ef, 1 )
+              call stdlib_spttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slanst( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sptcon( n, df, ef, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_spttrs( n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_sptrfs( n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr, berr,work, info )
+                     
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_sptsvx
+
+     !> SSBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A.
+
+     subroutine stdlib_ssbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indwrk, iscale
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = ab( 1, 1 )
+              else
+                 w( 1 ) = ab( kd+1, 1 )
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_slascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_slascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_ssbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_ssbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_ssteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_ssbev
+
+     !> SSBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric band matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_ssbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indwrk, &
+                     iscale, itmp1, j, jj, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 tmp1 = ab( 1, 1 )
+              else
+                 tmp1 = ab( kd+1, 1 )
+              end if
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = tmp1
+                 if( wantz )z( 1, 1 ) = one
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if ( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           endif
+           anrm = stdlib_slansb( 'M', uplo, n, kd, ab, ldab, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_slascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_slascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_ssbtrd to reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           call stdlib_ssbtrd( jobz, uplo, n, kd, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_ssteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_scopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssterf( n, w, work( indee ), info )
+              else
+                 call stdlib_slacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_sstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstein.
+              do j = 1, m
+                 call stdlib_scopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_sgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_ssbevx
+
+     !> SSBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_ssbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_spbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_ssbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work( indwrk ), &
+                     iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_ssbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_ssteqr( jobz, n, w, work( inde ), z, ldz, work( indwrk ),info )
+           end if
+           return
+     end subroutine stdlib_ssbgv
+
+     !> SSBGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_ssbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(sp), intent(out) :: q(ldq,*), w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwo, indwrk, itmp1, j, &
+                     jj, nsplit
+           real(sp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_spbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_ssbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, iinfo )
+                     
+           ! reduce symmetric band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indwrk = inde + n
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_ssbtrd( vect, uplo, n, ka, ab, ldab, work( indd ),work( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_ssteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_scopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_ssterf( n, w, work( indee ), info )
+              else
+                 call stdlib_slacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_ssteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired,
+           ! call stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_sstebz( range, order, n, vl, vu, il, iu, abstol,work( indd ), work( inde ),&
+            m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info )
+                      
+           if( wantz ) then
+              call stdlib_sstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply transformation matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstein.
+              do j = 1, m
+                 call stdlib_scopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_sgemv( 'N', n, n, one, q, ldq, work, 1, zero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_ssbgvx
+
+     !> SSPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A in packed storage.
+
+     subroutine stdlib_sspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_ssptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_ssptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_sopgtr to generate the orthogonal matrix, then call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_sopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              call stdlib_ssteqr( jobz, n, w, work( inde ), z, ldz, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_sspev
+
+     !> SSPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage.  Eigenvalues/vectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_sspevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, iwork, ifail,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = ap( 1 )
+              else
+                 if( vl<ap( 1 ) .and. vu>=ap( 1 ) ) then
+                    m = 1
+                    w( 1 ) = ap( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if ( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           endif
+           anrm = stdlib_slansp( 'M', uplo, n, ap, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_ssptrd to reduce symmetric packed matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           call stdlib_ssptrd( uplo, n, ap, work( indd ), work( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_ssterf or stdlib_sopgtr and stdlib_ssteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_scopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssterf( n, w, work( indee ), info )
+              else
+                 call stdlib_sopgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_sstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstein.
+              call stdlib_sopmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_sspevx
+
+     !> SSPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_sspgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: ap(*), bp(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_spptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_sspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_sspev( jobz, uplo, n, ap, w, z, ldz, work, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_stpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_stpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sspgv
+
+     !> SSPGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric, stored in packed storage, and B
+     !> is also positive definite.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_sspgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: ap(*), bp(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_spptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_sspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_sspevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, m
+                    call stdlib_stpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_stpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_sspgvx
+
+     !> SSYEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A.
+
+     subroutine stdlib_ssyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 3*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              work( 1 ) = 2
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_slascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_ssytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_ssytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_sorgtr to generate the orthogonal matrix, then call stdlib_ssteqr.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_sorgtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              call stdlib_ssteqr( jobz, n, w, work( inde ), a, lda, work( indtau ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssyev
+
+     !> SSYEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of indices
+     !> for the desired eigenvalues.
+
+     subroutine stdlib_ssyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwo, indtau, &
+           indwkn, indwrk, iscale, itmp1, j, jj, llwork, llwrkn, lwkmin, lwkopt, nb, &
+                     nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 8*n
+                 nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'SORMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( lwkmin, ( nb + 3 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_slansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_sscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_sscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_ssytrd to reduce symmetric matrix to tridiagonal form.
+           indtau = 1
+           inde = indtau + n
+           indd = inde + n
+           indwrk = indd + n
+           llwork = lwork - indwrk + 1
+           call stdlib_ssytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_ssterf or stdlib_sorgtr and stdlib_ssteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_scopy( n, work( indd ), 1, w, 1 )
+              indee = indwrk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssterf( n, w, work( indee ), info )
+              else
+                 call stdlib_slacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_sorgtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssteqr( jobz, n, w, work( indee ), z, ldz,work( indwrk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwo = indisp + n
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwrk ),iwork( indiwo ), info &
+                     )
+           if( wantz ) then
+              call stdlib_sstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwrk ), iwork( indiwo ), ifail, info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_sormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssyevx
+
+     !> SSYGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be symmetric and B is also
+     !> positive definite.
+
+     subroutine stdlib_ssygv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 3*n - 1 )
+              nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 2 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_spotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_ssygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_ssyev( jobz, uplo, n, a, lda, w, work, lwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_strsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_strmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssygv
+
+     !> SSYGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A
+     !> and B are assumed to be symmetric and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_ssygvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkmin, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           upper = stdlib_lsame( uplo, 'U' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              lwkmin = max( 1, 8*n )
+              nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( lwkmin, ( nb + 3 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_spotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_ssygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_ssyevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_strsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_strmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, one, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssygvx
+
+     !> SSYSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_ssysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_ssytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = work(1)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_ssytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_ssytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_ssytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssysv
+
+     !> SSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a real system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_ssysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(inout) :: af(ldaf,*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(sp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 3*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'SSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_slacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_ssytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slansy( 'I', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_ssycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, iwork,info )
+           ! compute the solution vectors x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_ssytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_ssyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssysvx
+
+     !> SSYTRD_SY2SB: reduces a real symmetric matrix A to real symmetric
+     !> band-diagonal form AB by a orthogonal similarity transformation:
+     !> Q**T * A * Q = AB.
+
+     pure subroutine stdlib_ssytrd_sy2sb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: rone = 1.0e+0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'SSYTRD_SY2SB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRD_SY2SB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_scopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_scopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to zero once such a
+           ! way every time t is generated the upper/lower portion will be always zero
+           call stdlib_slaset( "A", ldt, kd, zero, zero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_sgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_scopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_slaset( 'LOWER', pk, pk, zero, one,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_slarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_sgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,one,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,zero, work( s2pos ), lds2 )
+                  call stdlib_ssymm( 'RIGHT', uplo, pk, pn,one,  a( i+kd, i+kd ), lda,work( s2pos &
+                            ), lds2,zero, work( wpos ), ldw )
+                  call stdlib_sgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,one,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,zero, work( s1pos ), lds1 )
+                  call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-half, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,one,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_ssyr2k( uplo, 'CONJUGATE', pn, pk,-one, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_scopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_sgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_scopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_slaset( 'UPPER', pk, pk, zero, one,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_slarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,one, a( i+kd, i ),&
+                             lda,work( tpos ), ldt,zero, work( s2pos ), lds2 )
+                  call stdlib_ssymm( 'LEFT', uplo, pn, pk,one, a( i+kd, i+kd ), lda,work( s2pos ),&
+                             lds2,zero, work( wpos ), ldw )
+                  call stdlib_sgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,one, work( s2pos ), &
+                            lds2,work( wpos ), ldw,zero, work( s1pos ), lds1 )
+                  call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-half, a( i+kd, i &
+                            ), lda,work( s1pos ), lds1,one, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_ssyr2k( uplo, 'NO TRANSPOSE', pn, pk,-one, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_scopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_scopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ssytrd_sy2sb
+
+     !> STGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of real matrices (S,P), where S is a quasi-triangular matrix
+     !> and P is upper triangular.  Matrix pairs of this type are produced by
+     !> the generalized Schur factorization of a matrix pair (A,B):
+     !> A = Q*S*Z**T,  B = Q*P*Z**T
+     !> as computed by SGGHRD + SHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal blocks of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the orthogonal factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_stgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(sp), intent(in) :: p(ldp,*), s(lds,*)
+           real(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: safety = 1.0e+2_sp
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, il2by2, ilabad, ilall, ilback, ilbbad, ilcomp, ilcplx, &
+                     lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, iinfo, im, iside, j, ja, jc, je, jr, jw, &
+                     na, nw
+           real(sp) :: acoef, acoefa, anorm, ascale, bcoefa, bcoefi, bcoefr, big, bignum, bnorm, &
+           bscale, cim2a, cim2b, cimaga, cimagb, cre2a, cre2b, creala, crealb, dmin, safmin, &
+                     salfar, sbeta, scale, small, temp, temp2, temp2i, temp2r, ulp, xmax, xscale
+           ! Local Arrays 
+           real(sp) :: bdiag(2), sum(2,2), sums(2,2), sump(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+              ilall = .true.
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors to be computed
+           if( .not.ilall ) then
+              im = 0
+              ilcplx = .false.
+              loop_10: do j = 1, n
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_10
+                 end if
+                 if( j<n ) then
+                    if( s( j+1, j )/=zero )ilcplx = .true.
+                 end if
+                 if( ilcplx ) then
+                    if( select( j ) .or. select( j+1 ) )im = im + 2
+                 else
+                    if( select( j ) )im = im + 1
+                 end if
+              end do loop_10
+           else
+              im = n
+           end if
+           ! check 2-by-2 diagonal blocks of a, b
+           ilabad = .false.
+           ilbbad = .false.
+           do j = 1, n - 1
+              if( s( j+1, j )/=zero ) then
+                 if( p( j, j )==zero .or. p( j+1, j+1 )==zero .or.p( j, j+1 )/=zero )ilbbad = &
+                           .true.
+                 if( j<n-1 ) then
+                    if( s( j+2, j+1 )/=zero )ilabad = .true.
+                 end if
+              end if
+           end do
+           if( ilabad ) then
+              info = -5
+           else if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_slabad( safmin, big )
+           ulp = stdlib_slamch( 'EPSILON' )*stdlib_slamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part (i.e., excluding all elements belonging to the diagonal
+           ! blocks) of a and b to check for possible overflow in the
+           ! triangular solver.
+           anorm = abs( s( 1, 1 ) )
+           if( n>1 )anorm = anorm + abs( s( 2, 1 ) )
+           bnorm = abs( p( 1, 1 ) )
+           work( 1 ) = zero
+           work( n+1 ) = zero
+           do j = 2, n
+              temp = zero
+              temp2 = zero
+              if( s( j, j-1 )==zero ) then
+                 iend = j - 1
+              else
+                 iend = j - 2
+              end if
+              do i = 1, iend
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              work( j ) = temp
+              work( n+j ) = temp2
+              do i = iend + 1, min( j+1, n )
+                 temp = temp + abs( s( i, j ) )
+                 temp2 = temp2 + abs( p( i, j ) )
+              end do
+              anorm = max( anorm, temp )
+              bnorm = max( bnorm, temp2 )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_220: do je = 1, n
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at.
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_220
+                 end if
+                 nw = 1
+                 if( je<n ) then
+                    if( s( je+1, je )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je+1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_220
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       ieig = ieig + 1
+                       do jr = 1, n
+                          vl( jr, ieig ) = zero
+                       end do
+                       vl( ieig, ieig ) = one
+                       cycle loop_220
+                    end if
+                 end if
+                 ! clear vector
+                 do jr = 1, nw*n
+                    work( 2*n+jr ) = zero
+                 end do
+                                                       ! t
+                 ! compute coefficients in  ( a a - b b )  y = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                 else
+                    ! complex eigenvalue
+                    call stdlib_slag2( s( je, je ), lds, p( je, je ), ldp,safmin*safety, acoef, &
+                              temp, bcoefr, temp2,bcoefi )
+                    bcoefi = -bcoefi
+                    if( bcoefi==zero ) then
+                       info = je
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    temp = acoef*s( je+1, je )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je+1 ) = -temp2r / temp
+                       work( 3*n+je+1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je+1 ) = one
+                       work( 3*n+je+1 ) = zero
+                       temp = acoef*s( je, je+1 )
+                       work( 2*n+je ) = ( bcoefr*p( je+1, je+1 )-acoef*s( je+1, je+1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je+1, je+1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je+1 ) &
+                              )+abs( work( 3*n+je+1 ) ) )
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                 ! t
+                 ! triangular solve of  (a a - b b)  y = 0
+                                         ! t
+                 ! (rowwise in  (a a - b b) , or columnwise in (a a - b b) )
+                 il2by2 = .false.
+                 loop_160: do j = je + nw, n
+                    if( il2by2 ) then
+                       il2by2 = .false.
+                       cycle loop_160
+                    end if
+                    na = 1
+                    bdiag( 1 ) = p( j, j )
+                    if( j<n ) then
+                       if( s( j+1, j )/=zero ) then
+                          il2by2 = .true.
+                          bdiag( 2 ) = p( j+1, j+1 )
+                          na = 2
+                       end if
+                    end if
+                    ! check whether scaling is necessary for dot products
+                    xscale = one / max( one, xmax )
+                    temp = max( work( j ), work( n+j ),acoefa*work( j )+bcoefa*work( n+j ) )
+                              
+                    if( il2by2 )temp = max( temp, work( j+1 ), work( n+j+1 ),acoefa*work( j+1 )+&
+                              bcoefa*work( n+j+1 ) )
+                    if( temp>bignum*xscale ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = xmax*xscale
+                    end if
+                    ! compute dot products
+                          ! j-1
+                    ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                          ! k=je
+                    ! to reduce the op count, this is done as
+                    ! _        j-1                  _        j-1
+                    ! a*conjg( sum  s(k,j)*x(k) ) - b*conjg( sum  p(k,j)*x(k) )
+                             ! k=je                          k=je
+                    ! which may cause underflow problems if a or b are close
+                    ! to underflow.  (e.g., less than small.)
+                    do jw = 1, nw
+                       do ja = 1, na
+                          sums( ja, jw ) = zero
+                          sump( ja, jw ) = zero
+                          do jr = je, j - 1
+                             sums( ja, jw ) = sums( ja, jw ) +s( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                             sump( ja, jw ) = sump( ja, jw ) +p( jr, j+ja-1 )*work( ( jw+1 )*n+jr &
+                                       )
+                          end do
+                       end do
+                    end do
+                    do ja = 1, na
+                       if( ilcplx ) then
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 ) -bcoefi*sump( &
+                                    ja, 2 )
+                          sum( ja, 2 ) = -acoef*sums( ja, 2 ) +bcoefr*sump( ja, 2 ) +bcoefi*sump( &
+                                    ja, 1 )
+                       else
+                          sum( ja, 1 ) = -acoef*sums( ja, 1 ) +bcoefr*sump( ja, 1 )
+                       end if
+                    end do
+                                        ! t
+                    ! solve  ( a a - b b )  y = sum(,)
+                    ! with scaling and perturbation of the denominator
+                    call stdlib_slaln2( .true., na, nw, dmin, acoef, s( j, j ), lds,bdiag( 1 ), &
+                    bdiag( 2 ), sum, 2, bcoefr,bcoefi, work( 2*n+j ), n, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = je, j - 1
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                       xmax = scale*xmax
+                    end if
+                    xmax = max( xmax, temp )
+                 end do loop_160
+                 ! copy eigenvector to vl, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig + 1
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       call stdlib_sgemv( 'N', n, n+1-je, one, vl( 1, je ), ldvl,work( ( jw+2 )*n+&
+                                 je ), 1, zero,work( ( jw+4 )*n+1 ), 1 )
+                    end do
+                    call stdlib_slacpy( ' ', n, nw, work( 4*n+1 ), n, vl( 1, je ),ldvl )
+                    ibeg = 1
+                 else
+                    call stdlib_slacpy( ' ', n, nw, work( 2*n+1 ), n, vl( 1, ieig ),ldvl )
+                    ibeg = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) )+abs( vl( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = ibeg, n
+                       xmax = max( xmax, abs( vl( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = ibeg, n
+                          vl( jr, ieig+jw ) = xscale*vl( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+                 ieig = ieig + nw - 1
+              end do loop_220
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              ilcplx = .false.
+              loop_500: do je = n, 1, -1
+                 ! skip this iteration if (a) howmny='s' and select=.false., or
+                 ! (b) this would be the second of a complex pair.
+                 ! check for complex eigenvalue, so as to be sure of which
+                 ! entry(-ies) of select to look at -- if complex, select(je)
+                 ! or select(je-1).
+                 ! if this is a complex pair, the 2-by-2 diagonal block
+                 ! corresponding to the eigenvalue is in rows/columns je-1:je
+                 if( ilcplx ) then
+                    ilcplx = .false.
+                    cycle loop_500
+                 end if
+                 nw = 1
+                 if( je>1 ) then
+                    if( s( je, je-1 )/=zero ) then
+                       ilcplx = .true.
+                       nw = 2
+                    end if
+                 end if
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else if( ilcplx ) then
+                    ilcomp = select( je ) .or. select( je-1 )
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( .not.ilcomp )cycle loop_500
+                 ! decide if (a) singular pencil, (b) real eigenvalue, or
+                 ! (c) complex eigenvalue.
+                 if( .not.ilcplx ) then
+                    if( abs( s( je, je ) )<=safmin .and.abs( p( je, je ) )<=safmin ) then
+                       ! singular matrix pencil -- unit eigenvector
+                       ieig = ieig - 1
+                       do jr = 1, n
+                          vr( jr, ieig ) = zero
+                       end do
+                       vr( ieig, ieig ) = one
+                       cycle loop_500
+                    end if
+                 end if
+                 ! clear vector
+                 do jw = 0, nw - 1
+                    do jr = 1, n
+                       work( ( jw+2 )*n+jr ) = zero
+                    end do
+                 end do
+                 ! compute coefficients in  ( a a - b b ) x = 0
+                    ! a  is  acoef
+                    ! b  is  bcoefr + i*bcoefi
+                 if( .not.ilcplx ) then
+                    ! real eigenvalue
+                    temp = one / max( abs( s( je, je ) )*ascale,abs( p( je, je ) )*bscale, safmin &
+                              )
+                    salfar = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*p( je, je ) )*bscale
+                    acoef = sbeta*ascale
+                    bcoefr = salfar*bscale
+                    bcoefi = zero
+                    ! scale to avoid underflow
+                    scale = one
+                    lsa = abs( sbeta )>=safmin .and. abs( acoef )<small
+                    lsb = abs( salfar )>=safmin .and. abs( bcoefr )<small
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs( salfar ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoef ),abs( bcoefr ) ) ) &
+                                 )
+                       if( lsa ) then
+                          acoef = ascale*( scale*sbeta )
+                       else
+                          acoef = scale*acoef
+                       end if
+                       if( lsb ) then
+                          bcoefr = bscale*( scale*salfar )
+                       else
+                          bcoefr = scale*bcoefr
+                       end if
+                    end if
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr )
+                    ! first component is 1
+                    work( 2*n+je ) = one
+                    xmax = one
+                    ! compute contribution from column je of a and b to sum
+                    ! (see "further details", above.)
+                    do jr = 1, je - 1
+                       work( 2*n+jr ) = bcoefr*p( jr, je ) -acoef*s( jr, je )
+                    end do
+                 else
+                    ! complex eigenvalue
+                    call stdlib_slag2( s( je-1, je-1 ), lds, p( je-1, je-1 ), ldp,safmin*safety, &
+                              acoef, temp, bcoefr, temp2,bcoefi )
+                    if( bcoefi==zero ) then
+                       info = je - 1
+                       return
+                    end if
+                    ! scale to avoid over/underflow
+                    acoefa = abs( acoef )
+                    bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    scale = one
+                    if( acoefa*ulp<safmin .and. acoefa>=safmin )scale = ( safmin / ulp ) / &
+                              acoefa
+                    if( bcoefa*ulp<safmin .and. bcoefa>=safmin )scale = max( scale, ( safmin / &
+                              ulp ) / bcoefa )
+                    if( safmin*acoefa>ascale )scale = ascale / ( safmin*acoefa )
+                    if( safmin*bcoefa>bscale )scale = min( scale, bscale / ( safmin*bcoefa ) )
+                              
+                    if( scale/=one ) then
+                       acoef = scale*acoef
+                       acoefa = abs( acoef )
+                       bcoefr = scale*bcoefr
+                       bcoefi = scale*bcoefi
+                       bcoefa = abs( bcoefr ) + abs( bcoefi )
+                    end if
+                    ! compute first two components of eigenvector
+                    ! and contribution to sums
+                    temp = acoef*s( je, je-1 )
+                    temp2r = acoef*s( je, je ) - bcoefr*p( je, je )
+                    temp2i = -bcoefi*p( je, je )
+                    if( abs( temp )>=abs( temp2r )+abs( temp2i ) ) then
+                       work( 2*n+je ) = one
+                       work( 3*n+je ) = zero
+                       work( 2*n+je-1 ) = -temp2r / temp
+                       work( 3*n+je-1 ) = -temp2i / temp
+                    else
+                       work( 2*n+je-1 ) = one
+                       work( 3*n+je-1 ) = zero
+                       temp = acoef*s( je-1, je )
+                       work( 2*n+je ) = ( bcoefr*p( je-1, je-1 )-acoef*s( je-1, je-1 ) ) / &
+                                 temp
+                       work( 3*n+je ) = bcoefi*p( je-1, je-1 ) / temp
+                    end if
+                    xmax = max( abs( work( 2*n+je ) )+abs( work( 3*n+je ) ),abs( work( 2*n+je-1 ) &
+                              )+abs( work( 3*n+je-1 ) ) )
+                    ! compute contribution from columns je and je-1
+                    ! of a and b to the sums.
+                    creala = acoef*work( 2*n+je-1 )
+                    cimaga = acoef*work( 3*n+je-1 )
+                    crealb = bcoefr*work( 2*n+je-1 ) -bcoefi*work( 3*n+je-1 )
+                    cimagb = bcoefi*work( 2*n+je-1 ) +bcoefr*work( 3*n+je-1 )
+                    cre2a = acoef*work( 2*n+je )
+                    cim2a = acoef*work( 3*n+je )
+                    cre2b = bcoefr*work( 2*n+je ) - bcoefi*work( 3*n+je )
+                    cim2b = bcoefi*work( 2*n+je ) + bcoefr*work( 3*n+je )
+                    do jr = 1, je - 2
+                       work( 2*n+jr ) = -creala*s( jr, je-1 ) +crealb*p( jr, je-1 ) -cre2a*s( jr, &
+                                 je ) + cre2b*p( jr, je )
+                       work( 3*n+jr ) = -cimaga*s( jr, je-1 ) +cimagb*p( jr, je-1 ) -cim2a*s( jr, &
+                                 je ) + cim2b*p( jr, je )
+                    end do
+                 end if
+                 dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                 ! columnwise triangular solve of  (a a - b b)  x = 0
+                 il2by2 = .false.
+                 loop_370: do j = je - nw, 1, -1
+                    ! if a 2-by-2 block, is in position j-1:j, wait until
+                    ! next iteration to process it (when it will be j:j+1)
+                    if( .not.il2by2 .and. j>1 ) then
+                       if( s( j, j-1 )/=zero ) then
+                          il2by2 = .true.
+                          cycle loop_370
+                       end if
+                    end if
+                    bdiag( 1 ) = p( j, j )
+                    if( il2by2 ) then
+                       na = 2
+                       bdiag( 2 ) = p( j+1, j+1 )
+                    else
+                       na = 1
+                    end if
+                    ! compute x(j) (and x(j+1), if 2-by-2 block)
+                    call stdlib_slaln2( .false., na, nw, dmin, acoef, s( j, j ),lds, bdiag( 1 ), &
+                    bdiag( 2 ), work( 2*n+j ),n, bcoefr, bcoefi, sum, 2, scale, temp,iinfo )
+                              
+                    if( scale<one ) then
+                       do jw = 0, nw - 1
+                          do jr = 1, je
+                             work( ( jw+2 )*n+jr ) = scale*work( ( jw+2 )*n+jr )
+                          end do
+                       end do
+                    end if
+                    xmax = max( scale*xmax, temp )
+                    do jw = 1, nw
+                       do ja = 1, na
+                          work( ( jw+1 )*n+j+ja-1 ) = sum( ja, jw )
+                       end do
+                    end do
+                    ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                    if( j>1 ) then
+                       ! check whether scaling is necessary for sum.
+                       xscale = one / max( one, xmax )
+                       temp = acoefa*work( j ) + bcoefa*work( n+j )
+                       if( il2by2 )temp = max( temp, acoefa*work( j+1 )+bcoefa*work( n+j+1 ) )
+                                 
+                       temp = max( temp, acoefa, bcoefa )
+                       if( temp>bignum*xscale ) then
+                          do jw = 0, nw - 1
+                             do jr = 1, je
+                                work( ( jw+2 )*n+jr ) = xscale*work( ( jw+2 )*n+jr )
+                             end do
+                          end do
+                          xmax = xmax*xscale
+                       end if
+                       ! compute the contributions of the off-diagonals of
+                       ! column j (and j+1, if 2-by-2 block) of a and b to the
+                       ! sums.
+                       do ja = 1, na
+                          if( ilcplx ) then
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             cimaga = acoef*work( 3*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 ) -bcoefi*work( 3*n+j+ja-1 )
+                             cimagb = bcoefi*work( 2*n+j+ja-1 ) +bcoefr*work( 3*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                                work( 3*n+jr ) = work( 3*n+jr ) -cimaga*s( jr, j+ja-1 ) +cimagb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          else
+                             creala = acoef*work( 2*n+j+ja-1 )
+                             crealb = bcoefr*work( 2*n+j+ja-1 )
+                             do jr = 1, j - 1
+                                work( 2*n+jr ) = work( 2*n+jr ) -creala*s( jr, j+ja-1 ) +crealb*p(&
+                                           jr, j+ja-1 )
+                             end do
+                          end if
+                       end do
+                    end if
+                    il2by2 = .false.
+                 end do loop_370
+                 ! copy eigenvector to vr, back transforming if
+                 ! howmny='b'.
+                 ieig = ieig - nw
+                 if( ilback ) then
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          work( ( jw+4 )*n+jr ) = work( ( jw+2 )*n+1 )*vr( jr, 1 )
+                       end do
+                       ! a series of compiler directives to defeat
+                       ! vectorization for the next loop
+                       do jc = 2, je
+                          do jr = 1, n
+                             work( ( jw+4 )*n+jr ) = work( ( jw+4 )*n+jr ) +work( ( jw+2 )*n+jc )&
+                                       *vr( jr, jc )
+                          end do
+                       end do
+                    end do
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+4 )*n+jr )
+                       end do
+                    end do
+                    iend = n
+                 else
+                    do jw = 0, nw - 1
+                       do jr = 1, n
+                          vr( jr, ieig+jw ) = work( ( jw+2 )*n+jr )
+                       end do
+                    end do
+                    iend = je
+                 end if
+                 ! scale eigenvector
+                 xmax = zero
+                 if( ilcplx ) then
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) )+abs( vr( j, ieig+1 ) ) )
+                    end do
+                 else
+                    do j = 1, iend
+                       xmax = max( xmax, abs( vr( j, ieig ) ) )
+                    end do
+                 end if
+                 if( xmax>safmin ) then
+                    xscale = one / xmax
+                    do jw = 0, nw - 1
+                       do jr = 1, iend
+                          vr( jr, ieig+jw ) = xscale*vr( jr, ieig+jw )
+                       end do
+                    end do
+                 end if
+              end do loop_500
+           end if
+           return
+     end subroutine stdlib_stgevc
+
+     !> STGEX2: swaps adjacent diagonal blocks (A11, B11) and (A22, B22)
+     !> of size 1-by-1 or 2-by-2 in an upper (quasi) triangular matrix pair
+     !> (A, B) by an orthogonal equivalence transformation.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by SGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, n1, n2, &
+               work, lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, lwork, n, n1, n2
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_scopy by calls to stdlib_slaset, or by do
+        ! loops. sven hammarling, 1/5/02.
+           ! Parameters 
+           real(sp), parameter :: twenty = 2.0e+01_sp
+           integer(ilp), parameter :: ldst = 4
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, idum, linfo, m
+           real(sp) :: bqra21, brqa21, ddum, dnorma, dnormb, dscale, dsum, eps, f, g, sa, sb, &
+                     scale, smlnum, thresha, threshb
+           ! Local Arrays 
+           integer(ilp) :: iwork(ldst)
+           real(sp) :: ai(2), ar(2), be(2), ir(ldst,ldst), ircop(ldst,ldst), li(ldst,ldst), licop(&
+           ldst,ldst), s(ldst,ldst), scpy(ldst,ldst), t(ldst,ldst), taul(ldst), taur(ldst), tcpy(&
+                     ldst,ldst)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 .or. n1<=0 .or. n2<=0 )return
+           if( n1>n .or. ( j1+n1 )>n )return
+           m = n1 + n2
+           if( lwork<max( n*m, m*m*2 ) ) then
+              info = -16
+              work( 1 ) = max( n*m, m*m*2 )
+              return
+           end if
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block
+           call stdlib_slaset( 'FULL', ldst, ldst, zero, zero, li, ldst )
+           call stdlib_slaset( 'FULL', ldst, ldst, zero, zero, ir, ldst )
+           call stdlib_slacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_slacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute threshold for testing acceptance of swapping.
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           dscale = zero
+           dsum = one
+           call stdlib_slacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_slassq( m*m, work, 1, dscale, dsum )
+           dnorma = dscale*sqrt( dsum )
+           dscale = zero
+           dsum = one
+           call stdlib_slacpy( 'FULL', m, m, t, ldst, work, m )
+           call stdlib_slassq( m*m, work, 1, dscale, dsum )
+           dnormb = dscale*sqrt( dsum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*dnorma, smlnum )
+           threshb = max( twenty*eps*dnormb, smlnum )
+           if( m==2 ) then
+              ! case 1: swap 1-by-1 and 1-by-1 blocks.
+              ! compute orthogonal ql and rq that swap 1-by-1 and 1-by-1 blocks
+              ! using givens rotations and perform the swap tentatively.
+              f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+              g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+              sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+              sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+              call stdlib_slartg( f, g, ir( 1, 2 ), ir( 1, 1 ), ddum )
+              ir( 2, 1 ) = -ir( 1, 2 )
+              ir( 2, 2 ) = ir( 1, 1 )
+              call stdlib_srot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              call stdlib_srot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+              if( sa>=sb ) then
+                 call stdlib_slartg( s( 1, 1 ), s( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              else
+                 call stdlib_slartg( t( 1, 1 ), t( 2, 1 ), li( 1, 1 ), li( 2, 1 ),ddum )
+              end if
+              call stdlib_srot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              call stdlib_srot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, li( 1, 1 ),li( 2, 1 ) )
+                        
+              li( 2, 2 ) = li( 1, 1 )
+              li( 1, 2 ) = -li( 2, 1 )
+              ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                 ! and  |t21| <= o(eps f-norm((b)))
+              weak = abs( s( 2, 1 ) ) <= thresha .and.abs( t( 2, 1 ) ) <= threshb
+              if( .not.weak )go to 70
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_slacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_sgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_sgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_slassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_slacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_sgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_sgemm( 'N', 'T', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_slassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                     ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              call stdlib_srot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_srot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 ) )
+                        
+              call stdlib_srot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              call stdlib_srot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb,li( 1, 1 ), li( 2, 1 &
+                        ) )
+              ! set  n1-by-n2 (2,1) - blocks to zero.
+              a( j1+1, j1 ) = zero
+              b( j1+1, j1 ) = zero
+              ! accumulate transformations into q and z if requested.
+              if( wantz )call stdlib_srot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, ir( 1, 1 ),ir( 2, 1 &
+                        ) )
+              if( wantq )call stdlib_srot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, li( 1, 1 ),li( 2, 1 &
+                        ) )
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           else
+              ! case 2: swap 1-by-1 and 2-by-2 blocks, or 2-by-2
+                      ! and 2-by-2 blocks.
+              ! solve the generalized sylvester equation
+                       ! s11 * r - l * s22 = scale * s12
+                       ! t11 * r - l * t22 = scale * t12
+              ! for r and l. solutions in li and ir.
+              call stdlib_slacpy( 'FULL', n1, n2, t( 1, n1+1 ), ldst, li, ldst )
+              call stdlib_slacpy( 'FULL', n1, n2, s( 1, n1+1 ), ldst,ir( n2+1, n1+1 ), ldst )
+                        
+              call stdlib_stgsy2( 'N', 0, n1, n2, s, ldst, s( n1+1, n1+1 ), ldst,ir( n2+1, n1+1 ),&
+               ldst, t, ldst, t( n1+1, n1+1 ),ldst, li, ldst, scale, dsum, dscale, iwork, idum,&
+                         linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix ql:
+                          ! ql**t * li = [ tl ]
+                                       ! [ 0  ]
+              ! where
+                          ! li =  [      -l              ]
+                                ! [ scale * identity(n2) ]
+              do i = 1, n2
+                 call stdlib_sscal( n1, -one, li( 1, i ), 1 )
+                 li( n1+i, i ) = scale
+              end do
+              call stdlib_sgeqr2( m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_sorg2r( m, m, n2, li, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              ! compute orthogonal matrix rq:
+                          ! ir * rq**t =   [ 0  tr],
+               ! where ir = [ scale * identity(n1), r ]
+              do i = 1, n1
+                 ir( n2+i, i ) = scale
+              end do
+              call stdlib_sgerq2( n1, m, ir( n2+1, 1 ), ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_sorgr2( m, m, n1, ir, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              ! perform the swapping tentatively:
+              call stdlib_sgemm( 'T', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+              call stdlib_sgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, s,ldst )
+              call stdlib_sgemm( 'T', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+              call stdlib_sgemm( 'N', 'T', m, m, m, one, work, m, ir, ldst, zero, t,ldst )
+              call stdlib_slacpy( 'F', m, m, s, ldst, scpy, ldst )
+              call stdlib_slacpy( 'F', m, m, t, ldst, tcpy, ldst )
+              call stdlib_slacpy( 'F', m, m, ir, ldst, ircop, ldst )
+              call stdlib_slacpy( 'F', m, m, li, ldst, licop, ldst )
+              ! triangularize the b-part by an rq factorization.
+              ! apply transformation (from left) to a-part, giving s.
+              call stdlib_sgerq2( m, m, t, ldst, taur, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_sormr2( 'R', 'T', m, m, m, t, ldst, taur, s, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_sormr2( 'L', 'N', m, m, m, t, ldst, taur, ir, ldst, work,linfo )
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in brqa21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_slassq( n1, s( n2+1, i ), 1, dscale, dsum )
+              end do
+              brqa21 = dscale*sqrt( dsum )
+              ! triangularize the b-part by a qr factorization.
+              ! apply transformation (from right) to a-part, giving s.
+              call stdlib_sgeqr2( m, m, tcpy, ldst, taul, work, linfo )
+              if( linfo/=0 )go to 70
+              call stdlib_sorm2r( 'L', 'T', m, m, m, tcpy, ldst, taul, scpy, ldst,work, info )
+                        
+              call stdlib_sorm2r( 'R', 'N', m, m, m, tcpy, ldst, taul, licop, ldst,work, info )
+                        
+              if( linfo/=0 )go to 70
+              ! compute f-norm(s21) in bqra21. (t21 is 0.)
+              dscale = zero
+              dsum = one
+              do i = 1, n2
+                 call stdlib_slassq( n1, scpy( n2+1, i ), 1, dscale, dsum )
+              end do
+              bqra21 = dscale*sqrt( dsum )
+              ! decide which method to use.
+                ! weak stability test:
+                   ! f-norm(s21) <= o(eps * f-norm((s)))
+              if( bqra21<=brqa21 .and. bqra21<=thresha ) then
+                 call stdlib_slacpy( 'F', m, m, scpy, ldst, s, ldst )
+                 call stdlib_slacpy( 'F', m, m, tcpy, ldst, t, ldst )
+                 call stdlib_slacpy( 'F', m, m, ircop, ldst, ir, ldst )
+                 call stdlib_slacpy( 'F', m, m, licop, ldst, li, ldst )
+              else if( brqa21>=thresha ) then
+                 go to 70
+              end if
+              ! set lower triangle of b-part to zero
+              call stdlib_slaset( 'LOWER', m-1, m-1, zero, zero, t(2,1), ldst )
+              if( wands ) then
+                 ! strong stability test:
+                     ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                     ! and
+                     ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+                 call stdlib_slacpy( 'FULL', m, m, a( j1, j1 ), lda, work( m*m+1 ),m )
+                 call stdlib_sgemm( 'N', 'N', m, m, m, one, li, ldst, s, ldst, zero,work, m )
+                           
+                 call stdlib_sgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_slassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sa = dscale*sqrt( dsum )
+                 call stdlib_slacpy( 'FULL', m, m, b( j1, j1 ), ldb, work( m*m+1 ),m )
+                 call stdlib_sgemm( 'N', 'N', m, m, m, one, li, ldst, t, ldst, zero,work, m )
+                           
+                 call stdlib_sgemm( 'N', 'N', m, m, m, -one, work, m, ir, ldst, one,work( m*m+1 ),&
+                            m )
+                 dscale = zero
+                 dsum = one
+                 call stdlib_slassq( m*m, work( m*m+1 ), 1, dscale, dsum )
+                 sb = dscale*sqrt( dsum )
+                 strong = sa<=thresha .and. sb<=threshb
+                 if( .not.strong )go to 70
+              end if
+              ! if the swap is accepted ("weakly" and "strongly"), apply the
+              ! transformations and set n1-by-n2 (2,1)-block to zero.
+              call stdlib_slaset( 'FULL', n1, n2, zero, zero, s(n2+1,1), ldst )
+              ! copy back m-by-m diagonal block starting at index j1 of (a, b)
+              call stdlib_slacpy( 'F', m, m, s, ldst, a( j1, j1 ), lda )
+              call stdlib_slacpy( 'F', m, m, t, ldst, b( j1, j1 ), ldb )
+              call stdlib_slaset( 'FULL', ldst, ldst, zero, zero, t, ldst )
+              ! standardize existing 2-by-2 blocks.
+              call stdlib_slaset( 'FULL', m, m, zero, zero, work, m )
+              work( 1 ) = one
+              t( 1, 1 ) = one
+              idum = lwork - m*m - 2
+              if( n2>1 ) then
+                 call stdlib_slagv2( a( j1, j1 ), lda, b( j1, j1 ), ldb, ar, ai, be,work( 1 ), &
+                           work( 2 ), t( 1, 1 ), t( 2, 1 ) )
+                 work( m+1 ) = -work( 2 )
+                 work( m+2 ) = work( 1 )
+                 t( n2, n2 ) = t( 1, 1 )
+                 t( 1, 2 ) = -t( 2, 1 )
+              end if
+              work( m*m ) = one
+              t( m, m ) = one
+              if( n1>1 ) then
+                 call stdlib_slagv2( a( j1+n2, j1+n2 ), lda, b( j1+n2, j1+n2 ), ldb,taur, taul, &
+                 work( m*m+1 ), work( n2*m+n2+1 ),work( n2*m+n2+2 ), t( n2+1, n2+1 ),t( m, m-1 ) )
+                           
+                 work( m*m ) = work( n2*m+n2+1 )
+                 work( m*m-1 ) = -work( n2*m+n2+2 )
+                 t( m, m ) = t( n2+1, n2+1 )
+                 t( m-1, m ) = -t( m, m-1 )
+              end if
+              call stdlib_sgemm( 'T', 'N', n2, n1, n2, one, work, m, a( j1, j1+n2 ),lda, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_slacpy( 'FULL', n2, n1, work( m*m+1 ), n2, a( j1, j1+n2 ),lda )
+              call stdlib_sgemm( 'T', 'N', n2, n1, n2, one, work, m, b( j1, j1+n2 ),ldb, zero, &
+                        work( m*m+1 ), n2 )
+              call stdlib_slacpy( 'FULL', n2, n1, work( m*m+1 ), n2, b( j1, j1+n2 ),ldb )
+              call stdlib_sgemm( 'N', 'N', m, m, m, one, li, ldst, work, m, zero,work( m*m+1 ), m &
+                        )
+              call stdlib_slacpy( 'FULL', m, m, work( m*m+1 ), m, li, ldst )
+              call stdlib_sgemm( 'N', 'N', n2, n1, n1, one, a( j1, j1+n2 ), lda,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_slacpy( 'FULL', n2, n1, work, n2, a( j1, j1+n2 ), lda )
+              call stdlib_sgemm( 'N', 'N', n2, n1, n1, one, b( j1, j1+n2 ), ldb,t( n2+1, n2+1 ), &
+                        ldst, zero, work, n2 )
+              call stdlib_slacpy( 'FULL', n2, n1, work, n2, b( j1, j1+n2 ), ldb )
+              call stdlib_sgemm( 'T', 'N', m, m, m, one, ir, ldst, t, ldst, zero,work, m )
+              call stdlib_slacpy( 'FULL', m, m, work, m, ir, ldst )
+              ! accumulate transformations into q and z if requested.
+              if( wantq ) then
+                 call stdlib_sgemm( 'N', 'N', n, m, m, one, q( 1, j1 ), ldq, li,ldst, zero, work, &
+                           n )
+                 call stdlib_slacpy( 'FULL', n, m, work, n, q( 1, j1 ), ldq )
+              end if
+              if( wantz ) then
+                 call stdlib_sgemm( 'N', 'N', n, m, m, one, z( 1, j1 ), ldz, ir,ldst, zero, work, &
+                           n )
+                 call stdlib_slacpy( 'FULL', n, m, work, n, z( 1, j1 ), ldz )
+              end if
+              ! update (a(j1:j1+m-1, m+j1:n), b(j1:j1+m-1, m+j1:n)) and
+                      ! (a(1:j1-1, j1:j1+m), b(1:j1-1, j1:j1+m)).
+              i = j1 + m
+              if( i<=n ) then
+                 call stdlib_sgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,a( j1, i ), lda, zero, &
+                           work, m )
+                 call stdlib_slacpy( 'FULL', m, n-i+1, work, m, a( j1, i ), lda )
+                 call stdlib_sgemm( 'T', 'N', m, n-i+1, m, one, li, ldst,b( j1, i ), ldb, zero, &
+                           work, m )
+                 call stdlib_slacpy( 'FULL', m, n-i+1, work, m, b( j1, i ), ldb )
+              end if
+              i = j1 - 1
+              if( i>0 ) then
+                 call stdlib_sgemm( 'N', 'N', i, m, m, one, a( 1, j1 ), lda, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_slacpy( 'FULL', i, m, work, i, a( 1, j1 ), lda )
+                 call stdlib_sgemm( 'N', 'N', i, m, m, one, b( 1, j1 ), ldb, ir,ldst, zero, work, &
+                           i )
+                 call stdlib_slacpy( 'FULL', i, m, work, i, b( 1, j1 ), ldb )
+              end if
+              ! exit with info = 0 if swap was successfully performed.
+              return
+           end if
+           ! exit with info = 1 if swap was rejected.
+           70 continue
+           info = 1
+           return
+     end subroutine stdlib_stgex2
+
+     !> STGEXC: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A,B) using an orthogonal equivalence transformation
+     !> (A, B) = Q * (A, B) * Z**T,
+     !> so that the diagonal block of (A, B) with row index IFST is moved
+     !> to row ILST.
+     !> (A, B) must be in generalized real Schur canonical form (as returned
+     !> by SGGES), i.e. A is block upper triangular with 1-by-1 and 2-by-2
+     !> diagonal blocks. B is upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**T = Q(out) * A(out) * Z(out)**T
+     !> Q(in) * B(in) * Z(in)**T = Q(out) * B(out) * Z(out)**T
+
+     pure subroutine stdlib_stgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldq, ldz, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: here, lwmin, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+              else
+                 lwmin = 4*n + 16
+              end if
+              work(1) = lwmin
+              if (lwork<lwmin .and. .not.lquery) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGEXC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of the specified block and find out
+           ! if it is 1-by-1 or 2-by-2.
+           if( ifst>1 ) then
+              if( a( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( a( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out if it is 1-by-1 or 2-by-2.
+           if( ilst>1 ) then
+              if( a( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( a( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst.
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( a( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, nbf, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( a( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here+1, 1, &
+                           nbnext, work, lwork, info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, 1, &
+                              1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, nbnext, work, lwork,info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 1
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap with next one below.
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1-by-1 or 2-by-2.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, nbf, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2-by-2 block breaks into two 1-by-1 blocks.
+                 if( nbf==2 ) then
+                    if( a( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1-by-1 blocks, each of which
+                 ! must be swapped individually.
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( a( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here-nbnext, &
+                           nbnext, 1, work, lwork,info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1-by-1 blocks.
+                    call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, here, &
+                              nbnext, 1, work, lwork, info )
+                    if( info/=0 ) then
+                       ilst = here
+                       return
+                    end if
+                    here = here - 1
+                 else
+                   ! recompute nbnext in case of 2-by-2 split.
+                    if( a( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2-by-2 block did not split.
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here-1,&
+                                  2, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2-by-2 block did split.
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                       call stdlib_stgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz, here, &
+                                 1, 1, work, lwork, info )
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 1
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_stgexc
+
+     !> STGSEN: reorders the generalized real Schur decomposition of a real
+     !> matrix pair (A, B) (in terms of an orthonormal equivalence trans-
+     !> formation Q**T * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the upper quasi-triangular
+     !> matrix A and the upper triangular B. The leading columns of Q and
+     !> Z form orthonormal bases of the corresponding left and right eigen-
+     !> spaces (deflating subspaces). (A, B) must be in generalized real
+     !> Schur canonical form (as returned by SGGES), i.e. A is block upper
+     !> triangular with 1-by-1 and 2-by-2 diagonal blocks. B is upper
+     !> triangular.
+     !> STGSEN also computes the generalized eigenvalues
+     !> w(j) = (ALPHAR(j) + i*ALPHAI(j))/BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, STGSEN computes the estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_stgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alphar, alphai, &
+               beta, q, ldq, z, ldz, m, pl,pr, dif, work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), dif(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, kk, ks, liwmin, lwmin, mn2, n1, n2
+           real(sp) :: dscale, dsum, eps, rdscal, smlnum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSEN', -info )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           pair = .false.
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )==zero ) then
+                       if( select( k ) )m = m + 1
+                    else
+                       pair = .true.
+                       if( select( k ) .or. select( k+1 ) )m = m + 2
+                    end if
+                 else
+                    if( select( n ) )m = m + 1
+                 end if
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 4*n+16, 2*m*(n-m) )
+              liwmin = max( 1, n+6 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*n+16, 4*m*(n-m) )
+              liwmin = max( 1, 2*m*(n-m), n+6 )
+           else
+              lwmin = max( 1, 4*n+16 )
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -22
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -24
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_slassq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_slassq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 60
+           end if
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           pair = .false.
+           loop_30: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ! perform the reordering of diagonal blocks in (a, b)
+                    ! by orthogonal transformation matrices and update
+                    ! q and z accordingly (if requested):
+                    kk = k
+                    if( k/=ks )call stdlib_stgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq,z, ldz,&
+                               kk, ks, work, lwork, ierr )
+                    if( ierr>0 ) then
+                       ! swap is rejected: exit.
+                       info = 1
+                       if( wantp ) then
+                          pl = zero
+                          pr = zero
+                       end if
+                       if( wantd ) then
+                          dif( 1 ) = zero
+                          dif( 2 ) = zero
+                       end if
+                       go to 60
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_30
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l
+              ! and compute pl and pr.
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              ijb = 0
+              call stdlib_slacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_slacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              call stdlib_stgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto left
+              ! and right eigenspaces.
+              rdscal = zero
+              dsum = one
+              call stdlib_slassq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_slassq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates of difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu-estimate.
+                 call stdlib_stgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl-estimate.
+                 call stdlib_stgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( 2*n1*n2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_slacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_slacn2( mn2, work( mn2+1 ), work, iwork, dif( 1 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_stgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_stgsyl( 'T', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_slacn2( mn2, work( mn2+1 ), work, iwork, dif( 2 ),kase, isave )
+                           
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation.
+                       call stdlib_stgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_stgsyl( 'T', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( 2*n1*n2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           60 continue
+           ! compute generalized eigenvalues of reordered pair (a, b) and
+           ! normalize the generalized schur form.
+           pair = .false.
+           loop_70: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 if( k<n ) then
+                    if( a( k+1, k )/=zero ) then
+                       pair = .true.
+                    end if
+                 end if
+                 if( pair ) then
+                   ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_slag2( work, 2, work( 5 ), 2, smlnum*eps, beta( k ),beta( k+1 ), &
+                              alphar( k ), alphar( k+1 ),alphai( k ) )
+                    alphai( k+1 ) = -alphai( k )
+                 else
+                    if( sign( one, b( k, k ) )<zero ) then
+                       ! if b(k,k) is negative, make it positive
+                       do i = 1, n
+                          a( k, i ) = -a( k, i )
+                          b( k, i ) = -b( k, i )
+                          if( wantq ) q( i, k ) = -q( i, k )
+                       end do
+                    end if
+                    alphar( k ) = a( k, k )
+                    alphai( k ) = zero
+                    beta( k ) = b( k, k )
+                 end if
+              end if
+           end do loop_70
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_stgsen
+
+     !> STGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two real upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine SGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**T *A*Q = D1*( 0 R ),    V**T *B*Q = D2*( 0 R ),
+     !> where U, V and Q are orthogonal matrices.
+     !> R is a nonsingular upper triangular matrix, and D1 and D2 are
+     !> ``diagonal'' matrices, which are of the following structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the orthogonal transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_stgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(sp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           real(sp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(sp), parameter :: hugenum = huge(zero)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(sp) :: a1, a2, a3, b1, b2, b3, csq, csu, csv, error, gamma, rwk, snq, snu, snv, &
+                     ssmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_slaset( 'FULL', m, m, zero, one, u, ldu )
+           if( initv )call stdlib_slaset( 'FULL', p, p, zero, one, v, ldv )
+           if( initq )call stdlib_slaset( 'FULL', n, n, zero, one, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = zero
+                    a3 = zero
+                    if( k+i<=m )a1 = a( k+i, n-l+i )
+                    if( k+j<=m )a3 = a( k+j, n-l+j )
+                    b1 = b( i, n-l+i )
+                    b3 = b( j, n-l+j )
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_slags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**t *a
+                    if( k+j<=m )call stdlib_srot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, snu )
+                    ! update i-th and j-th rows of matrix b: v**t *b
+                    call stdlib_srot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, snv )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_srot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_srot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = zero
+                       b( i, n-l+j ) = zero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = zero
+                       b( j, n-l+i ) = zero
+                    end if
+                    ! update orthogonal matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_srot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_srot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_srot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_scopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_scopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_slapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = a( k+i, n-l+i )
+              b1 = b( i, n-l+i )
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 ! change sign if necessary
+                 if( gamma<zero ) then
+                    call stdlib_sscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_sscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_slartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_sscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_sscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_scopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_scopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_stgsja
+
+     !> STGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B) in
+     !> generalized real Schur canonical form (or of any matrix pair
+     !> (Q*A*Z**T, Q*B*Z**T) with orthogonal matrices Q and Z, where
+     !> Z**T denotes the transpose of Z.
+     !> (A, B) must be in generalized real Schur form (as returned by SGGES),
+     !> i.e. A is block upper triangular with 1-by-1 and 2-by-2 diagonal
+     !> blocks. B is upper triangular.
+
+     pure subroutine stdlib_stgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           real(sp), intent(out) :: dif(*), s(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: difdri = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, iz, k, ks, lwmin, n1, n2
+           real(sp) :: alphai, alphar, alprqt, beta, c1, c2, cond, eps, lnrm, rnrm, root1, root2, &
+                     scale, smlnum, tmpii, tmpir, tmpri, tmprr, uhav, uhavi, uhbv, uhbvi
+           ! Local Arrays 
+           real(sp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( a( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*( n + 2 ) + 16
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              ! determine whether a(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_20
+              else
+                 if( k<n )pair = a( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_20
+                 else
+                    if( .not.select( k ) )cycle loop_20
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( pair ) then
+                    ! complex eigenvalue pair.
+                    rnrm = stdlib_slapy2( stdlib_snrm2( n, vr( 1, ks ), 1 ),stdlib_snrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_slapy2( stdlib_snrm2( n, vl( 1, ks ), 1 ),stdlib_snrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    call stdlib_sgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_sdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_sgemv( 'N', n, n, one, a, lda, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_sdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    uhav = tmprr + tmpii
+                    uhavi = tmpir - tmpri
+                    call stdlib_sgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    tmprr = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    tmpri = stdlib_sdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    call stdlib_sgemv( 'N', n, n, one, b, ldb, vr( 1, ks+1 ), 1,zero, work, 1 )
+                              
+                    tmpii = stdlib_sdot( n, work, 1, vl( 1, ks+1 ), 1 )
+                    tmpir = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    uhbv = tmprr + tmpii
+                    uhbvi = tmpir - tmpri
+                    uhav = stdlib_slapy2( uhav, uhavi )
+                    uhbv = stdlib_slapy2( uhbv, uhbvi )
+                    cond = stdlib_slapy2( uhav, uhbv )
+                    s( ks ) = cond / ( rnrm*lnrm )
+                    s( ks+1 ) = s( ks )
+                 else
+                    ! real eigenvalue.
+                    rnrm = stdlib_snrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_snrm2( n, vl( 1, ks ), 1 )
+                    call stdlib_sgemv( 'N', n, n, one, a, lda, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhav = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    call stdlib_sgemv( 'N', n, n, one, b, ldb, vr( 1, ks ), 1, zero,work, 1 )
+                              
+                    uhbv = stdlib_sdot( n, work, 1, vl( 1, ks ), 1 )
+                    cond = stdlib_slapy2( uhav, uhbv )
+                    if( cond==zero ) then
+                       s( ks ) = -one
+                    else
+                       s( ks ) = cond / ( rnrm*lnrm )
+                    end if
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_slapy2( a( 1, 1 ), b( 1, 1 ) )
+                    cycle loop_20
+                 end if
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvectors.
+                 if( pair ) then
+                    ! copy the  2-by 2 pencil beginning at (a(k,k), b(k, k)).
+                    ! compute the eigenvalue(s) at position k.
+                    work( 1 ) = a( k, k )
+                    work( 2 ) = a( k+1, k )
+                    work( 3 ) = a( k, k+1 )
+                    work( 4 ) = a( k+1, k+1 )
+                    work( 5 ) = b( k, k )
+                    work( 6 ) = b( k+1, k )
+                    work( 7 ) = b( k, k+1 )
+                    work( 8 ) = b( k+1, k+1 )
+                    call stdlib_slag2( work, 2, work( 5 ), 2, smlnum*eps, beta,dummy1( 1 ), &
+                              alphar, dummy( 1 ), alphai )
+                    alprqt = one
+                    c1 = two*( alphar*alphar+alphai*alphai+beta*beta )
+                    c2 = four*beta*beta*alphai*alphai
+                    root1 = c1 + sqrt( c1*c1-4.0_sp*c2 )
+                    root2 = c2 / root1
+                    root1 = root1 / two
+                    cond = min( sqrt( root1 ), sqrt( root2 ) )
+                 end if
+                 ! copy the matrix (a, b) to the array work and swap the
+                 ! diagonal block beginning at a(k,k) to the (1,1) position.
+                 call stdlib_slacpy( 'FULL', n, n, a, lda, work, n )
+                 call stdlib_slacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_stgexc( .false., .false., n, work, n, work( n*n+1 ), n,dummy, 1, &
+                           dummy1, 1, ifst, ilst,work( n*n*2+1 ), lwork-2*n*n, ierr )
+                 if( ierr>0 ) then
+                    ! ill-conditioned problem - swap rejected.
+                    dif( ks ) = zero
+                 else
+                    ! reordering successful, solve generalized sylvester
+                    ! equation for r and l,
+                               ! a22 * r - l * a11 = a12
+                               ! b22 * r - l * b11 = b12,
+                    ! and compute estimate of difl((a11,b11), (a22, b22)).
+                    n1 = 1
+                    if( work( 2 )/=zero )n1 = 2
+                    n2 = n - n1
+                    if( n2==0 ) then
+                       dif( ks ) = cond
+                    else
+                       i = n*n + 1
+                       iz = 2*n*n + 1
+                       call stdlib_stgsyl( 'N', difdri, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ),work( iz+1 ), lwork-2*n*n, iwork, ierr )
+                       if( pair )dif( ks ) = min( max( one, alprqt )*dif( ks ),cond )
+                    end if
+                 end if
+                 if( pair )dif( ks+1 ) = dif( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_stgsna
+
+     !> STPLQT: computes a blocked LQ factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_stplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_stplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_stprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_stplqt
+
+     !> STPQRT: computes a blocked QR factorization of a real
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_stpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_stpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h^h to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_stprfb( 'L', 'T', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_stpqrt
+
+     !> STREVC: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by SHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+
+     pure subroutine stdlib_strevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(sp), intent(in) :: t(ldt,*)
+           real(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, n2
+           real(sp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(sp) :: x(2,2)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_slabad( unfl, ovfl )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+           n2 = 2*n
+           if( rightv ) then
+              ! compute right eigenvectors.
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==1 )go to 130
+                 if( ki==1 )go to 40
+                 if( t( ki, ki-1 )==zero )go to 40
+                 ip = -1
+                 40 continue
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )go to 130
+                    else
+                       if( .not.select( ki-1 ) )go to 130
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real right eigenvector
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = 1, ki - 1
+                       work( k+n ) = -t( k, ki )
+                    end do
+                    ! solve the upper quasi-triangular system:
+                       ! (t(1:ki-1,1:ki-1) - wr)*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_slaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( ki, scale, work( 1+n ), 1 )
+                          work( j+n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_saxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_slaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( ki, scale, work( 1+n ), 1 )
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_saxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_saxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_scopy( ki, work( 1+n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_isamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_sscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>1 )call stdlib_sgemv( 'N', n, ki-1, one, vr, ldvr,work( 1+n ), 1, &
+                                 work( ki+n ),vr( 1, ki ), 1 )
+                       ii = stdlib_isamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_sscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex right eigenvector.
+                    ! initial solve
+                      ! [ (t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i* wi)]*x = 0.
+                      ! [ (t(ki,ki-1)   t(ki,ki)   )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1+n ) = one
+                       work( ki+n2 ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1+n ) = -wi / t( ki, ki-1 )
+                       work( ki+n2 ) = one
+                    end if
+                    work( ki+n ) = zero
+                    work( ki-1+n2 ) = zero
+                    ! form right-hand side
+                    do k = 1, ki - 2
+                       work( k+n ) = -work( ki-1+n )*t( k, ki-1 )
+                       work( k+n2 ) = -work( ki+n2 )*t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! (t(1:ki-2,1:ki-2) - (wr+i*wi))*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1 = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_slaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr, wi,x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_sscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_saxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_saxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_slaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+n ), n, wr, wi, x, 2, scale,xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( ki, scale, work( 1+n ), 1 )
+                             call stdlib_sscal( ki, scale, work( 1+n2 ), 1 )
+                          end if
+                          work( j-1+n ) = x( 1, 1 )
+                          work( j+n ) = x( 2, 1 )
+                          work( j-1+n2 ) = x( 1, 2 )
+                          work( j+n2 ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_saxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_saxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+n ), 1 )
+                                    
+                          call stdlib_saxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+n2 ), 1 )
+                                    
+                          call stdlib_saxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+n2 ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       call stdlib_scopy( ki, work( 1+n ), 1, vr( 1, is-1 ), 1 )
+                       call stdlib_scopy( ki, work( 1+n2 ), 1, vr( 1, is ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_sscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is ) = zero
+                       end do
+                    else
+                       if( ki>2 ) then
+                          call stdlib_sgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n ), 1, work( ki-&
+                                    1+n ),vr( 1, ki-1 ), 1 )
+                          call stdlib_sgemv( 'N', n, ki-2, one, vr, ldvr,work( 1+n2 ), 1, work( &
+                                    ki+n2 ),vr( 1, ki ), 1 )
+                       else
+                          call stdlib_sscal( n, work( ki-1+n ), vr( 1, ki-1 ), 1 )
+                          call stdlib_sscal( n, work( ki+n2 ), vr( 1, ki ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_sscal( n, remax, vr( 1, ki ), 1 )
+                    end if
+                 end if
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+                 130 continue
+                 if( ip==1 )ip = 0
+                 if( ip==-1 )ip = 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==-1 )go to 250
+                 if( ki==n )go to 150
+                 if( t( ki+1, ki )==zero )go to 150
+                 ip = 1
+                 150 continue
+                 if( somev ) then
+                    if( .not.select( ki ) )go to 250
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! real left eigenvector.
+                    work( ki+n ) = one
+                    ! form right-hand side
+                    do k = ki + 1, n
+                       work( k+n ) = -t( ki, k )
+                    end do
+                    ! solve the quasi-triangular system:
+                       ! (t(ki+1:n,ki+1:n) - wr)**t*x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_sdot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          ! solve (t(j,j)-wr)**t*x = work
+                          call stdlib_slaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_sdot( j-ki-1, t( ki+1, j ), 1,work( &
+                                    ki+1+n ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_sdot( j-ki-1, t( ki+1, j+1 ), 1,&
+                                    work( ki+1+n ), 1 )
+                          ! solve
+                            ! [t(j,j)-wr   t(j,j+1)     ]**t* x = scale*( work1 )
+                            ! [t(j+1,j)    t(j+1,j+1)-wr]               ( work2 )
+                          call stdlib_slaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( n-ki+1, scale, work( ki+n ), 1 )
+                                    
+                          work( j+n ) = x( 1, 1 )
+                          work( j+1+n ) = x( 2, 1 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+1+n ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_scopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       ii = stdlib_isamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else
+                       if( ki<n )call stdlib_sgemv( 'N', n, n-ki, one, vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1+n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                       ii = stdlib_isamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_sscal( n, remax, vl( 1, ki ), 1 )
+                    end if
+                 else
+                    ! complex left eigenvector.
+                     ! initial solve:
+                       ! ((t(ki,ki)    t(ki,ki+1) )**t - (wr - i* wi))*x = 0.
+                       ! ((t(ki+1,ki) t(ki+1,ki+1))                )
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki+n ) = wi / t( ki, ki+1 )
+                       work( ki+1+n2 ) = one
+                    else
+                       work( ki+n ) = one
+                       work( ki+1+n2 ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1+n ) = zero
+                    work( ki+n2 ) = zero
+                    ! form right-hand side
+                    do k = ki + 2, n
+                       work( k+n ) = -work( ki+n )*t( ki, k )
+                       work( k+n2 ) = -work( ki+1+n2 )*t( ki+1, k )
+                    end do
+                    ! solve complex quasi-triangular system:
+                    ! ( t(ki+2,n:ki+2,n) - (wr-i*wi) )*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_sdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_sdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          ! solve (t(j,j)-(wr-i*wi))*(x11+i*x12)= wk+i*wk2
+                          call stdlib_slaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_sscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          vmax = max( abs( work( j+n ) ),abs( work( j+n2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n ), 1 )
+                             call stdlib_sscal( n-ki+1, rec, work( ki+n2 ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+n ) = work( j+n ) -stdlib_sdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n ), 1 )
+                          work( j+n2 ) = work( j+n2 ) -stdlib_sdot( j-ki-2, t( ki+2, j ), 1,work( &
+                                    ki+2+n2 ), 1 )
+                          work( j+1+n ) = work( j+1+n ) -stdlib_sdot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n ), 1 )
+                          work( j+1+n2 ) = work( j+1+n2 ) -stdlib_sdot( j-ki-2, t( ki+2, j+1 ), 1,&
+                                    work( ki+2+n2 ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                            ! ([t(j,j)   t(j,j+1)  ]**t-(wr-i*wi)*i)*x = scale*b
+                            ! ([t(j+1,j) t(j+1,j+1)]               )
+                          call stdlib_slaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( n-ki+1, scale, work( ki+n ), 1 )
+                             call stdlib_sscal( n-ki+1, scale, work( ki+n2 ), 1 )
+                          end if
+                          work( j+n ) = x( 1, 1 )
+                          work( j+n2 ) = x( 1, 2 )
+                          work( j+1+n ) = x( 2, 1 )
+                          work( j+1+n2 ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ), vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       call stdlib_scopy( n-ki+1, work( ki+n ), 1, vl( ki, is ), 1 )
+                       call stdlib_scopy( n-ki+1, work( ki+n2 ), 1, vl( ki, is+1 ),1 )
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else
+                       if( ki<n-1 ) then
+                          call stdlib_sgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n ), 1, work( ki+n ),vl( 1, ki ), 1 )
+                          call stdlib_sgemv( 'N', n, n-ki-1, one, vl( 1, ki+2 ),ldvl, work( ki+2+&
+                                    n2 ), 1,work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_sscal( n, work( ki+n ), vl( 1, ki ), 1 )
+                          call stdlib_sscal( n, work( ki+1+n2 ), vl( 1, ki+1 ), 1 )
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n, remax, vl( 1, ki ), 1 )
+                       call stdlib_sscal( n, remax, vl( 1, ki+1 ), 1 )
+                    end if
+                 end if
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+                 250 continue
+                 if( ip==-1 )ip = 0
+                 if( ip==1 )ip = -1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_strevc
+
+     !> STREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a real upper quasi-triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a real general matrix:  A = Q*T*Q**T, as computed by SHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**T)*T = w*(y**T)
+     !> where y**T denotes the transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal blocks of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the orthogonal factor that reduces a matrix
+     !> A to Schur form T, then Q*X and Q*Y are the matrices of right and
+     !> left eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_strevc3( side, howmny, select, n, t, ldt, vl, ldvl,vr, ldvr, mm, m, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           real(sp), intent(in) :: t(ldt,*)
+           real(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, pair, rightv, somev
+           integer(ilp) :: i, ierr, ii, ip, is, j, j1, j2, jnxt, k, ki, iv, maxwrk, nb, &
+                     ki2
+           real(sp) :: beta, bignum, emax, ovfl, rec, remax, scale, smin, smlnum, ulp, unfl, &
+                     vcrit, vmax, wi, wr, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Local Arrays 
+           real(sp) :: x(2,2)
+           integer(ilp) :: iscomplex(nbmax)
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'STREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           lquery = ( lwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( lwork<max( 1, 3*n ) .and. .not.lquery ) then
+              info = -14
+           else
+              ! set m to the number of columns required to store the selected
+              ! eigenvectors, standardize the array select if necessary, and
+              ! test mm.
+              if( somev ) then
+                 m = 0
+                 pair = .false.
+                 do j = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       select( j ) = .false.
+                    else
+                       if( j<n ) then
+                          if( t( j+1, j )==zero ) then
+                             if( select( j ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( j ) .or. select( j+1 ) ) then
+                                select( j ) = .true.
+                                m = m + 2
+                             end if
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_slaset( 'F', n, 1+2*nb, zero, zero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_slabad( unfl, ovfl )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           work( 1 ) = zero
+           do j = 2, n
+              work( j ) = zero
+              do i = 1, j - 1
+                 work( j ) = work( j ) + abs( t( i, j ) )
+              end do
+           end do
+           ! index ip is used to specify the real or complex eigenvalue:
+             ! ip = 0, real eigenvalue,
+                  ! 1, first  of conjugate complex pair: (wr,wi)
+                 ! -1, second of conjugate complex pair: (wr,wi)
+             ! iscomplex array stores ip for each column in current block.
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex right vector, uses iv-1 for real part and iv for complex part.
+              ! non-blocked version always uses iv=2;
+              ! blocked     version starts with iv=nb, goes down to 1 or 2.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 2
+              if( nb>2 ) then
+                 iv = nb
+              end if
+              ip = 0
+              is = m
+              loop_140: do ki = n, 1, -1
+                 if( ip==-1 ) then
+                    ! previous iteration (ki+1) was second of conjugate pair,
+                    ! so this ki is first of conjugate pair; skip to end of loop
+                    ip = 1
+                    cycle loop_140
+                 else if( ki==1 ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki, ki-1 )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is second of conjugate pair
+                    ip = -1
+                 end if
+                 if( somev ) then
+                    if( ip==0 ) then
+                       if( .not.select( ki ) )cycle loop_140
+                    else
+                       if( .not.select( ki-1 ) )cycle loop_140
+                    end if
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki-1 ) ) )*sqrt( abs( t( ki-1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real right eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = -t( k, ki )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-1,1:ki-1) - wr ]*x = scale*work.
+                    jnxt = ki - 1
+                    loop_60: do j = ki - 1, 1, -1
+                       if( j>jnxt )cycle loop_60
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_slaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          ! update right-hand side
+                          call stdlib_saxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_slaln2( .false., 2, 1, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+iv*n ), n, wr, zero, x, 2,scale, xnorm, ierr )
+                          ! scale x(1,1) and x(2,1) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 2, 1 ) = x( 2, 1 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( ki, scale, work( 1+iv*n ), 1 )
+                                    
+                          work( j-1+iv*n ) = x( 1, 1 )
+                          work( j  +iv*n ) = x( 2, 1 )
+                          ! update right-hand side
+                          call stdlib_saxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+iv*n ), 1 )
+                                    
+                          call stdlib_saxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+iv*n ), 1 )
+                                    
+                       end if
+                    end do loop_60
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_scopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                       ii = stdlib_isamax( ki, vr( 1, is ), 1 )
+                       remax = one / abs( vr( ii, is ) )
+                       call stdlib_sscal( ki, remax, vr( 1, is ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>1 )call stdlib_sgemv( 'N', n, ki-1, one, vr, ldvr,work( 1 + iv*n ), &
+                                 1, work( ki + iv*n ),vr( 1, ki ), 1 )
+                       ii = stdlib_isamax( n, vr( 1, ki ), 1 )
+                       remax = one / abs( vr( ii, ki ) )
+                       call stdlib_sscal( n, remax, vr( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex right eigenvector.
+                    ! initial solve
+                    ! [ ( t(ki-1,ki-1) t(ki-1,ki) ) - (wr + i*wi) ]*x = 0.
+                    ! [ ( t(ki,  ki-1) t(ki,  ki) )               ]
+                    if( abs( t( ki-1, ki ) )>=abs( t( ki, ki-1 ) ) ) then
+                       work( ki-1 + (iv-1)*n ) = one
+                       work( ki   + (iv  )*n ) = wi / t( ki-1, ki )
+                    else
+                       work( ki-1 + (iv-1)*n ) = -wi / t( ki, ki-1 )
+                       work( ki   + (iv  )*n ) = one
+                    end if
+                    work( ki   + (iv-1)*n ) = zero
+                    work( ki-1 + (iv  )*n ) = zero
+                    ! form right-hand side.
+                    do k = 1, ki - 2
+                       work( k+(iv-1)*n ) = -work( ki-1+(iv-1)*n )*t(k,ki-1)
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(k,ki  )
+                    end do
+                    ! solve upper quasi-triangular system:
+                    ! [ t(1:ki-2,1:ki-2) - (wr+i*wi) ]*x = scale*(work+i*work2)
+                    jnxt = ki - 2
+                    loop_90: do j = ki - 2, 1, -1
+                       if( j>jnxt )cycle loop_90
+                       j1 = j
+                       j2 = j
+                       jnxt = j - 1
+                       if( j>1 ) then
+                          if( t( j, j-1 )/=zero ) then
+                             j1   = j - 1
+                             jnxt = j - 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          call stdlib_slaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+(iv-1)*n ), n,wr, wi, x, 2, scale, xnorm, ierr )
+                          ! scale x(1,1) and x(1,2) to avoid overflow when
+                          ! updating the right-hand side.
+                          if( xnorm>one ) then
+                             if( work( j )>bignum / xnorm ) then
+                                x( 1, 1 ) = x( 1, 1 ) / xnorm
+                                x( 1, 2 ) = x( 1, 2 ) / xnorm
+                                scale = scale / xnorm
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_sscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j+(iv-1)*n ) = x( 1, 1 )
+                          work( j+(iv  )*n ) = x( 1, 2 )
+                          ! update the right-hand side
+                          call stdlib_saxpy( j-1, -x( 1, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n ), 1 )
+                                    
+                          call stdlib_saxpy( j-1, -x( 1, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       else
+                          ! 2-by-2 diagonal block
+                          call stdlib_slaln2( .false., 2, 2, smin, one,t( j-1, j-1 ), ldt, one, &
+                                    one,work( j-1+(iv-1)*n ), n, wr, wi, x, 2,scale, xnorm, ierr )
+                          ! scale x to avoid overflow when updating
+                          ! the right-hand side.
+                          if( xnorm>one ) then
+                             beta = max( work( j-1 ), work( j ) )
+                             if( beta>bignum / xnorm ) then
+                                rec = one / xnorm
+                                x( 1, 1 ) = x( 1, 1 )*rec
+                                x( 1, 2 ) = x( 1, 2 )*rec
+                                x( 2, 1 ) = x( 2, 1 )*rec
+                                x( 2, 2 ) = x( 2, 2 )*rec
+                                scale = scale*rec
+                             end if
+                          end if
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( ki, scale, work( 1+(iv-1)*n ), 1 )
+                             call stdlib_sscal( ki, scale, work( 1+(iv  )*n ), 1 )
+                          end if
+                          work( j-1+(iv-1)*n ) = x( 1, 1 )
+                          work( j  +(iv-1)*n ) = x( 2, 1 )
+                          work( j-1+(iv  )*n ) = x( 1, 2 )
+                          work( j  +(iv  )*n ) = x( 2, 2 )
+                          ! update the right-hand side
+                          call stdlib_saxpy( j-2, -x( 1, 1 ), t( 1, j-1 ), 1,work( 1+(iv-1)*n   ),&
+                                     1 )
+                          call stdlib_saxpy( j-2, -x( 2, 1 ), t( 1, j ), 1,work( 1+(iv-1)*n   ), &
+                                    1 )
+                          call stdlib_saxpy( j-2, -x( 1, 2 ), t( 1, j-1 ), 1,work( 1+(iv  )*n ), &
+                                    1 )
+                          call stdlib_saxpy( j-2, -x( 2, 2 ), t( 1, j ), 1,work( 1+(iv  )*n ), 1 )
+                                    
+                       end if
+                    end do loop_90
+                    ! copy the vector x or q*x to vr and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vr and normalize.
+                       call stdlib_scopy( ki, work( 1+(iv-1)*n ), 1, vr(1,is-1), 1 )
+                       call stdlib_scopy( ki, work( 1+(iv  )*n ), 1, vr(1,is  ), 1 )
+                       emax = zero
+                       do k = 1, ki
+                          emax = max( emax, abs( vr( k, is-1 ) )+abs( vr( k, is   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( ki, remax, vr( 1, is-1 ), 1 )
+                       call stdlib_sscal( ki, remax, vr( 1, is   ), 1 )
+                       do k = ki + 1, n
+                          vr( k, is-1 ) = zero
+                          vr( k, is   ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki>2 ) then
+                          call stdlib_sgemv( 'N', n, ki-2, one, vr, ldvr,work( 1    + (iv-1)*n ), &
+                                    1,work( ki-1 + (iv-1)*n ), vr(1,ki-1), 1)
+                          call stdlib_sgemv( 'N', n, ki-2, one, vr, ldvr,work( 1  + (iv)*n ), 1,&
+                                    work( ki + (iv)*n ), vr( 1, ki ), 1 )
+                       else
+                          call stdlib_sscal( n, work(ki-1+(iv-1)*n), vr(1,ki-1), 1)
+                          call stdlib_sscal( n, work(ki  +(iv  )*n), vr(1,ki  ), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vr( k, ki-1 ) )+abs( vr( k, ki   ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n, remax, vr( 1, ki-1 ), 1 )
+                       call stdlib_sscal( n, remax, vr( 1, ki   ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out below vector
+                       do k = ki + 1, n
+                          work( k + (iv-1)*n ) = zero
+                          work( k + (iv  )*n ) = zero
+                       end do
+                       iscomplex( iv-1 ) = -ip
+                       iscomplex( iv   ) =  ip
+                       iv = iv - 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki-1 and ki)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki - 1
+                    end if
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv<=2) .or. (ki2==1) ) then
+                       call stdlib_sgemm( 'N', 'N', n, nb-iv+1, ki2+nb-iv, one,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,zero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          if( iscomplex(k)==0 ) then
+                             ! real eigenvector
+                             ii = stdlib_isamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1 ) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_sscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_slacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki2 ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is - 1
+                 if( ip/=0 )is = is - 1
+              end do loop_140
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! for complex left vector, uses iv for real part and iv+1 for complex part.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb-1 or nb.
+              ! (note the "0-th" column is used for 1-norms computed above.)
+              iv = 1
+              ip = 0
+              is = 1
+              loop_260: do ki = 1, n
+                 if( ip==1 ) then
+                    ! previous iteration (ki-1) was first of conjugate pair,
+                    ! so this ki is second of conjugate pair; skip to end of loop
+                    ip = -1
+                    cycle loop_260
+                 else if( ki==n ) then
+                    ! last column, so this ki must be real eigenvalue
+                    ip = 0
+                 else if( t( ki+1, ki )==zero ) then
+                    ! zero on sub-diagonal, so this ki is real eigenvalue
+                    ip = 0
+                 else
+                    ! non-zero on sub-diagonal, so this ki is first of conjugate pair
+                    ip = 1
+                 end if
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_260
+                 end if
+                 ! compute the ki-th eigenvalue (wr,wi).
+                 wr = t( ki, ki )
+                 wi = zero
+                 if( ip/=0 )wi = sqrt( abs( t( ki, ki+1 ) ) )*sqrt( abs( t( ki+1, ki ) ) )
+                 smin = max( ulp*( abs( wr )+abs( wi ) ), smlnum )
+                 if( ip==0 ) then
+                    ! --------------------------------------------------------
+                    ! real left eigenvector
+                    work( ki + iv*n ) = one
+                    ! form right-hand side.
+                    do k = ki + 1, n
+                       work( k + iv*n ) = -t( ki, k )
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+1:n,ki+1:n) - wr ]**t * x = scale*work
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 1
+                    loop_170: do j = ki + 1, n
+                       if( j<jnxt )cycle loop_170
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_sdot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          ! solve [ t(j,j) - wr ]**t * x = work
+                          call stdlib_slaln2( .false., 1, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j+iv*n ) = x( 1, 1 )
+                          vmax = max( abs( work( j+iv*n ) ), vmax )
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work( ki+iv*n ), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+iv*n ) = work( j+iv*n ) -stdlib_sdot( j-ki-1, t( ki+1, j ), 1,&
+                                    work( ki+1+iv*n ), 1 )
+                          work( j+1+iv*n ) = work( j+1+iv*n ) -stdlib_sdot( j-ki-1, t( ki+1, j+1 )&
+                                    , 1,work( ki+1+iv*n ), 1 )
+                          ! solve
+                          ! [ t(j,j)-wr   t(j,j+1)      ]**t * x = scale*( work1 )
+                          ! [ t(j+1,j)    t(j+1,j+1)-wr ]                ( work2 )
+                          call stdlib_slaln2( .true., 2, 1, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,zero, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one )call stdlib_sscal( n-ki+1, scale, work( ki+iv*n ), 1 )
+                                    
+                          work( j  +iv*n ) = x( 1, 1 )
+                          work( j+1+iv*n ) = x( 2, 1 )
+                          vmax = max( abs( work( j  +iv*n ) ),abs( work( j+1+iv*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_170
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_scopy( n-ki+1, work( ki + iv*n ), 1,vl( ki, is ), 1 )
+                       ii = stdlib_isamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                       remax = one / abs( vl( ii, is ) )
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n )call stdlib_sgemv( 'N', n, n-ki, one,vl( 1, ki+1 ), ldvl,work( &
+                                 ki+1 + iv*n ), 1,work( ki   + iv*n ), vl( 1, ki ), 1 )
+                       ii = stdlib_isamax( n, vl( 1, ki ), 1 )
+                       remax = one / abs( vl( ii, ki ) )
+                       call stdlib_sscal( n, remax, vl( 1, ki ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + iv*n ) = zero
+                       end do
+                       iscomplex( iv ) = ip
+                       ! back-transform and normalization is done below
+                    end if
+                 else
+                    ! --------------------------------------------------------
+                    ! complex left eigenvector.
+                    ! initial solve:
+                    ! [ ( t(ki,ki)    t(ki,ki+1)  )**t - (wr - i* wi) ]*x = 0.
+                    ! [ ( t(ki+1,ki) t(ki+1,ki+1) )                   ]
+                    if( abs( t( ki, ki+1 ) )>=abs( t( ki+1, ki ) ) ) then
+                       work( ki   + (iv  )*n ) = wi / t( ki, ki+1 )
+                       work( ki+1 + (iv+1)*n ) = one
+                    else
+                       work( ki   + (iv  )*n ) = one
+                       work( ki+1 + (iv+1)*n ) = -wi / t( ki+1, ki )
+                    end if
+                    work( ki+1 + (iv  )*n ) = zero
+                    work( ki   + (iv+1)*n ) = zero
+                    ! form right-hand side.
+                    do k = ki + 2, n
+                       work( k+(iv  )*n ) = -work( ki  +(iv  )*n )*t(ki,  k)
+                       work( k+(iv+1)*n ) = -work( ki+1+(iv+1)*n )*t(ki+1,k)
+                    end do
+                    ! solve transposed quasi-triangular system:
+                    ! [ t(ki+2:n,ki+2:n)**t - (wr-i*wi) ]*x = work1+i*work2
+                    vmax = one
+                    vcrit = bignum
+                    jnxt = ki + 2
+                    loop_200: do j = ki + 2, n
+                       if( j<jnxt )cycle loop_200
+                       j1 = j
+                       j2 = j
+                       jnxt = j + 1
+                       if( j<n ) then
+                          if( t( j+1, j )/=zero ) then
+                             j2 = j + 1
+                             jnxt = j + 2
+                          end if
+                       end if
+                       if( j1==j2 ) then
+                          ! 1-by-1 diagonal block
+                          ! scale if necessary to avoid overflow when
+                          ! forming the right-hand side elements.
+                          if( work( j )>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_sscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j+(iv  )*n ) = work( j+(iv)*n ) -stdlib_sdot( j-ki-2, t( ki+2, j )&
+                                    , 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_sdot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve [ t(j,j)-(wr-i*wi) ]*(x11+i*x12)= wk+i*wk2
+                          call stdlib_slaln2( .false., 1, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_sscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j+(iv  )*n ) = x( 1, 1 )
+                          work( j+(iv+1)*n ) = x( 1, 2 )
+                          vmax = max( abs( work( j+(iv  )*n ) ),abs( work( j+(iv+1)*n ) ), vmax )
+                                    
+                          vcrit = bignum / vmax
+                       else
+                          ! 2-by-2 diagonal block
+                          ! scale if necessary to avoid overflow when forming
+                          ! the right-hand side elements.
+                          beta = max( work( j ), work( j+1 ) )
+                          if( beta>vcrit ) then
+                             rec = one / vmax
+                             call stdlib_sscal( n-ki+1, rec, work(ki+(iv  )*n), 1 )
+                             call stdlib_sscal( n-ki+1, rec, work(ki+(iv+1)*n), 1 )
+                             vmax = one
+                             vcrit = bignum
+                          end if
+                          work( j  +(iv  )*n ) = work( j+(iv)*n ) -stdlib_sdot( j-ki-2, t( ki+2, &
+                                    j ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j  +(iv+1)*n ) = work( j+(iv+1)*n ) -stdlib_sdot( j-ki-2, t( ki+2,&
+                                     j ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          work( j+1+(iv  )*n ) = work( j+1+(iv)*n ) -stdlib_sdot( j-ki-2, t( ki+2,&
+                                     j+1 ), 1,work( ki+2+(iv)*n ), 1 )
+                          work( j+1+(iv+1)*n ) = work( j+1+(iv+1)*n ) -stdlib_sdot( j-ki-2, t( ki+&
+                                    2, j+1 ), 1,work( ki+2+(iv+1)*n ), 1 )
+                          ! solve 2-by-2 complex linear equation
+                          ! [ (t(j,j)   t(j,j+1)  )**t - (wr-i*wi)*i ]*x = scale*b
+                          ! [ (t(j+1,j) t(j+1,j+1))                  ]
+                          call stdlib_slaln2( .true., 2, 2, smin, one, t( j, j ),ldt, one, one, &
+                                    work( j+iv*n ), n, wr,-wi, x, 2, scale, xnorm, ierr )
+                          ! scale if necessary
+                          if( scale/=one ) then
+                             call stdlib_sscal( n-ki+1, scale, work(ki+(iv  )*n), 1)
+                             call stdlib_sscal( n-ki+1, scale, work(ki+(iv+1)*n), 1)
+                          end if
+                          work( j  +(iv  )*n ) = x( 1, 1 )
+                          work( j  +(iv+1)*n ) = x( 1, 2 )
+                          work( j+1+(iv  )*n ) = x( 2, 1 )
+                          work( j+1+(iv+1)*n ) = x( 2, 2 )
+                          vmax = max( abs( x( 1, 1 ) ), abs( x( 1, 2 ) ),abs( x( 2, 1 ) ), abs( x(&
+                                     2, 2 ) ),vmax )
+                          vcrit = bignum / vmax
+                       end if
+                    end do loop_200
+                    ! copy the vector x or q*x to vl and normalize.
+                    if( .not.over ) then
+                       ! ------------------------------
+                       ! no back-transform: copy x to vl and normalize.
+                       call stdlib_scopy( n-ki+1, work( ki + (iv  )*n ), 1,vl( ki, is   ), 1 )
+                                 
+                       call stdlib_scopy( n-ki+1, work( ki + (iv+1)*n ), 1,vl( ki, is+1 ), 1 )
+                                 
+                       emax = zero
+                       do k = ki, n
+                          emax = max( emax, abs( vl( k, is   ) )+abs( vl( k, is+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is   ), 1 )
+                       call stdlib_sscal( n-ki+1, remax, vl( ki, is+1 ), 1 )
+                       do k = 1, ki - 1
+                          vl( k, is   ) = zero
+                          vl( k, is+1 ) = zero
+                       end do
+                    else if( nb==1 ) then
+                       ! ------------------------------
+                       ! version 1: back-transform each vector with gemv, q*x.
+                       if( ki<n-1 ) then
+                          call stdlib_sgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv)*n ), 1,work( ki   + (iv)*n ),vl( 1, ki ), 1 )
+                          call stdlib_sgemv( 'N', n, n-ki-1, one,vl( 1, ki+2 ), ldvl,work( ki+2 + &
+                                    (iv+1)*n ), 1,work( ki+1 + (iv+1)*n ),vl( 1, ki+1 ), 1 )
+                       else
+                          call stdlib_sscal( n, work(ki+  (iv  )*n), vl(1, ki  ), 1)
+                          call stdlib_sscal( n, work(ki+1+(iv+1)*n), vl(1, ki+1), 1)
+                       end if
+                       emax = zero
+                       do k = 1, n
+                          emax = max( emax, abs( vl( k, ki   ) )+abs( vl( k, ki+1 ) ) )
+                       end do
+                       remax = one / emax
+                       call stdlib_sscal( n, remax, vl( 1, ki   ), 1 )
+                       call stdlib_sscal( n, remax, vl( 1, ki+1 ), 1 )
+                    else
+                       ! ------------------------------
+                       ! version 2: back-transform block of vectors with gemm
+                       ! zero out above vector
+                       ! could go from ki-nv+1 to ki-1
+                       do k = 1, ki - 1
+                          work( k + (iv  )*n ) = zero
+                          work( k + (iv+1)*n ) = zero
+                       end do
+                       iscomplex( iv   ) =  ip
+                       iscomplex( iv+1 ) = -ip
+                       iv = iv + 1
+                       ! back-transform and normalization is done below
+                    end if
+                 end if
+                 if( nb>1 ) then
+                    ! --------------------------------------------------------
+                    ! blocked version of back-transform
+                    ! for complex case, ki2 includes both vectors (ki and ki+1)
+                    if( ip==0 ) then
+                       ki2 = ki
+                    else
+                       ki2 = ki + 1
+                    end if
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb-1 or nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv>=nb-1) .or. (ki2==n) ) then
+                       call stdlib_sgemm( 'N', 'N', n, iv, n-ki2+iv, one,vl( 1, ki2-iv+1 ), ldvl,&
+                                 work( ki2-iv+1 + (1)*n ), n,zero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          if( iscomplex(k)==0) then
+                             ! real eigenvector
+                             ii = stdlib_isamax( n, work( 1 + (nb+k)*n ), 1 )
+                             remax = one / abs( work( ii + (nb+k)*n ) )
+                          else if( iscomplex(k)==1) then
+                             ! first eigenvector of conjugate pair
+                             emax = zero
+                             do ii = 1, n
+                                emax = max( emax,abs( work( ii + (nb+k  )*n ) )+abs( work( ii + (&
+                                          nb+k+1)*n ) ) )
+                             end do
+                             remax = one / emax
+                          ! else if iscomplex(k)==-1
+                             ! second eigenvector of conjugate pair
+                             ! reuse same remax as previous k
+                          end if
+                          call stdlib_sscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_slacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki2-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if ! blocked back-transform
+                 is = is + 1
+                 if( ip/=0 )is = is + 1
+              end do loop_260
+           end if
+           return
+     end subroutine stdlib_strevc3
+
+     !> STRSYL: solves the real Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**T, and  A and B are both upper quasi-
+     !> triangular. A is M-by-M and B is N-by-N; the right hand side C and
+     !> the solution X are M-by-N; and scale is an output scale factor, set
+     !> <= 1 to avoid overflow in X.
+     !> A and B must be in Schur canonical form (as returned by SHSEQR), that
+     !> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks;
+     !> each 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_strsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(sp), intent(out) :: scale
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), b(ldb,*)
+           real(sp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: ierr, j, k, k1, k2, knext, l, l1, l2, lnext
+           real(sp) :: a11, bignum, da11, db, eps, scaloc, sgn, smin, smlnum, suml, sumr, &
+                     xnorm
+           ! Local Arrays 
+           real(sp) :: dum(1), vec(2,2), x(2,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'T' ) .and. .not.stdlib_lsame( trana, &
+                     'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'T' ) .and. .not.stdlib_lsame( &
+                     tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=sp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_slange( 'M', m, m, a, lda, dum ),eps*stdlib_slange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+               ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                        ! m                         l-1
+              ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)].
+                      ! i=k+1                       j=1
+              ! start column loop (index = l)
+              ! l1 (l2) : column index of the first (first) row of x(k,l).
+              lnext = 1
+              loop_70: do l = 1, n
+                 if( l<lnext )cycle loop_70
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l).
+                 knext = m
+                 loop_60: do k = m, 1, -1
+                    if( k>knext )cycle loop_60
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_slaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_sdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_slaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_slasy2( .false., .false., isgn, 2, 2,a( k1, k1 ), lda, b( l1, &
+                                 l1 ), ldb, vec,2, scaloc, x, 2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_60
+              end do loop_70
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**t *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1                          l-1
+                ! r(k,l) = sum [a(i,k)**t*x(i,l)] +isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                          j=1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = 1
+              loop_130: do l = 1, n
+                 if( l<lnext )cycle loop_130
+                 if( l==n ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l+1, l )/=zero ) then
+                       l1 = l
+                       l2 = l + 1
+                       lnext = l + 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l + 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_120: do k = 1, m
+                    if( k<knext )cycle loop_120
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_slaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_slaln2( .true., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k1, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l1 ), 1 )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( l1-1, c( k2, 1 ), ldc, b( 1, l2 ), 1 )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_slasy2( .true., .false., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_120
+              end do loop_130
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**t*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! top-right corner column by column by
+                 ! a(k,k)**t*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                           ! k-1                            n
+                  ! r(k,l) = sum [a(i,k)**t*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                           ! i=1                          j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_190: do l = n, 1, -1
+                 if( l>lnext )cycle loop_190
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = 1
+                 loop_180: do k = 1, m
+                    if( k<knext )cycle loop_180
+                    if( k==m ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k+1, k )/=zero ) then
+                          k1 = k
+                          k2 = k + 1
+                          knext = k + 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k + 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_slaln2( .true., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_slaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k1 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( k1-1, a( 1, k2 ), 1, c( 1, l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min(l2+1, n )&
+                                  ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_slasy2( .true., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, l1 &
+                                 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_180
+              end do loop_190
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**t = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-right corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l)**t = c(k,l) - r(k,l)
+              ! where
+                            ! m                          n
+                  ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b(l,j)**t].
+                          ! i=k+1                      j=l+1
+              ! start column loop (index = l)
+              ! l1 (l2): column index of the first (last) row of x(k,l)
+              lnext = n
+              loop_250: do l = n, 1, -1
+                 if( l>lnext )cycle loop_250
+                 if( l==1 ) then
+                    l1 = l
+                    l2 = l
+                 else
+                    if( b( l, l-1 )/=zero ) then
+                       l1 = l - 1
+                       l2 = l
+                       lnext = l - 2
+                    else
+                       l1 = l
+                       l2 = l
+                       lnext = l - 1
+                    end if
+                 end if
+                 ! start row loop (index = k)
+                 ! k1 (k2): row index of the first (last) row of x(k,l)
+                 knext = m
+                 loop_240: do k = m, 1, -1
+                    if( k>knext )cycle loop_240
+                    if( k==1 ) then
+                       k1 = k
+                       k2 = k
+                    else
+                       if( a( k, k-1 )/=zero ) then
+                          k1 = k - 1
+                          k2 = k
+                          knext = k - 2
+                       else
+                          k1 = k
+                          k2 = k
+                          knext = k - 1
+                       end if
+                    end if
+                    if( l1==l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( m-k1, a( k1, min(k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l1, c( k1, min( l1+1, n ) ), ldc,b( l1, min( l1+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       scaloc = one
+                       a11 = a( k1, k1 ) + sgn*b( l1, l1 )
+                       da11 = abs( a11 )
+                       if( da11<=smin ) then
+                          a11 = smin
+                          da11 = smin
+                          info = 1
+                       end if
+                       db = abs( vec( 1, 1 ) )
+                       if( da11<one .and. db>one ) then
+                          if( db>bignum*da11 )scaloc = one / db
+                       end if
+                       x( 1, 1 ) = ( vec( 1, 1 )*scaloc ) / a11
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                    else if( l1==l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       call stdlib_slaln2( .false., 2, 1, smin, one, a( k1, k1 ),lda, one, one, &
+                                 vec, 2, -sgn*b( l1, l1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k2, l1 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1==k2 ) then
+                       suml = stdlib_sdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = sgn*( c( k1, l1 )-( suml+sgn*sumr ) )
+                       suml = stdlib_sdot( m-k1, a( k1, min( k1+1, m ) ), lda,c( min( k1+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = sgn*( c( k1, l2 )-( suml+sgn*sumr ) )
+                       call stdlib_slaln2( .false., 2, 1, smin, one, b( l1, l1 ),ldb, one, one, &
+                                 vec, 2, -sgn*a( k1, k1 ),zero, x, 2, scaloc, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 2, 1 )
+                    else if( l1/=l2 .and. k1/=k2 ) then
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 1 ) = c( k1, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k1, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k1, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 1, 2 ) = c( k1, l2 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l1 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l1, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 1 ) = c( k2, l1 ) - ( suml+sgn*sumr )
+                       suml = stdlib_sdot( m-k2, a( k2, min( k2+1, m ) ), lda,c( min( k2+1, m ), &
+                                 l2 ), 1 )
+                       sumr = stdlib_sdot( n-l2, c( k2, min( l2+1, n ) ), ldc,b( l2, min( l2+1, n &
+                                 ) ), ldb )
+                       vec( 2, 2 ) = c( k2, l2 ) - ( suml+sgn*sumr )
+                       call stdlib_slasy2( .false., .true., isgn, 2, 2, a( k1, k1 ),lda, b( l1, &
+                                 l1 ), ldb, vec, 2, scaloc, x,2, xnorm, ierr )
+                       if( ierr/=0 )info = 1
+                       if( scaloc/=one ) then
+                          do j = 1, n
+                             call stdlib_sscal( m, scaloc, c( 1, j ), 1 )
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       c( k1, l1 ) = x( 1, 1 )
+                       c( k1, l2 ) = x( 1, 2 )
+                       c( k2, l1 ) = x( 2, 1 )
+                       c( k2, l2 ) = x( 2, 2 )
+                    end if
+                 end do loop_240
+              end do loop_250
+           end if
+           return
+     end subroutine stdlib_strsyl
+
+     !> SGEBRD: reduces a general real M-by-N matrix A to upper or lower
+     !> bidiagonal form B by an orthogonal transformation: Q**T * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_sgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: d(*), e(*), taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'SGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=sp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'SGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'SGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'SGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+nb-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_slabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+nb:m,i+nb:n), using an update
+              ! of the form  a := a - v*y**t - x*u**t
+              call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, a( i+&
+                        nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, one,a( i+nb, i+nb ), lda )
+              call stdlib_sgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -one, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,one, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_sgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_sgebrd
+
+     !> SGEHRD: reduces a real general matrix A to upper Hessenberg form H by
+     !> an orthogonal similarity transformation:  Q**T * A * Q = H .
+
+     pure subroutine stdlib_sgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           real(sp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+             ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'SGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to zero
+           do i = 1, ilo - 1
+              tau( i ) = zero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = zero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'SGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'SGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'SGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin + tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**t
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_slahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**t. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = one
+                 call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE',ihi, ihi-i-ib+1,ib, -one, work, &
+                           ldwork, a( i+ib, i ), lda, one,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_strmm( 'RIGHT', 'LOWER', 'TRANSPOSE','UNIT', i, ib-1,one, a( i+1, i )&
+                           , lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_saxpy( i, -one, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_slarfb( 'LEFT', 'TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, n-i-ib+1, &
+                           ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_sgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sgehrd
+
+     !> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_sgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. ( mb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_sgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_slarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_sgelqt
+
+     !> SGELS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, or its transpose, using a QR or LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_sgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(sp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(sp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn + max( mn, nrhs ) ) .and..not.lquery ) then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'SGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'SORMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'SORMQR', 'LT', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'SGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'SORMLQ', 'LT', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'SORMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn + max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_slaset( 'FULL', max( m, n ), nrhs, zero, zero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_slaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_slange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_sgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+                 call stdlib_sormqr( 'LEFT', 'TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_strtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_strtrs( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_sormqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_sgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_strtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_sormlq( 'LEFT', 'TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb, &
+                           work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_sormlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_strtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=sp)
+           return
+     end subroutine stdlib_sgels
+
+     !> SGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide LQ
+     !> factorization (SGELQ)
+
+     pure subroutine stdlib_sgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), t(*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = real( lw,KIND=sp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_sgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_slamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = real( lw,KIND=sp)
+           return
+     end subroutine stdlib_sgemlq
+
+     !> SGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> where Q is a real orthogonal matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (SGEQR)
+
+     pure subroutine stdlib_sgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), t(*)
+           real(sp), intent(inout) :: c(ldc,*)
+           real(sp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'T' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_sgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_slamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_sgemqr
+
+     !> SGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_sgeqp3( m, n, a, lda, jpvt, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = 3*n + 1
+                 nb = stdlib_ilaenv( inb, 'SGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = 2*n + ( n + 1 )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_sswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_sgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_sgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_sorm2r( 'left', 'transpose', m, n-na, na, a, lda,
+      ! cc  $                   tau, a( 1, na+1 ), lda, work, info )
+                 call stdlib_sormqr( 'LEFT', 'TRANSPOSE', m, n-na, na, a, lda, tau,a( 1, na+1 ), &
+                           lda, work, lwork, info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'SGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'SGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = 2*sn + ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = ( lwork-2*sn ) / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'SGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 work( j ) = stdlib_snrm2( sm, a( nfxd+1, j ), 1 )
+                 work( n+j ) = work( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_slaqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , work( j ), work( n+j ),work( 2*n+1 ), work( 2*n+jb+1 ), n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_slaqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         work( j ), work( n+j ),work( 2*n+1 ) )
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_sgeqp3
+
+     !> SGEQRT: computes a blocked QR factorization of a real M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_sgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_sgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_sgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the left
+                 call stdlib_slarfb( 'L', 'T', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_sgeqrt
+
+     !> SGESV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_sgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_sgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_sgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_sgesv
+
+     !> SGESVX: uses the LU factorization to compute the solution to a real
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_sgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), c(*), r(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_sgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_slaqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_slacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_sgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_slantr( 'M', 'U', 'N', info, info, af, ldaf,work )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_slange( 'M', n, info, a, lda, work ) / rpvgrw
+                 end if
+                 work( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_slange( norm, n, n, a, lda, work )
+           rpvgrw = stdlib_slantr( 'M', 'U', 'N', n, n, af, ldaf, work )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_slange( 'M', n, n, a, lda, work ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_sgecon( norm, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_sgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_sgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_sgesvx
+
+     !> SGGES: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> SGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_sgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alphar, &
+               alphai, beta, vsl, ldvsl, vsr,ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_s) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, maxwrk, minwrk
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0 )then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'SGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'SORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'SORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+              else
+                 minwrk = 1
+                 maxwrk = 1
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n space for storing balancing factors)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_sggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_slaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_sgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_shgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 40
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: need 4*n+16 )
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_stgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_sggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_sggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl )then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i )/safmax )>( anrmto/anrm ) .or.( safmin/alphar( i ) )>( &
+                              anrm/anrmto ) ) then
+                       work( 1 ) = abs( a( i, i )/alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i )/safmax )>( anrmto/anrm ) .or.( safmin/alphai( i ) )>( &
+                              anrm/anrmto ) ) then
+                       work( 1 ) = abs( a( i, i+1 )/alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl )then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                     if( ( beta( i )/safmax )>( bnrmto/bnrm ) .or.( safmin/beta( i ) )>( &
+                               bnrm/bnrmto ) ) then
+                        work( 1 ) = abs(b( i, i )/beta( i ))
+                        beta( i ) = beta( i )*work( 1 )
+                        alphar( i ) = alphar( i )*work( 1 )
+                        alphai( i ) = alphai( i )*work( 1 )
+                     end if
+                  end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_slascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           40 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgges
+
+     !> SGGESX: computes for a pair of N-by-N real nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the real Schur form (S,T), and,
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**T, (VSL) T (VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_sggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, &
+     alphar, alphai, beta, vsl, ldvsl,vsr, ldvsr, rconde, rcondv, work, lwork, iwork,liwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), rconde(2), rcondv(2), vsl(&
+                     ldvsl,*), vsr(ldvsr,*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_s) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, wantsb, &
+                     wantse, wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, ip, iright, &
+                     irows, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -16
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -18
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = max( 8*n, 6*n + 16 )
+                 maxwrk = minwrk - n +n*stdlib_ilaenv( 1, 'SGEQRF', ' ', n, 1, n, 0 )
+                 maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'SORMQR', ' ', n, 1, n, -1 &
+                           ) )
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, minwrk - n +n*stdlib_ilaenv( 1, 'SORGQR', ' ', n, 1, n, &
+                              -1 ) )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 6
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -22
+              else if( liwork<liwmin  .and. .not.lquery ) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need 6*n + 2*n for permutation parameters)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_sggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_slaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_sgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (workspace: need n)
+           iwrk = itau
+           call stdlib_shgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 50
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           ! (workspace: if ijob >= 1, need max( 8*(n+1), 2*sdim*(n-sdim) )
+                       ! otherwise, need 8*(n+1) )
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              call stdlib_stgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+              beta, vsl, ldvsl, vsr, ldvsr,sdim, pl, pr, dif, work( iwrk ), lwork-iwrk+1,iwork, &
+                        liwork, ierr )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-22 ) then
+                  ! not enough real workspace
+                 info = -22
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_sggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_sggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i ) / safmax )>( anrmto / anrm ) .or.( safmin / alphar( i ) )>( &
+                              anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i ) / alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i ) / safmax )>( anrmto / anrm ).or. ( safmin / alphai( i )&
+                               )>( anrm / anrmto ) )then
+                       work( 1 ) = abs( a( i, i+1 ) / alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl ) then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( beta( i ) / safmax )>( bnrmto / bnrm ) .or.( safmin / beta( i ) )>( &
+                              bnrm / bnrmto ) ) then
+                       work( 1 ) = abs( b( i, i ) / beta( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_slascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           50 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sggesx
+
+     !> SGGEV: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_sggev( jobvl, jobvr, n, a, lda, b, ldb, alphar, alphai,beta, vl, ldvl, vr, &
+               ldvr, work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, maxwrk, minwrk
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              minwrk = max( 1, 8*n )
+              maxwrk = max( 1, n*( 7 +stdlib_ilaenv( 1, 'SGEQRF', ' ', n, 1, n, 0 ) ) )
+              maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'SORMQR', ' ', n, 1, n, 0 ) ) )
+                        
+              if( ilvl ) then
+                 maxwrk = max( maxwrk, n*( 7 +stdlib_ilaenv( 1, 'SORGQR', ' ', n, 1, n, -1 ) ) )
+                           
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_sggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_slaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_sgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_sgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_shgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           ! (workspace: need 6*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_stgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_sggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_sggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sggev
+
+     !> SGGEVX: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_sggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alphar, alphai, &
+     beta, vl, ldvl, vr, ldvr, ilo,ihi, lscale, rscale, abnrm, bbnrm, rconde,rcondv, work, lwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(sp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), lscale(*), rconde(*), rcondv(*)&
+                     , rscale(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, pair, wantsb, wantse, &
+                     wantsn, wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk, mm
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( noscl .or. stdlib_lsame( balanc, 'S' ) .or.stdlib_lsame( balanc, 'B' ) ) ) &
+                     then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -14
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 if( noscl .and. .not.ilv ) then
+                    minwrk = 2*n
+                 else
+                    minwrk = 6*n
+                 end if
+                 if( wantse ) then
+                    minwrk = 10*n
+                 else if( wantsv .or. wantsb ) then
+                    minwrk = 2*n*( n + 4 ) + 16
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'SGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'SORMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'SORGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -26
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_sggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,work, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_slange( '1', n, n, a, lda, work( 1 ) )
+           if( ilascl ) then
+              work( 1 ) = abnrm
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, 1, 1, work( 1 ), 1,ierr )
+              abnrm = work( 1 )
+           end if
+           bbnrm = stdlib_slange( '1', n, n, b, ldb, work( 1 ) )
+           if( ilbscl ) then
+              work( 1 ) = bbnrm
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, work( 1 ), 1,ierr )
+              bbnrm = work( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to a
+           ! (workspace: need n, prefer n*nb)
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_slaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_sgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_sgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (workspace: need n)
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_shgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr, work,lwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 130
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! (workspace: stdlib_stgevc: need 6*n
+                       ! stdlib_stgsna: need 2*n*(n+2)+16 if sense = 'v' or 'b',
+                               ! need n otherwise )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_stgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work, ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 130
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_stgevc) and estimate condition
+                 ! numbers (stdlib_stgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to recalculate
+                 ! eigenvectors and estimate one condition numbers at a time.
+                 pair = .false.
+                 loop_20: do i = 1, n
+                    if( pair ) then
+                       pair = .false.
+                       cycle loop_20
+                    end if
+                    mm = 1
+                    if( i<n ) then
+                       if( a( i+1, i )/=zero ) then
+                          pair = .true.
+                          mm = 2
+                       end if
+                    end if
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    if( mm==1 ) then
+                       bwork( i ) = .true.
+                    else if( mm==2 ) then
+                       bwork( i ) = .true.
+                       bwork( i+1 ) = .true.
+                    end if
+                    iwrk = mm*n + 1
+                    iwrk1 = iwrk + mm*n
+                    ! compute a pair of left and right eigenvectors.
+                    ! (compute workspace: need up to 4*n + 6*n)
+                    if( wantse .or. wantsb ) then
+                       call stdlib_stgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, mm, m,work( iwrk1 ), ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 130
+                       end if
+                    end if
+                    call stdlib_stgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), mm, m, work( iwrk1 ),lwork-iwrk1+1, iwork,&
+                               ierr )
+                 end do loop_20
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_sggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_70: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_70
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_70
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                       vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_70
+           end if
+           if( ilvr ) then
+              call stdlib_sggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_120: do jc = 1, n
+                 if( alphai( jc )<zero )cycle loop_120
+                 temp = zero
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) ) )
+                    end do
+                 else
+                    do jr = 1, n
+                       temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                    end do
+                 end if
+                 if( temp<smlnum )cycle loop_120
+                 temp = one / temp
+                 if( alphai( jc )==zero ) then
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 else
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                       vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                    end do
+                 end if
+              end do loop_120
+           end if
+           ! undo scaling if necessary
+           130 continue
+           if( ilascl ) then
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sggevx
+
+     !> SGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_sggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           real(sp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'SGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'SGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'SORMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'SORMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = zero
+              end do
+              do i = 1, p
+                 y(i) = zero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**t*a = ( r11 ) m,    q**t*b*z**t = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_sggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = work( m+np+1 )
+           ! update left-hand-side vector d = q**t*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_sormqr( 'LEFT', 'TRANSPOSE', n, 1, m, a, lda, work, d,max( 1, n ), work( m+&
+                     np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_strtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_scopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = zero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_sgemv( 'NO TRANSPOSE', m, n-m, -one, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1, &
+                     one, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_strtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_scopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**t *y
+           call stdlib_sormrq( 'LEFT', 'TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), ldb, work( &
+                     m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_sggglm
+
+     !> SGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_sgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           real(sp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'SGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'SGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'SORMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'SORMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**t = (  0  t12 ) p   z**t*a*q**t = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! orthogonal.
+           call stdlib_sggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = work( p+mn+1 )
+           ! update c = z**t *c = ( c1 ) n-p
+                                ! ( c2 ) m+p-n
+           call stdlib_sormqr( 'LEFT', 'TRANSPOSE', m, 1, mn, a, lda, work( p+1 ),c, max( 1, m ), &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_strtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+              call stdlib_scopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_sgemv( 'NO TRANSPOSE', n-p, p, -one, a( 1, n-p+1 ), lda,d, 1, one, c, 1 &
+                        )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_strtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_scopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_sgemv( 'NO TRANSPOSE', nr, n-m, -one, a( n-p+1, m+1 ),lda, d( &
+                        nr+1 ), 1, one, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_strmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_saxpy( nr, -one, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**t*x
+           call stdlib_sormrq( 'LEFT', 'TRANSPOSE', n, 1, p, b, ldb, work( 1 ), x,n, work( p+mn+1 &
+                     ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_sgglse
+
+     !> SHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a real upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_shsein( side, eigsrc, initv, select, n, h, ldh, wr, wi,vl, ldvl, vr, ldvr, &
+               mm, m, work, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(inout) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(sp), intent(in) :: h(ldh,*), wi(*)
+           real(sp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), wr(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, pair, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ksi, ksr, ldwork
+           real(sp) :: bignum, eps3, hnorm, smlnum, ulp, unfl, wki, wkr
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors, and standardize the array select.
+           m = 0
+           pair = .false.
+           do k = 1, n
+              if( pair ) then
+                 pair = .false.
+                 select( k ) = .false.
+              else
+                 if( wi( k )==zero ) then
+                    if( select( k ) )m = m + 1
+                 else
+                    pair = .true.
+                    if( select( k ) .or. select( k+1 ) ) then
+                       select( k ) = .true.
+                       m = m + 2
+                    end if
+                 end if
+              end if
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -13
+           else if( mm<m ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           bignum = ( one-ulp ) / smlnum
+           ldwork = n + 1
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ksr = 1
+           loop_120: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are zero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==zero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==zero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_slanhs( 'I', kr-kl+1, h( kl, kl ), ldh, work )
+                    if( stdlib_sisnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( hnorm>zero ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wkr = wr( k )
+                 wki = wi( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. abs( wr( i )-wkr )+abs( wi( i )-wki )<eps3 ) &
+                              then
+                       wkr = wkr + eps3
+                       go to 60
+                    end if
+                 end do
+                 wr( k ) = wkr
+                 pair = wki/=zero
+                 if( pair ) then
+                    ksi = ksr + 1
+                 else
+                    ksi = ksr
+                 end if
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_slaein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wkr, wki, vl( &
+                    kl, ksr ), vl( kl, ksi ),work, ldwork, work( n*n+n+1 ), eps3, smlnum,bignum, &
+                              iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifaill( ksr ) = k
+                       ifaill( ksi ) = k
+                    else
+                       ifaill( ksr ) = 0
+                       ifaill( ksi ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = 1, kl - 1
+                          vl( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_slaein( .true., noinit, kr, h, ldh, wkr, wki,vr( 1, ksr ), vr( 1, &
+                              ksi ), work, ldwork,work( n*n+n+1 ), eps3, smlnum, bignum,iinfo )
+                    if( iinfo>0 ) then
+                       if( pair ) then
+                          info = info + 2
+                       else
+                          info = info + 1
+                       end if
+                       ifailr( ksr ) = k
+                       ifailr( ksi ) = k
+                    else
+                       ifailr( ksr ) = 0
+                       ifailr( ksi ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ksr ) = zero
+                    end do
+                    if( pair ) then
+                       do i = kr + 1, n
+                          vr( i, ksi ) = zero
+                       end do
+                    end if
+                 end if
+                 if( pair ) then
+                    ksr = ksr + 2
+                 else
+                    ksr = ksr + 1
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_shsein
+
+     !> SLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(sp) function stdlib_sla_porpvgrw( uplo, ncols, a, lda, af, ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           real(sp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(sp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_spotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( abs( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( abs( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= 0.0_sp ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_sla_porpvgrw = rpvgrw
+     end function stdlib_sla_porpvgrw
+
+     !> SLAED3: finds the roots of the secular equation, as defined by the
+     !> values in D, W, and RHO, between 1 and K.  It makes the
+     !> appropriate calls to SLAED4 and then updates the eigenvectors by
+     !> multiplying the matrix of eigenvectors of the pair of eigensystems
+     !> being combined by the matrix of eigenvectors of the K-by-K system
+     !> which is solved here.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_slaed3( k, n, n1, d, q, ldq, rho, dlamda, q2, indx,ctot, w, s, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldq, n, n1
+           real(sp), intent(in) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ctot(*), indx(*)
+           real(sp), intent(out) :: d(*), q(ldq,*), s(*)
+           real(sp), intent(inout) :: dlamda(*), w(*)
+           real(sp), intent(in) :: q2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, iq2, j, n12, n2, n23
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( k<0 ) then
+              info = -1
+           else if( n<k ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( k==0 )return
+           ! modify values dlamda(i) to make sure all dlamda(i)-dlamda(j) can
+           ! be computed with high relative accuracy (barring over/underflow).
+           ! this is a problem on machines without a guard digit in
+           ! add/subtract (cray xmp, cray ymp, cray c 90 and cray 2).
+           ! the following code replaces dlamda(i) by 2*dlamda(i)-dlamda(i),
+           ! which on any of these machines zeros out the bottommost
+           ! bit of dlamda(i) if it is 1; this makes the subsequent
+           ! subtractions dlamda(i)-dlamda(j) unproblematic when cancellation
+           ! occurs. on binary machines with a guard digit (almost all
+           ! machines) it does not change dlamda(i) at all. on hexadecimal
+           ! and decimal machines with a guard digit, it slightly
+           ! changes the bottommost bits of dlamda(i). it does not account
+           ! for hexadecimal or decimal machines without guard digits
+           ! (we know of none). we use a subroutine call to compute
+           ! 2*dlambda(i) to prevent optimizing compilers from eliminating
+           ! this code.
+           do i = 1, k
+              dlamda( i ) = stdlib_slamc3( dlamda( i ), dlamda( i ) ) - dlamda( i )
+           end do
+           do j = 1, k
+              call stdlib_slaed4( k, j, dlamda, w, q( 1, j ), rho, d( j ), info )
+              ! if the zero finder fails, the computation is terminated.
+              if( info/=0 )go to 120
+           end do
+           if( k==1 )go to 110
+           if( k==2 ) then
+              do j = 1, k
+                 w( 1 ) = q( 1, j )
+                 w( 2 ) = q( 2, j )
+                 ii = indx( 1 )
+                 q( 1, j ) = w( ii )
+                 ii = indx( 2 )
+                 q( 2, j ) = w( ii )
+              end do
+              go to 110
+           end if
+           ! compute updated w.
+           call stdlib_scopy( k, w, 1, s, 1 )
+           ! initialize w(i) = q(i,i)
+           call stdlib_scopy( k, q, ldq+1, w, 1 )
+           do j = 1, k
+              do i = 1, j - 1
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+              do i = j + 1, k
+                 w( i ) = w( i )*( q( i, j ) / ( dlamda( i )-dlamda( j ) ) )
+              end do
+           end do
+           do i = 1, k
+              w( i ) = sign( sqrt( -w( i ) ), s( i ) )
+           end do
+           ! compute eigenvectors of the modified rank-1 modification.
+           do j = 1, k
+              do i = 1, k
+                 s( i ) = w( i ) / q( i, j )
+              end do
+              temp = stdlib_snrm2( k, s, 1 )
+              do i = 1, k
+                 ii = indx( i )
+                 q( i, j ) = s( ii ) / temp
+              end do
+           end do
+           ! compute the updated eigenvectors.
+           110 continue
+           n2 = n - n1
+           n12 = ctot( 1 ) + ctot( 2 )
+           n23 = ctot( 2 ) + ctot( 3 )
+           call stdlib_slacpy( 'A', n23, k, q( ctot( 1 )+1, 1 ), ldq, s, n23 )
+           iq2 = n1*n12 + 1
+           if( n23/=0 ) then
+              call stdlib_sgemm( 'N', 'N', n2, k, n23, one, q2( iq2 ), n2, s, n23,zero, q( n1+1, &
+                        1 ), ldq )
+           else
+              call stdlib_slaset( 'A', n2, k, zero, zero, q( n1+1, 1 ), ldq )
+           end if
+           call stdlib_slacpy( 'A', n12, k, q, ldq, s, n12 )
+           if( n12/=0 ) then
+              call stdlib_sgemm( 'N', 'N', n1, k, n12, one, q2, n1, s, n12, zero, q,ldq )
+           else
+              call stdlib_slaset( 'A', n1, k, zero, zero, q( 1, 1 ), ldq )
+           end if
+           120 continue
+           return
+     end subroutine stdlib_slaed3
+
+     !> SLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense symmetric matrix
+     !> that has been reduced to tridiagonal form.  SLAED1 handles
+     !> the case in which all eigenvalues and eigenvectors of a symmetric
+     !> tridiagonal matrix are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**Tu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine SLAED8.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine SLAED4 (as called by SLAED9).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_slaed7( icompq, n, qsiz, tlvls, curlvl, curpbm, d, q,ldq, indxq, rho, &
+               cutpnt, qstore, qptr, prmptr,perm, givptr, givcol, givnum, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, icompq, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(sp), intent(inout) :: d(*), givnum(2,*), q(ldq,*), qstore(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, ldq2, &
+                     n1, n2, ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>1 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( icompq==1 .and. qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_slaed8 and stdlib_slaed9.
+           if( icompq==1 ) then
+              ldq2 = qsiz
+           else
+              ldq2 = n
+           end if
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           is = iq2 + n*ldq2
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_slaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, work( iz ),work( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_slaed8( icompq, k, n, qsiz, d, q, ldq, indxq, rho, cutpnt,work( iz ), work(&
+            idlmda ), work( iq2 ), ldq2,work( iw ), perm( prmptr( curr ) ), givptr( curr+1 ),&
+            givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), iwork( indxp ),iwork( indx ),&
+                       info )
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_slaed9( k, 1, k, n, d, work( is ), k, rho, work( idlmda ),work( iw ), &
+                        qstore( qptr( curr ) ), k, info )
+              if( info/=0 )go to 30
+              if( icompq==1 ) then
+                 call stdlib_sgemm( 'N', 'N', qsiz, k, k, one, work( iq2 ), ldq2,qstore( qptr( &
+                           curr ) ), k, zero, q, ldq )
+              end if
+              qptr( curr+1 ) = qptr( curr ) + k**2
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_slamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           30 continue
+           return
+     end subroutine stdlib_slaed7
+
+     !> SLAEXC: swaps adjacent diagonal blocks T11 and T22 of order 1 or 2 in
+     !> an upper quasi-triangular matrix T by an orthogonal similarity
+     !> transformation.
+     !> T must be in Schur canonical form, that is, block upper triangular
+     !> with 1-by-1 and 2-by-2 diagonal blocks; each 2-by-2 diagonal block
+     !> has its diagonal elements equal and its off-diagonal elements of
+     !> opposite sign.
+
+     subroutine stdlib_slaexc( wantq, n, t, ldt, q, ldq, j1, n1, n2, work,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, ldq, ldt, n, n1, n2
+           ! Array Arguments 
+           real(sp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldd = 4
+           integer(ilp), parameter :: ldx = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, j2, j3, j4, k, nd
+           real(sp) :: cs, dnorm, eps, scale, smlnum, sn, t11, t22, t33, tau, tau1, tau2, temp, &
+                     thresh, wi1, wi2, wr1, wr2, xnorm
+           ! Local Arrays 
+           real(sp) :: d(ldd,4), u(3), u1(3), u2(3), x(ldx,2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 .or. n1==0 .or. n2==0 )return
+           if( j1+n1>n )return
+           j2 = j1 + 1
+           j3 = j1 + 2
+           j4 = j1 + 3
+           if( n1==1 .and. n2==1 ) then
+              ! swap two 1-by-1 blocks.
+              t11 = t( j1, j1 )
+              t22 = t( j2, j2 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_slartg( t( j1, j2 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( j3<=n )call stdlib_srot( n-j1-1, t( j1, j3 ), ldt, t( j2, j3 ), ldt, cs,sn )
+                        
+              call stdlib_srot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+              t( j1, j1 ) = t22
+              t( j2, j2 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_srot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+           else
+              ! swapping involves at least one 2-by-2 block.
+              ! copy the diagonal block of order n1+n2 to the local array d
+              ! and compute its norm.
+              nd = n1 + n2
+              call stdlib_slacpy( 'FULL', nd, nd, t( j1, j1 ), ldt, d, ldd )
+              dnorm = stdlib_slange( 'MAX', nd, nd, d, ldd, work )
+              ! compute machine-dependent threshold for test for accepting
+              ! swap.
+              eps = stdlib_slamch( 'P' )
+              smlnum = stdlib_slamch( 'S' ) / eps
+              thresh = max( ten*eps*dnorm, smlnum )
+              ! solve t11*x - x*t22 = scale*t12 for x.
+              call stdlib_slasy2( .false., .false., -1, n1, n2, d, ldd,d( n1+1, n1+1 ), ldd, d( 1,&
+                         n1+1 ), ldd, scale, x,ldx, xnorm, ierr )
+              ! swap the adjacent diagonal blocks.
+              k = n1 + n1 + n2 - 3
+              go to ( 10, 20, 30 )k
+              10 continue
+              ! n1 = 1, n2 = 2: generate elementary reflector h so that:
+              ! ( scale, x11, x12 ) h = ( 0, 0, * )
+              u( 1 ) = scale
+              u( 2 ) = x( 1, 1 )
+              u( 3 ) = x( 1, 2 )
+              call stdlib_slarfg( 3, u( 3 ), u, 1, tau )
+              u( 3 ) = one
+              t11 = t( j1, j1 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_slarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_slarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 3,3 )-t11 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_slarfx( 'L', 3, n-j1+1, u, tau, t( j1, j1 ), ldt, work )
+              call stdlib_slarfx( 'R', j2, 3, u, tau, t( 1, j1 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j3, j3 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_slarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              20 continue
+              ! n1 = 2, n2 = 1: generate elementary reflector h so that:
+              ! h (  -x11 ) = ( * )
+                ! (  -x21 ) = ( 0 )
+                ! ( scale ) = ( 0 )
+              u( 1 ) = -x( 1, 1 )
+              u( 2 ) = -x( 2, 1 )
+              u( 3 ) = scale
+              call stdlib_slarfg( 3, u( 1 ), u( 2 ), 1, tau )
+              u( 1 ) = one
+              t33 = t( j3, j3 )
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_slarfx( 'L', 3, 3, u, tau, d, ldd, work )
+              call stdlib_slarfx( 'R', 3, 3, u, tau, d, ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 2, 1 ) ), abs( d( 3, 1 ) ), abs( d( 1,1 )-t33 ) )>thresh )go to &
+                        50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_slarfx( 'R', j3, 3, u, tau, t( 1, j1 ), ldt, work )
+              call stdlib_slarfx( 'L', 3, n-j1, u, tau, t( j1, j2 ), ldt, work )
+              t( j1, j1 ) = t33
+              t( j2, j1 ) = zero
+              t( j3, j1 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_slarfx( 'R', n, 3, u, tau, q( 1, j1 ), ldq, work )
+              end if
+              go to 40
+              30 continue
+              ! n1 = 2, n2 = 2: generate elementary reflectors h(1) and h(2) so
+              ! that:
+              ! h(2) h(1) (  -x11  -x12 ) = (  *  * )
+                        ! (  -x21  -x22 )   (  0  * )
+                        ! ( scale    0  )   (  0  0 )
+                        ! (    0  scale )   (  0  0 )
+              u1( 1 ) = -x( 1, 1 )
+              u1( 2 ) = -x( 2, 1 )
+              u1( 3 ) = scale
+              call stdlib_slarfg( 3, u1( 1 ), u1( 2 ), 1, tau1 )
+              u1( 1 ) = one
+              temp = -tau1*( x( 1, 2 )+u1( 2 )*x( 2, 2 ) )
+              u2( 1 ) = -temp*u1( 2 ) - x( 2, 2 )
+              u2( 2 ) = -temp*u1( 3 )
+              u2( 3 ) = scale
+              call stdlib_slarfg( 3, u2( 1 ), u2( 2 ), 1, tau2 )
+              u2( 1 ) = one
+              ! perform swap provisionally on diagonal block in d.
+              call stdlib_slarfx( 'L', 3, 4, u1, tau1, d, ldd, work )
+              call stdlib_slarfx( 'R', 4, 3, u1, tau1, d, ldd, work )
+              call stdlib_slarfx( 'L', 3, 4, u2, tau2, d( 2, 1 ), ldd, work )
+              call stdlib_slarfx( 'R', 4, 3, u2, tau2, d( 1, 2 ), ldd, work )
+              ! test whether to reject swap.
+              if( max( abs( d( 3, 1 ) ), abs( d( 3, 2 ) ), abs( d( 4, 1 ) ),abs( d( 4, 2 ) ) )&
+                        >thresh )go to 50
+              ! accept swap: apply transformation to the entire matrix t.
+              call stdlib_slarfx( 'L', 3, n-j1+1, u1, tau1, t( j1, j1 ), ldt, work )
+              call stdlib_slarfx( 'R', j4, 3, u1, tau1, t( 1, j1 ), ldt, work )
+              call stdlib_slarfx( 'L', 3, n-j1+1, u2, tau2, t( j2, j1 ), ldt, work )
+              call stdlib_slarfx( 'R', j4, 3, u2, tau2, t( 1, j2 ), ldt, work )
+              t( j3, j1 ) = zero
+              t( j3, j2 ) = zero
+              t( j4, j1 ) = zero
+              t( j4, j2 ) = zero
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_slarfx( 'R', n, 3, u1, tau1, q( 1, j1 ), ldq, work )
+                 call stdlib_slarfx( 'R', n, 3, u2, tau2, q( 1, j2 ), ldq, work )
+              end if
+              40 continue
+              if( n2==2 ) then
+                 ! standardize new 2-by-2 block t11
+                 call stdlib_slanv2( t( j1, j1 ), t( j1, j2 ), t( j2, j1 ),t( j2, j2 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 call stdlib_srot( n-j1-1, t( j1, j1+2 ), ldt, t( j2, j1+2 ), ldt,cs, sn )
+                 call stdlib_srot( j1-1, t( 1, j1 ), 1, t( 1, j2 ), 1, cs, sn )
+                 if( wantq )call stdlib_srot( n, q( 1, j1 ), 1, q( 1, j2 ), 1, cs, sn )
+              end if
+              if( n1==2 ) then
+                 ! standardize new 2-by-2 block t22
+                 j3 = j1 + n2
+                 j4 = j3 + 1
+                 call stdlib_slanv2( t( j3, j3 ), t( j3, j4 ), t( j4, j3 ),t( j4, j4 ), wr1, wi1, &
+                           wr2, wi2, cs, sn )
+                 if( j3+2<=n )call stdlib_srot( n-j3-1, t( j3, j3+2 ), ldt, t( j4, j3+2 ),ldt, cs,&
+                            sn )
+                 call stdlib_srot( j3-1, t( 1, j3 ), 1, t( 1, j4 ), 1, cs, sn )
+                 if( wantq )call stdlib_srot( n, q( 1, j3 ), 1, q( 1, j4 ), 1, cs, sn )
+              end if
+           end if
+           return
+           ! exit with info = 1 if swap was rejected.
+        50 info = 1
+           return
+     end subroutine stdlib_slaexc
+
+     !> SLAHQR: is an auxiliary routine called by SHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by SHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_slahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: wi(*), wr(*)
+        ! =========================================================
+           ! Parameters 
+           real(sp), parameter :: dat1 = 3.0_sp/4.0_sp
+           real(sp), parameter :: dat2 = -0.4375_sp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           ! Local Scalars 
+           real(sp) :: aa, ab, ba, bb, cs, det, h11, h12, h21, h21s, h22, rt1i, rt1r, rt2i, rt2r, &
+                     rtdisc, s, safmax, safmin, smlnum, sn, sum, t1, t2, t3, tr, tst, ulp, v2, v3
+           integer(ilp) :: i, i1, i2, its, itmax, j, k, l, m, nh, nr, nz, kdefl
+           ! Local Arrays 
+           real(sp) :: v(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              wr( ilo ) = h( ilo, ilo )
+              wi( ilo ) = zero
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = zero
+              h( j+3, j ) = zero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = zero
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=sp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1 or 2. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           20 continue
+           l = ilo
+           if( i<ilo )go to 160
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 or 2 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           loop_140: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( abs( h( k, k-1 ) )<=smlnum )go to 40
+                 tst = abs( h( k-1, k-1 ) ) + abs( h( k, k ) )
+                 if( tst==zero ) then
+                    if( k-2>=ilo )tst = tst + abs( h( k-1, k-2 ) )
+                    if( k+1<=ihi )tst = tst + abs( h( k+1, k ) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation  criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some cases.  ====
+                 if( abs( h( k, k-1 ) )<=ulp*tst ) then
+                    ab = max( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    ba = min( abs( h( k, k-1 ) ), abs( h( k-1, k ) ) )
+                    aa = max( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( abs( h( k, k ) ),abs( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 40
+                 end if
+              end do
+              40 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = zero
+              end if
+              ! exit from loop if a submatrix of order 1 or 2 has split off.
+              if( l>=i-1 )go to 150
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                 h11 = dat1*s + h( i, i )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = abs( h( l+1, l ) ) + abs( h( l+2, l+1 ) )
+                 h11 = dat1*s + h( l, l )
+                 h12 = dat2*s
+                 h21 = s
+                 h22 = h11
+              else
+                 ! prepare to use francis' double shift
+                 ! (i.e. 2nd degree generalized rayleigh quotient)
+                 h11 = h( i-1, i-1 )
+                 h21 = h( i, i-1 )
+                 h12 = h( i-1, i )
+                 h22 = h( i, i )
+              end if
+              s = abs( h11 ) + abs( h12 ) + abs( h21 ) + abs( h22 )
+              if( s==zero ) then
+                 rt1r = zero
+                 rt1i = zero
+                 rt2r = zero
+                 rt2i = zero
+              else
+                 h11 = h11 / s
+                 h21 = h21 / s
+                 h12 = h12 / s
+                 h22 = h22 / s
+                 tr = ( h11+h22 ) / two
+                 det = ( h11-tr )*( h22-tr ) - h12*h21
+                 rtdisc = sqrt( abs( det ) )
+                 if( det>=zero ) then
+                    ! ==== complex conjugate shifts ====
+                    rt1r = tr*s
+                    rt2r = rt1r
+                    rt1i = rtdisc*s
+                    rt2i = -rt1i
+                 else
+                    ! ==== realshifts (use only one of them,KIND=sp)  ====
+                    rt1r = tr + rtdisc
+                    rt2r = tr - rtdisc
+                    if( abs( rt1r-h22 )<=abs( rt2r-h22 ) ) then
+                       rt1r = rt1r*s
+                       rt2r = rt1r
+                    else
+                       rt2r = rt2r*s
+                       rt1r = rt2r
+                    end if
+                    rt1i = zero
+                    rt2i = zero
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 2, l, -1
+                 ! determine the effect of starting the double-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.  (the following uses scaling to avoid
+                 ! overflows and most underflows.)
+                 h21s = h( m+1, m )
+                 s = abs( h( m, m )-rt2r ) + abs( rt2i ) + abs( h21s )
+                 h21s = h( m+1, m ) / s
+                 v( 1 ) = h21s*h( m, m+1 ) + ( h( m, m )-rt1r )*( ( h( m, m )-rt2r ) / s ) - &
+                           rt1i*( rt2i / s )
+                 v( 2 ) = h21s*( h( m, m )+h( m+1, m+1 )-rt1r-rt2r )
+                 v( 3 ) = h21s*h( m+2, m+1 )
+                 s = abs( v( 1 ) ) + abs( v( 2 ) ) + abs( v( 3 ) )
+                 v( 1 ) = v( 1 ) / s
+                 v( 2 ) = v( 2 ) / s
+                 v( 3 ) = v( 3 ) / s
+                 if( m==l )go to 60
+                 if( abs( h( m, m-1 ) )*( abs( v( 2 ) )+abs( v( 3 ) ) )<=ulp*abs( v( 1 ) )*( abs( &
+                           h( m-1, m-1 ) )+abs( h( m,m ) )+abs( h( m+1, m+1 ) ) ) )go to 60
+              end do
+              60 continue
+              ! double-shift qr step
+              loop_130: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge one step toward the bottom of the active
+                 ! submatrix. nr is the order of g.
+                 nr = min( 3, i-k+1 )
+                 if( k>m )call stdlib_scopy( nr, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_slarfg( nr, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = zero
+                    if( k<i-1 )h( k+2, k-1 ) = zero
+                 else if( m>l ) then
+                     ! ==== use the following instead of
+                     ! .    h( k, k-1 ) = -h( k, k-1 ) to
+                     ! .    avoid a bug when v(2) and v(3)
+                     ! .    underflow. ====
+                    h( k, k-1 ) = h( k, k-1 )*( one-t1 )
+                 end if
+                 v2 = v( 2 )
+                 t2 = t1*v2
+                 if( nr==3 ) then
+                    v3 = v( 3 )
+                    t3 = t1*v3
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j ) + v3*h( k+2, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                       h( k+2, j ) = h( k+2, j ) - sum*t3
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, min( k+3, i )
+                       sum = h( j, k ) + v2*h( j, k+1 ) + v3*h( j, k+2 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                       h( j, k+2 ) = h( j, k+2 ) - sum*t3
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 ) + v3*z( j, k+2 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                          z( j, k+2 ) = z( j, k+2 ) - sum*t3
+                       end do
+                    end if
+                 else if( nr==2 ) then
+                    ! apply g from the left to transform the rows of the matrix
+                    ! in columns k to i2.
+                    do j = k, i2
+                       sum = h( k, j ) + v2*h( k+1, j )
+                       h( k, j ) = h( k, j ) - sum*t1
+                       h( k+1, j ) = h( k+1, j ) - sum*t2
+                    end do
+                    ! apply g from the right to transform the columns of the
+                    ! matrix in rows i1 to min(k+3,i).
+                    do j = i1, i
+                       sum = h( j, k ) + v2*h( j, k+1 )
+                       h( j, k ) = h( j, k ) - sum*t1
+                       h( j, k+1 ) = h( j, k+1 ) - sum*t2
+                    end do
+                    if( wantz ) then
+                       ! accumulate transformations in the matrix z
+                       do j = iloz, ihiz
+                          sum = z( j, k ) + v2*z( j, k+1 )
+                          z( j, k ) = z( j, k ) - sum*t1
+                          z( j, k+1 ) = z( j, k+1 ) - sum*t2
+                       end do
+                    end if
+                 end if
+              end do loop_130
+           end do loop_140
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           150 continue
+           if( l==i ) then
+              ! h(i,i-1) is negligible: one eigenvalue has converged.
+              wr( i ) = h( i, i )
+              wi( i ) = zero
+           else if( l==i-1 ) then
+              ! h(i-1,i-2) is negligible: a pair of eigenvalues have converged.
+              ! transform the 2-by-2 submatrix to standard schur form,
+              ! and compute and store the eigenvalues.
+              call stdlib_slanv2( h( i-1, i-1 ), h( i-1, i ), h( i, i-1 ),h( i, i ), wr( i-1 ), &
+                        wi( i-1 ), wr( i ), wi( i ),cs, sn )
+              if( wantt ) then
+                 ! apply the transformation to the rest of h.
+                 if( i2>i )call stdlib_srot( i2-i, h( i-1, i+1 ), ldh, h( i, i+1 ), ldh,cs, sn )
+                           
+                 call stdlib_srot( i-i1-1, h( i1, i-1 ), 1, h( i1, i ), 1, cs, sn )
+              end if
+              if( wantz ) then
+                 ! apply the transformation to z.
+                 call stdlib_srot( nz, z( iloz, i-1 ), 1, z( iloz, i ), 1, cs, sn )
+              end if
+           end if
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 20
+           160 continue
+           return
+     end subroutine stdlib_slahqr
+
+     !> SLASD2: merges the two sets of singular values together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> singular values are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+     !> SLASD2 is called from SLASD1.
+
+     pure subroutine stdlib_slasd2( nl, nr, sqre, k, d, z, alpha, beta, u, ldu, vt,ldvt, dsigma, &
+               u2, ldu2, vt2, ldvt2, idxp, idx,idxc, idxq, coltyp, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldu, ldu2, ldvt, ldvt2, nl, nr, sqre
+           real(sp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), idx(*), idxc(*), idxp(*)
+           integer(ilp), intent(inout) :: idxq(*)
+           real(sp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(sp), intent(out) :: dsigma(*), u2(ldu2,*), vt2(ldvt2,*), z(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, idxi, idxj, idxjp, j, jp, jprev, k2, m, n, nlp1, nlp2
+           real(sp) :: c, eps, hlftol, s, tau, tol, z1
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre/=1 ) .and. ( sqre/=0 ) ) then
+              info = -3
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           if( ldu<n ) then
+              info = -10
+           else if( ldvt<m ) then
+              info = -12
+           else if( ldu2<n ) then
+              info = -15
+           else if( ldvt2<m ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD2', -info )
+              return
+           end if
+           nlp1 = nl + 1
+           nlp2 = nl + 2
+           ! generate the first part of the vector z; and move the singular
+           ! values in the first part of d one position backward.
+           z1 = alpha*vt( nlp1, nlp1 )
+           z( 1 ) = z1
+           do i = nl, 1, -1
+              z( i+1 ) = alpha*vt( i, nlp1 )
+              d( i+1 ) = d( i )
+              idxq( i+1 ) = idxq( i ) + 1
+           end do
+           ! generate the second part of the vector z.
+           do i = nlp2, m
+              z( i ) = beta*vt( i, nlp2 )
+           end do
+           ! initialize some reference arrays.
+           do i = 2, nlp1
+              coltyp( i ) = 1
+           end do
+           do i = nlp2, n
+              coltyp( i ) = 2
+           end do
+           ! sort the singular values into increasing order
+           do i = nlp2, n
+              idxq( i ) = idxq( i ) + nlp1
+           end do
+           ! dsigma, idxc, idxc, and the first column of u2
+           ! are used as storage space.
+           do i = 2, n
+              dsigma( i ) = d( idxq( i ) )
+              u2( i, 1 ) = z( idxq( i ) )
+              idxc( i ) = coltyp( idxq( i ) )
+           end do
+           call stdlib_slamrg( nl, nr, dsigma( 2 ), 1, 1, idx( 2 ) )
+           do i = 2, n
+              idxi = 1 + idx( i )
+              d( i ) = dsigma( idxi )
+              z( i ) = u2( idxi, 1 )
+              coltyp( i ) = idxc( idxi )
+           end do
+           ! calculate the allowable deflation tolerance
+           eps = stdlib_slamch( 'EPSILON' )
+           tol = max( abs( alpha ), abs( beta ) )
+           tol = eight*eps*max( abs( d( n ) ), tol )
+           ! there are 2 kinds of deflation -- first a value in the z-vector
+           ! is small, second two (or more) singular values are very close
+           ! together (their difference is small).
+           ! if the value in the z-vector is small, we simply permute the
+           ! array so that the corresponding singular value is moved to the
+           ! end.
+           ! if two values in the d-vector are close, we perform a two-sided
+           ! rotation designed to make one of the corresponding z-vector
+           ! entries zero, and then permute the array so that the deflated
+           ! singular value is moved to the end.
+           ! if there are multiple singular values then the problem deflates.
+           ! here the number of equal singular values are found.  as each equal
+           ! singular value is found, an elementary reflector is computed to
+           ! rotate the corresponding singular subspace so that the
+           ! corresponding components of z are zero in this new basis.
+           k = 1
+           k2 = n + 1
+           do j = 2, n
+              if( abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 idxp( k2 ) = j
+                 coltyp( j ) = 4
+                 if( j==n )go to 120
+              else
+                 jprev = j
+                 go to 90
+              end if
+           end do
+           90 continue
+           j = jprev
+           100 continue
+           j = j + 1
+           if( j>n )go to 110
+           if( abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              idxp( k2 ) = j
+              coltyp( j ) = 4
+           else
+              ! check if singular values are close enough to allow deflation.
+              if( abs( d( j )-d( jprev ) )<=tol ) then
+                 ! deflation is possible.
+                 s = z( jprev )
+                 c = z( j )
+                 ! find sqrt(a**2+b**2) without overflow or
+                 ! destructive underflow.
+                 tau = stdlib_slapy2( c, s )
+                 c = c / tau
+                 s = -s / tau
+                 z( j ) = tau
+                 z( jprev ) = zero
+                 ! apply back the givens rotation to the left and right
+                 ! singular vector matrices.
+                 idxjp = idxq( idx( jprev )+1 )
+                 idxj = idxq( idx( j )+1 )
+                 if( idxjp<=nlp1 ) then
+                    idxjp = idxjp - 1
+                 end if
+                 if( idxj<=nlp1 ) then
+                    idxj = idxj - 1
+                 end if
+                 call stdlib_srot( n, u( 1, idxjp ), 1, u( 1, idxj ), 1, c, s )
+                 call stdlib_srot( m, vt( idxjp, 1 ), ldvt, vt( idxj, 1 ), ldvt, c,s )
+                 if( coltyp( j )/=coltyp( jprev ) ) then
+                    coltyp( j ) = 3
+                 end if
+                 coltyp( jprev ) = 4
+                 k2 = k2 - 1
+                 idxp( k2 ) = jprev
+                 jprev = j
+              else
+                 k = k + 1
+                 u2( k, 1 ) = z( jprev )
+                 dsigma( k ) = d( jprev )
+                 idxp( k ) = jprev
+                 jprev = j
+              end if
+           end if
+           go to 100
+           110 continue
+           ! record the last singular value.
+           k = k + 1
+           u2( k, 1 ) = z( jprev )
+           dsigma( k ) = d( jprev )
+           idxp( k ) = jprev
+           120 continue
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four groups of uniform structure (although one or more of these
+           ! groups may be empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 2, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 2
+           psm( 2 ) = 2 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           ! fill out the idxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's, starting from the
+           ! second column. this applies similarly to the rows of vt.
+           do j = 2, n
+              jp = idxp( j )
+              ct = coltyp( jp )
+              idxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the singular values and corresponding singular vectors into
+           ! dsigma, u2, and vt2 respectively.  the singular values/vectors
+           ! which were not deflated go into the first k slots of dsigma, u2,
+           ! and vt2 respectively, while those which were deflated go into the
+           ! last n - k slots, except that the first column/row will be treated
+           ! separately.
+           do j = 2, n
+              jp = idxp( j )
+              dsigma( j ) = d( jp )
+              idxj = idxq( idx( idxp( idxc( j ) ) )+1 )
+              if( idxj<=nlp1 ) then
+                 idxj = idxj - 1
+              end if
+              call stdlib_scopy( n, u( 1, idxj ), 1, u2( 1, j ), 1 )
+              call stdlib_scopy( m, vt( idxj, 1 ), ldvt, vt2( j, 1 ), ldvt2 )
+           end do
+           ! determine dsigma(1), dsigma(2) and z(1)
+           dsigma( 1 ) = zero
+           hlftol = tol / two
+           if( abs( dsigma( 2 ) )<=hlftol )dsigma( 2 ) = hlftol
+           if( m>n ) then
+              z( 1 ) = stdlib_slapy2( z1, z( m ) )
+              if( z( 1 )<=tol ) then
+                 c = one
+                 s = zero
+                 z( 1 ) = tol
+              else
+                 c = z1 / z( 1 )
+                 s = z( m ) / z( 1 )
+              end if
+           else
+              if( abs( z1 )<=tol ) then
+                 z( 1 ) = tol
+              else
+                 z( 1 ) = z1
+              end if
+           end if
+           ! move the rest of the updating row to z.
+           call stdlib_scopy( k-1, u2( 2, 1 ), 1, z( 2 ), 1 )
+           ! determine the first column of u2, the first row of vt2 and the
+           ! last row of vt.
+           call stdlib_slaset( 'A', n, 1, zero, zero, u2, ldu2 )
+           u2( nlp1, 1 ) = one
+           if( m>n ) then
+              do i = 1, nlp1
+                 vt( m, i ) = -s*vt( nlp1, i )
+                 vt2( 1, i ) = c*vt( nlp1, i )
+              end do
+              do i = nlp2, m
+                 vt2( 1, i ) = s*vt( m, i )
+                 vt( m, i ) = c*vt( m, i )
+              end do
+           else
+              call stdlib_scopy( m, vt( nlp1, 1 ), ldvt, vt2( 1, 1 ), ldvt2 )
+           end if
+           if( m>n ) then
+              call stdlib_scopy( m, vt( m, 1 ), ldvt, vt2( m, 1 ), ldvt2 )
+           end if
+           ! the deflated singular values and their corresponding vectors go
+           ! into the back of d, u, and v respectively.
+           if( n>k ) then
+              call stdlib_scopy( n-k, dsigma( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_slacpy( 'A', n, n-k, u2( 1, k+1 ), ldu2, u( 1, k+1 ),ldu )
+              call stdlib_slacpy( 'A', n-k, m, vt2( k+1, 1 ), ldvt2, vt( k+1, 1 ),ldvt )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_slasd3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           return
+     end subroutine stdlib_slasd2
+
+     !> SLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a real M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_slaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<=0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_sgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_sgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_stplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1, ctr * m + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_stplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1, ctr * m + 1), &
+                       ldt,work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_slaswlq
+
+     !> SLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a real M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_slatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_sgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_sgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_stpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1, ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_stpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1, ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_slatsqr
+
+     !> SORGBR: generates one of the real orthogonal matrices Q or P**T
+     !> determined by SGEBRD when reducing a real matrix A to bidiagonal
+     !> form: A = Q * B * P**T.  Q and P**T are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and SORGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and SORGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**T
+     !> is of order N:
+     !> if k < n, P**T = G(k) . . . G(2) G(1) and SORGBR returns the first m
+     !> rows of P**T, where n >= m >= k;
+     !> if k >= n, P**T = G(n-1) . . . G(2) G(1) and SORGBR returns P**T as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_sorgbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_sorgqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_sorgqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_sorglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_sorglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = work( 1 )
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_sgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_sorgqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = zero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = one
+                 do i = 2, m
+                    a( i, 1 ) = zero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_sorgqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**t, determined by a call to stdlib_sgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_sorglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors one
+                 ! row downward, and set the first row and column of p**t to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = one
+                 do i = 2, n
+                    a( i, 1 ) = zero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = zero
+                 end do
+                 if( n>1 ) then
+                    ! form p**t(2:n,2:n)
+                    call stdlib_sorglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sorgbr
+
+     !> If VECT = 'Q', SORMBR: overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**T * C       C * Q**T
+     !> If VECT = 'P', SORMBR overwrites the general real M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'T':      P**T * C       C * P**T
+     !> Here Q and P**T are the orthogonal matrices determined by SGEBRD when
+     !> reducing a real matrix A to bidiagonal form: A = Q * B * P**T. Q and
+     !> P**T are defined as products of elementary reflectors H(i) and G(i)
+     !> respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the orthogonal matrix Q or P**T that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_sormbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), c(ldc,*)
+           real(sp), intent(in) :: tau(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( applyq ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'SORMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'SORMLQ', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'SORMLQ', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SORMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           work( 1 ) = 1
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_sgebrd with nq >= k
+                 call stdlib_sormqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_sgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_sormqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_sgebrd with nq > k
+                 call stdlib_sormlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_sgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_sormlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sormbr
+
+     !> SPBSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_spbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_spbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_spbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_spbsv
+
+     !> SPBSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_spbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*), s(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_spbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_slaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_scopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_scopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_spbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slansb( '1', uplo, n, kd, ab, ldab, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_spbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, iwork,info )
+           ! compute the solution matrix x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_spbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_spbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_spbsvx
+
+     !> SPFTRF: computes the Cholesky factorization of a real symmetric
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**T * U,  if UPLO = 'U', or
+     !> A = L  * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_spftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'T' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_spotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'R', 'L', 'T', 'N', n2, n1, one, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_ssyrk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_spotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_spotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'L', 'L', 'N', 'N', n1, n2, one, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_ssyrk( 'U', 'T', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_spotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_spotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'L', 'U', 'T', 'N', n1, n2, one, a( 0 ), n1,a( n1*n1 ), n1 &
+                              )
+                    call stdlib_ssyrk( 'L', 'T', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_spotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_spotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'R', 'U', 'N', 'N', n2, n1, one, a( n2*n2 ),n2, a( 0 ), n2 &
+                              )
+                    call stdlib_ssyrk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_spotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_spotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'R', 'L', 'T', 'N', k, k, one, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_ssyrk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_spotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_spotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'L', 'L', 'N', 'N', k, k, one, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_ssyrk( 'U', 'T', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_spotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 't'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_spotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'L', 'U', 'T', 'N', k, k, one, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_ssyrk( 'L', 'T', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_spotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_spotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_strsm( 'R', 'U', 'N', 'N', k, k, one,a( k*( k+1 ) ), k, a( 0 ), k &
+                              )
+                    call stdlib_ssyrk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_spotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_spftrf
+
+     !> SPOSV: computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**T* U,  if UPLO = 'U', or
+     !> A = L * L**T,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_sposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**t*u or a = l*l**t.
+           call stdlib_spotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_spotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_sposv
+
+     !> SPOSVX: uses the Cholesky factorization A = U**T*U or A = L*L**T to
+     !> compute the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_sposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(sp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*), s(*)
+           real(sp), intent(out) :: berr(*), ferr(*), work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(sp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_slamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_spoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_slaqsy( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**t *u or a = l*l**t.
+              call stdlib_slacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_spotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_slansy( '1', uplo, n, a, lda, work )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_spocon( uplo, n, af, ldaf, anorm, rcond, work, iwork, info )
+           ! compute the solution matrix x.
+           call stdlib_slacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_spotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_sporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     iwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_slamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_sposvx
+
+     !> STREXC: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that the diagonal block of T with row index IFST is
+     !> moved to row ILST.
+     !> The real Schur form T is reordered by an orthogonal similarity
+     !> transformation Z**T*T*Z, and optionally the matrix Q of Schur vectors
+     !> is updated by postmultiplying it with Z.
+     !> T must be in Schur canonical form (as returned by SHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_strexc( compq, n, t, ldt, q, ldq, ifst, ilst, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(inout) :: ifst, ilst
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldt, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: here, nbf, nbl, nbnext
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test the input arguments.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.wantq .and. .not.stdlib_lsame( compq, 'N' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           ! determine the first row of specified block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ifst>1 ) then
+              if( t( ifst, ifst-1 )/=zero )ifst = ifst - 1
+           end if
+           nbf = 1
+           if( ifst<n ) then
+              if( t( ifst+1, ifst )/=zero )nbf = 2
+           end if
+           ! determine the first row of the final block
+           ! and find out it is 1 by 1 or 2 by 2.
+           if( ilst>1 ) then
+              if( t( ilst, ilst-1 )/=zero )ilst = ilst - 1
+           end if
+           nbl = 1
+           if( ilst<n ) then
+              if( t( ilst+1, ilst )/=zero )nbl = 2
+           end if
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              ! update ilst
+              if( nbf==2 .and. nbl==1 )ilst = ilst - 1
+              if( nbf==1 .and. nbl==2 )ilst = ilst + 1
+              here = ifst
+              10 continue
+              ! swap block with next one below
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here+nbf+1<=n ) then
+                    if( t( here+nbf+1, here+nbf )/=zero )nbnext = 2
+                 end if
+                 call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, nbf, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here + nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here+3<=n ) then
+                    if( t( here+3, here+2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here+1, 1, nbnext,work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, 1, nbnext,work, info )
+                              
+                    here = here + 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here+2, here+1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, 1,nbnext, work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here + 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here+1, 1, 1,work, info )
+                                 
+                       here = here + 2
+                    end if
+                 end if
+              end if
+              if( here<ilst )go to 10
+           else
+              here = ifst
+              20 continue
+              ! swap block with next one above
+              if( nbf==1 .or. nbf==2 ) then
+                 ! current block either 1 by 1 or 2 by 2
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,nbf, work, &
+                           info )
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 here = here - nbnext
+                 ! test if 2 by 2 block breaks into two 1 by 1 blocks
+                 if( nbf==2 ) then
+                    if( t( here+1, here )==zero )nbf = 3
+                 end if
+              else
+                 ! current block consists of two 1 by 1 blocks each of which
+                 ! must be swapped individually
+                 nbnext = 1
+                 if( here>=3 ) then
+                    if( t( here-1, here-2 )/=zero )nbnext = 2
+                 end if
+                 call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here-nbnext, nbnext,1, work, info )
+                           
+                 if( info/=0 ) then
+                    ilst = here
+                    return
+                 end if
+                 if( nbnext==1 ) then
+                    ! swap two 1 by 1 blocks, no problems possible
+                    call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, nbnext, 1,work, info )
+                              
+                    here = here - 1
+                 else
+                    ! recompute nbnext in case 2 by 2 split
+                    if( t( here, here-1 )==zero )nbnext = 1
+                    if( nbnext==2 ) then
+                       ! 2 by 2 block did not split
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here-1, 2, 1,work, info )
+                                 
+                       if( info/=0 ) then
+                          ilst = here
+                          return
+                       end if
+                       here = here - 2
+                    else
+                       ! 2 by 2 block did split
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here, 1, 1,work, info )
+                                 
+                       call stdlib_slaexc( wantq, n, t, ldt, q, ldq, here-1, 1, 1,work, info )
+                                 
+                       here = here - 2
+                    end if
+                 end if
+              end if
+              if( here>ilst )go to 20
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_strexc
+
+     !> STRSEN: reorders the real Schur factorization of a real matrix
+     !> A = Q*T*Q**T, so that a selected cluster of eigenvalues appears in
+     !> the leading diagonal blocks of the upper quasi-triangular matrix T,
+     !> and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+     !> T must be in Schur canonical form (as returned by SHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_strsen( job, compq, select, n, t, ldt, q, ldq, wr, wi,m, s, sep, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, liwork, lwork, n
+           real(sp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: q(ldq,*), t(ldt,*)
+           real(sp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, pair, swap, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, kk, ks, liwmin, lwmin, n1, n2, nn
+           real(sp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else
+              ! set m to the dimension of the specified invariant subspace,
+              ! and test lwork and liwork.
+              m = 0
+              pair = .false.
+              do k = 1, n
+                 if( pair ) then
+                    pair = .false.
+                 else
+                    if( k<n ) then
+                       if( t( k+1, k )==zero ) then
+                          if( select( k ) )m = m + 1
+                       else
+                          pair = .true.
+                          if( select( k ) .or. select( k+1 ) )m = m + 2
+                       end if
+                    else
+                       if( select( n ) )m = m + 1
+                    end if
+                 end if
+              end do
+              n1 = m
+              n2 = n - m
+              nn = n1*n2
+              if(  wantsp ) then
+                 lwmin = max( 1, 2*nn )
+                 liwmin = max( 1, nn )
+              else if( stdlib_lsame( job, 'N' ) ) then
+                 lwmin = max( 1, n )
+                 liwmin = 1
+              else if( stdlib_lsame( job, 'E' ) ) then
+                 lwmin = max( 1, nn )
+                 liwmin = 1
+              end if
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -15
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -17
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_slange( '1', n, n, t, ldt, work )
+              go to 40
+           end if
+           ! collect the selected blocks at the top-left corner of t.
+           ks = 0
+           pair = .false.
+           loop_20: do k = 1, n
+              if( pair ) then
+                 pair = .false.
+              else
+                 swap = select( k )
+                 if( k<n ) then
+                    if( t( k+1, k )/=zero ) then
+                       pair = .true.
+                       swap = swap .or. select( k+1 )
+                    end if
+                 end if
+                 if( swap ) then
+                    ks = ks + 1
+                    ! swap the k-th block to position ks.
+                    ierr = 0
+                    kk = k
+                    if( k/=ks )call stdlib_strexc( compq, n, t, ldt, q, ldq, kk, ks, work,ierr )
+                              
+                    if( ierr==1 .or. ierr==2 ) then
+                       ! blocks too close to swap: exit.
+                       info = 1
+                       if( wants )s = zero
+                       if( wantsp )sep = zero
+                       go to 40
+                    end if
+                    if( pair )ks = ks + 1
+                 end if
+              end if
+           end do loop_20
+           if( wants ) then
+              ! solve sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_slacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_strsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_slange( 'F', n1, n2, work, n1, work )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_slacn2( nn, work( nn+1 ), work, iwork, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve  t11*r - r*t22 = scale*x.
+                    call stdlib_strsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**t*r - r*t22**t = scale*x.
+                    call stdlib_strsyl( 'T', 'T', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! store the output eigenvalues in wr and wi.
+           do k = 1, n
+              wr( k ) = t( k, k )
+              wi( k ) = zero
+           end do
+           do k = 1, n - 1
+              if( t( k+1, k )/=zero ) then
+                 wi( k ) = sqrt( abs( t( k, k+1 ) ) )*sqrt( abs( t( k+1, k ) ) )
+                 wi( k+1 ) = -wi( k )
+              end if
+           end do
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_strsen
+
+     !> STRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a real upper
+     !> quasi-triangular matrix T (or of any matrix Q*T*Q**T with Q
+     !> orthogonal).
+     !> T must be in Schur canonical form (as returned by SHSEQR), that is,
+     !> block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; each
+     !> 2-by-2 diagonal block has its diagonal elements equal and its
+     !> off-diagonal elements of opposite sign.
+
+     subroutine stdlib_strsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm, m, &
+               work, ldwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(out) :: s(*), sep(*), work(ldwork,*)
+           real(sp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: pair, somcon, wantbh, wants, wantsp
+           integer(ilp) :: i, ierr, ifst, ilst, j, k, kase, ks, n2, nn
+           real(sp) :: bignum, cond, cs, delta, dumm, eps, est, lnrm, mu, prod, prod1, prod2, &
+                     rnrm, scale, smlnum, sn
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(sp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 pair = .false.
+                 do k = 1, n
+                    if( pair ) then
+                       pair = .false.
+                    else
+                       if( k<n ) then
+                          if( t( k+1, k )==zero ) then
+                             if( select( k ) )m = m + 1
+                          else
+                             pair = .true.
+                             if( select( k ) .or. select( k+1 ) )m = m + 2
+                          end if
+                       else
+                          if( select( n ) )m = m + 1
+                       end if
+                    end if
+                 end do
+              else
+                 m = n
+              end if
+              if( mm<m ) then
+                 info = -13
+              else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'STRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ks = 0
+           pair = .false.
+           loop_60: do k = 1, n
+              ! determine whether t(k,k) begins a 1-by-1 or 2-by-2 block.
+              if( pair ) then
+                 pair = .false.
+                 cycle loop_60
+              else
+                 if( k<n )pair = t( k+1, k )/=zero
+              end if
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( pair ) then
+                    if( .not.select( k ) .and. .not.select( k+1 ) )cycle loop_60
+                 else
+                    if( .not.select( k ) )cycle loop_60
+                 end if
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 if( .not.pair ) then
+                    ! real eigenvalue.
+                    prod = stdlib_sdot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    rnrm = stdlib_snrm2( n, vr( 1, ks ), 1 )
+                    lnrm = stdlib_snrm2( n, vl( 1, ks ), 1 )
+                    s( ks ) = abs( prod ) / ( rnrm*lnrm )
+                 else
+                    ! complex eigenvalue.
+                    prod1 = stdlib_sdot( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                    prod1 = prod1 + stdlib_sdot( n, vr( 1, ks+1 ), 1, vl( 1, ks+1 ),1 )
+                    prod2 = stdlib_sdot( n, vl( 1, ks ), 1, vr( 1, ks+1 ), 1 )
+                    prod2 = prod2 - stdlib_sdot( n, vl( 1, ks+1 ), 1, vr( 1, ks ),1 )
+                    rnrm = stdlib_slapy2( stdlib_snrm2( n, vr( 1, ks ), 1 ),stdlib_snrm2( n, vr( &
+                              1, ks+1 ), 1 ) )
+                    lnrm = stdlib_slapy2( stdlib_snrm2( n, vl( 1, ks ), 1 ),stdlib_snrm2( n, vl( &
+                              1, ks+1 ), 1 ) )
+                    cond = stdlib_slapy2( prod1, prod2 ) / ( rnrm*lnrm )
+                    s( ks ) = cond
+                    s( ks+1 ) = cond
+                 end if
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the diagonal
+                 ! block beginning at t(k,k) to the (1,1) position.
+                 call stdlib_slacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 ifst = k
+                 ilst = 1
+                 call stdlib_strexc( 'NO Q', n, work, ldwork, dummy, 1, ifst, ilst,work( 1, n+1 ),&
+                            ierr )
+                 if( ierr==1 .or. ierr==2 ) then
+                    ! could not swap because blocks not well separated
+                    scale = one
+                    est = bignum
+                 else
+                    ! reordering successful
+                    if( work( 2, 1 )==zero ) then
+                       ! form c = t22 - lambda*i in work(2:n,2:n).
+                       do i = 2, n
+                          work( i, i ) = work( i, i ) - work( 1, 1 )
+                       end do
+                       n2 = 1
+                       nn = n - 1
+                    else
+                       ! triangularize the 2 by 2 block by unitary
+                       ! transformation u = [  cs   i*ss ]
+                                          ! [ i*ss   cs  ].
+                       ! such that the (1,1) position of work is complex
+                       ! eigenvalue lambda with positive imaginary part. (2,2)
+                       ! position of work is the complex eigenvalue lambda
+                       ! with negative imaginary  part.
+                       mu = sqrt( abs( work( 1, 2 ) ) )*sqrt( abs( work( 2, 1 ) ) )
+                       delta = stdlib_slapy2( mu, work( 2, 1 ) )
+                       cs = mu / delta
+                       sn = -work( 2, 1 ) / delta
+                       ! form
+                       ! c**t = work(2:n,2:n) + i*[rwork(1) ..... rwork(n-1) ]
+                                                ! [   mu                     ]
+                                                ! [         ..               ]
+                                                ! [             ..           ]
+                                                ! [                  mu      ]
+                       ! where c**t is transpose of matrix c,
+                       ! and rwork is stored starting in the n+1-st column of
+                       ! work.
+                       do j = 3, n
+                          work( 2, j ) = cs*work( 2, j )
+                          work( j, j ) = work( j, j ) - work( 1, 1 )
+                       end do
+                       work( 2, 2 ) = zero
+                       work( 1, n+1 ) = two*mu
+                       do i = 2, n - 1
+                          work( i, n+1 ) = sn*work( 1, i+1 )
+                       end do
+                       n2 = 2
+                       nn = 2*( n-1 )
+                    end if
+                    ! estimate norm(inv(c**t))
+                    est = zero
+                    kase = 0
+                    50 continue
+                    call stdlib_slacn2( nn, work( 1, n+2 ), work( 1, n+4 ), iwork,est, kase, &
+                              isave )
+                    if( kase/=0 ) then
+                       if( kase==1 ) then
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c**t*x = scale*c.
+                             call stdlib_slaqtr( .true., .true., n-1, work( 2, 2 ),ldwork, dummy, &
+                                       dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c**t*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_slaqtr( .true., .false., n-1, work( 2, 2 ),ldwork, work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       else
+                          if( n2==1 ) then
+                             ! real eigenvalue: solve c*x = scale*c.
+                             call stdlib_slaqtr( .false., .true., n-1, work( 2, 2 ),ldwork, dummy,&
+                                        dumm, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          else
+                             ! complex eigenvalue: solve
+                             ! c*(p+iq) = scale*(c+id) in real arithmetic.
+                             call stdlib_slaqtr( .false., .false., n-1,work( 2, 2 ), ldwork,work( &
+                                       1, n+1 ), mu, scale,work( 1, n+4 ), work( 1, n+6 ),ierr )
+                          end if
+                       end if
+                       go to 50
+                    end if
+                 end if
+                 sep( ks ) = scale / max( est, smlnum )
+                 if( pair )sep( ks+1 ) = sep( ks )
+              end if
+              if( pair )ks = ks + 1
+           end do loop_60
+           return
+     end subroutine stdlib_strsna
+
+     !> SGELQ: computes an LQ factorization of a real M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_sgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'SGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'SGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if ( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+               lminws = .true.
+               mb = 1
+               nb = n
+             end if
+             if( lwork<lwopt ) then
+               lminws = .true.
+               mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_sgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_slaswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_sgelq
+
+     !> SGELSY: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by orthogonal transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**T [ inv(T11)*Q1**T*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+
+     subroutine stdlib_sgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkmin, lwkopt, mn, nb, nb1, nb2, &
+                     nb3, nb4
+           real(sp) :: anrm, bignum, bnrm, c1, c2, s1, s2, smax, smaxpr, smin, sminpr, smlnum, &
+                     wsize
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           end if
+           ! figure out optimal block size
+           if( info==0 ) then
+              if( mn==0 .or. nrhs==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'SGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'SGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'SORMQR', ' ', m, n, nrhs, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'SORMRQ', ' ', m, n, nrhs, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = mn + max( 2*mn, n + 1, mn + nrhs )
+                 lwkopt = max( lwkmin,mn + 2*n + nb*( n + 1 ), 2*mn + nb*nrhs )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( mn==0 .or. nrhs==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_slaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_slange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_sgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, info )
+                     
+           wsize = mn + work( mn+1 )
+           ! workspace: mn+2*n+nb*(n+1).
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = one
+           work( ismax ) = one
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_slaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_slaic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_slaic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_stzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+           call stdlib_sormqr( 'LEFT', 'TRANSPOSE', m, nrhs, mn, a, lda, work( 1 ),b, ldb, work( &
+                     2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+work( 2*mn+1 ) )
+           ! workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_strsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, one, a, lda,&
+                      b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = zero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**t * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_sormrz( 'LEFT', 'TRANSPOSE', n, nrhs, rank, n-rank, a,lda, work( mn+1 ),&
+                         b, ldb, work( 2*mn+1 ),lwork-2*mn, info )
+           end if
+           ! workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_scopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sgelsy
+
+     !> SGEQR: computes a QR factorization of a real M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_sgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'SGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'SGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if ( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_sgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_slatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_sgeqr
+
+     !> SGETSLS: solves overdetermined or underdetermined real linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'T' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'T' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_sgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(sp) :: anrm, bignum, bnrm, smlnum, tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'T' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'T' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_sgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_sgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_sgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_sgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_sgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_sgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_sgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_sgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=sp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'SGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=sp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_slaset( 'FULL', max( m, n ), nrhs, zero, zero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_slamch( 'S' ) / stdlib_slamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_slabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_slaset( 'F', maxmn, nrhs, zero, zero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_slange( 'M', brow, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_sgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_sgemqr( 'L' , 'T', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_strtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_strtrs( 'U', 'T', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_sgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_sgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_strtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = zero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_sgemlq( 'L', 'T', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_sgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_strtrs( 'LOWER', 'TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_slascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_slascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_slascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_slascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=sp)
+           return
+     end subroutine stdlib_sgetsls
+
+     !> SGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a complex M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in SGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of SGEQRT for more details on the format.
+
+     pure subroutine stdlib_sgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_slatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_sorgtsqr_row;
+              ! d) diagonal d for stdlib_sorhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=sp) / real( mb1 - n,&
+                           KIND=sp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_sorgtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = real( lworkopt,KIND=sp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_slatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_scopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_sorgtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_sorhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_sorhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-one ) then
+                 do j = i, n
+                    a( i, j ) = -one * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_scopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = real( lworkopt,KIND=sp)
+           return
+     end subroutine stdlib_sgetsqrhrt
+
+     !> SLAED2: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny entry in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_slaed2( k, n, n1, d, q, ldq, indxq, rho, z, dlamda, w,q2, indx, indxc,&
+                indxp, coltyp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, k
+           integer(ilp), intent(in) :: ldq, n, n1
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: coltyp(*), indx(*), indxc(*), indxp(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(sp), intent(inout) :: d(*), q(ldq,*), z(*)
+           real(sp), intent(out) :: dlamda(*), q2(*), w(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: mone = -1.0_sp
+           
+           ! Local Arrays 
+           integer(ilp) :: ctot(4), psm(4)
+           ! Local Scalars 
+           integer(ilp) :: ct, i, imax, iq1, iq2, j, jmax, js, k2, n1p1, n2, nj, pj
+           real(sp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( min( 1, ( n / 2 ) )>n1 .or. ( n / 2 )<n1 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_sscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1.  since z is the concatenation of
+           ! two normalized vectors, norm2(z) = sqrt(2).
+           t = one / sqrt( two )
+           call stdlib_sscal( n, t, z, 1 )
+           ! rho = abs( norm(z)**2 * rho )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = n1p1, n
+              indxq( i ) = indxq( i ) + n1
+           end do
+           ! re-integrate the deflated parts from the last pass
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+           end do
+           call stdlib_slamrg( n1, n2, dlamda, 1, 1, indxc )
+           do i = 1, n
+              indx( i ) = indxq( indxc( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_isamax( n, z, 1 )
+           jmax = stdlib_isamax( n, d, 1 )
+           eps = stdlib_slamch( 'EPSILON' )
+           tol = eight*eps*max( abs( d( jmax ) ), abs( z( imax ) ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              iq2 = 1
+              do j = 1, n
+                 i = indx( j )
+                 call stdlib_scopy( n, q( 1, i ), 1, q2( iq2 ), 1 )
+                 dlamda( j ) = d( i )
+                 iq2 = iq2 + n
+              end do
+              call stdlib_slacpy( 'A', n, n, q2, n, q, ldq )
+              call stdlib_scopy( n, dlamda, 1, d, 1 )
+              go to 190
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           do i = 1, n1
+              coltyp( i ) = 1
+           end do
+           do i = n1p1, n
+              coltyp( i ) = 3
+           end do
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              nj = indx( j )
+              if( rho*abs( z( nj ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 coltyp( nj ) = 4
+                 indxp( k2 ) = nj
+                 if( j==n )go to 100
+              else
+                 pj = nj
+                 go to 80
+              end if
+           end do
+           80 continue
+           j = j + 1
+           nj = indx( j )
+           if( j>n )go to 100
+           if( rho*abs( z( nj ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              coltyp( nj ) = 4
+              indxp( k2 ) = nj
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( pj )
+              c = z( nj )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_slapy2( c, s )
+              t = d( nj ) - d( pj )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( nj ) = tau
+                 z( pj ) = zero
+                 if( coltyp( nj )/=coltyp( pj ) )coltyp( nj ) = 2
+                 coltyp( pj ) = 4
+                 call stdlib_srot( n, q( 1, pj ), 1, q( 1, nj ), 1, c, s )
+                 t = d( pj )*c**2 + d( nj )*s**2
+                 d( nj ) = d( pj )*s**2 + d( nj )*c**2
+                 d( pj ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 90 continue
+                 if( k2+i<=n ) then
+                    if( d( pj )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = pj
+                       i = i + 1
+                       go to 90
+                    else
+                       indxp( k2+i-1 ) = pj
+                    end if
+                 else
+                    indxp( k2+i-1 ) = pj
+                 end if
+                 pj = nj
+              else
+                 k = k + 1
+                 dlamda( k ) = d( pj )
+                 w( k ) = z( pj )
+                 indxp( k ) = pj
+                 pj = nj
+              end if
+           end if
+           go to 80
+           100 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           dlamda( k ) = d( pj )
+           w( k ) = z( pj )
+           indxp( k ) = pj
+           ! count up the total number of the various types of columns, then
+           ! form a permutation which positions the four column types into
+           ! four uniform groups (although one or more of these groups may be
+           ! empty).
+           do j = 1, 4
+              ctot( j ) = 0
+           end do
+           do j = 1, n
+              ct = coltyp( j )
+              ctot( ct ) = ctot( ct ) + 1
+           end do
+           ! psm(*) = position in submatrix (of types 1 through 4)
+           psm( 1 ) = 1
+           psm( 2 ) = 1 + ctot( 1 )
+           psm( 3 ) = psm( 2 ) + ctot( 2 )
+           psm( 4 ) = psm( 3 ) + ctot( 3 )
+           k = n - ctot( 4 )
+           ! fill out the indxc array so that the permutation which it induces
+           ! will place all type-1 columns first, all type-2 columns next,
+           ! then all type-3's, and finally all type-4's.
+           do j = 1, n
+              js = indxp( j )
+              ct = coltyp( js )
+              indx( psm( ct ) ) = js
+              indxc( psm( ct ) ) = j
+              psm( ct ) = psm( ct ) + 1
+           end do
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           i = 1
+           iq1 = 1
+           iq2 = 1 + ( ctot( 1 )+ctot( 2 ) )*n1
+           do j = 1, ctot( 1 )
+              js = indx( i )
+              call stdlib_scopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+           end do
+           do j = 1, ctot( 2 )
+              js = indx( i )
+              call stdlib_scopy( n1, q( 1, js ), 1, q2( iq1 ), 1 )
+              call stdlib_scopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq1 = iq1 + n1
+              iq2 = iq2 + n2
+           end do
+           do j = 1, ctot( 3 )
+              js = indx( i )
+              call stdlib_scopy( n2, q( n1+1, js ), 1, q2( iq2 ), 1 )
+              z( i ) = d( js )
+              i = i + 1
+              iq2 = iq2 + n2
+           end do
+           iq1 = iq2
+           do j = 1, ctot( 4 )
+              js = indx( i )
+              call stdlib_scopy( n, q( 1, js ), 1, q2( iq2 ), 1 )
+              iq2 = iq2 + n
+              z( i ) = d( js )
+              i = i + 1
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_slacpy( 'A', n, ctot( 4 ), q2( iq1 ), n,q( 1, k+1 ), ldq )
+              call stdlib_scopy( n-k, z( k+1 ), 1, d( k+1 ), 1 )
+           end if
+           ! copy ctot into coltyp for referencing in stdlib_slaed3.
+           do j = 1, 4
+              coltyp( j ) = ctot( j )
+           end do
+           190 continue
+           return
+     end subroutine stdlib_slaed2
+
+     !> SLAQR2: is identical to SLAQR3 except that it avoids
+     !> recursion by calling SLAHQR instead of SLAQR4.
+     !> Aggressive early deflation:
+     !> This subroutine accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_slaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwkopt
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,max,min,real,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_sgehrd ====
+              call stdlib_sgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_sormhr ====
+              call stdlib_sormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=sp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_slacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_scopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_slaset( 'A', jw, jw, zero, one, v, ldv )
+           call stdlib_slahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, jw, &
+                     v, ldv, infqr )
+           ! ==== stdlib_strexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not.bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_strexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_strexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_slanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_scopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_slarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_slaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_slarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_slarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_slarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_sgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_slacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_scopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_sormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_sgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_slacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_sgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_slacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_sgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_slacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=sp)
+     end subroutine stdlib_slaqr2
+
+     !> SLASD1: computes the SVD of an upper bidiagonal N-by-M matrix B,
+     !> where N = NL + NR + 1 and M = N + SQRE. SLASD1 is called from SLASD0.
+     !> A related subroutine SLASD7 handles the case in which the singular
+     !> values (and the singular vectors in factored form) are desired.
+     !> SLASD1 computes the SVD as follows:
+     !> ( D1(in)    0    0       0 )
+     !> B = U(in) * (   Z1**T   a   Z2**T    b ) * VT(in)
+     !> (   0       0   D2(in)   0 )
+     !> = U(out) * ( D(out) 0) * VT(out)
+     !> where Z**T = (Z1**T a Z2**T b) = u**T VT**T, and u is a vector of dimension M
+     !> with ALPHA and BETA in the NL+1 and NL+2 th entries and zeros
+     !> elsewhere; and the entry b is empty if SQRE = 0.
+     !> The left singular vectors of the original matrix are stored in U, and
+     !> the transpose of the right singular vectors are stored in VT, and the
+     !> singular values are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple singular values or when there are zeros in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine SLASD2.
+     !> The second stage consists of calculating the updated
+     !> singular values. This is done by finding the square roots of the
+     !> roots of the secular equation via the routine SLASD4 (as called
+     !> by SLASD3). This routine also calculates the singular vectors of
+     !> the current problem.
+     !> The final stage consists of computing the updated singular vectors
+     !> directly using the updated singular values.  The singular vectors
+     !> for the current problem are multiplied with the singular vectors
+     !> from the overall problem.
+
+     pure subroutine stdlib_slasd1( nl, nr, sqre, d, alpha, beta, u, ldu, vt, ldvt,idxq, iwork, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, nl, nr, sqre
+           real(sp), intent(inout) :: alpha, beta
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: idxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), u(ldu,*), vt(ldvt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: coltyp, i, idx, idxc, idxp, iq, isigma, iu2, ivt2, iz, k, ldq, ldu2, &
+                     ldvt2, m, n, n1, n2
+           real(sp) :: orgnrm
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( nl<1 ) then
+              info = -1
+           else if( nr<1 ) then
+              info = -2
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD1', -info )
+              return
+           end if
+           n = nl + nr + 1
+           m = n + sqre
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_slasd2 and stdlib_slasd3.
+           ldu2 = n
+           ldvt2 = m
+           iz = 1
+           isigma = iz + m
+           iu2 = isigma + n
+           ivt2 = iu2 + ldu2*n
+           iq = ivt2 + ldvt2*m
+           idx = 1
+           idxc = idx + n
+           coltyp = idxc + n
+           idxp = coltyp + n
+           ! scale.
+           orgnrm = max( abs( alpha ), abs( beta ) )
+           d( nl+1 ) = zero
+           do i = 1, n
+              if( abs( d( i ) )>orgnrm ) then
+                 orgnrm = abs( d( i ) )
+              end if
+           end do
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           alpha = alpha / orgnrm
+           beta = beta / orgnrm
+           ! deflate singular values.
+           call stdlib_slasd2( nl, nr, sqre, k, d, work( iz ), alpha, beta, u, ldu,vt, ldvt, work(&
+            isigma ), work( iu2 ), ldu2,work( ivt2 ), ldvt2, iwork( idxp ), iwork( idx ),iwork( &
+                      idxc ), idxq, iwork( coltyp ), info )
+           ! solve secular equation and update singular vectors.
+           ldq = k
+           call stdlib_slasd3( nl, nr, sqre, k, d, work( iq ), ldq, work( isigma ),u, ldu, work( &
+           iu2 ), ldu2, vt, ldvt, work( ivt2 ),ldvt2, iwork( idxc ), iwork( coltyp ), work( iz ),&
+                     info )
+           ! report the possible convergence failure.
+           if( info/=0 ) then
+              return
+           end if
+           ! unscale.
+           call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           ! prepare the idxq sorting permutation.
+           n1 = k
+           n2 = n - k
+           call stdlib_slamrg( n1, n2, d, 1, -1, idxq )
+           return
+     end subroutine stdlib_slasd1
+
+     !> SLAED1: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix.  This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and eigenvectors of a tridiagonal matrix.  SLAED7 handles
+     !> the case in which eigenvalues only or eigenvalues and eigenvectors
+     !> of a full symmetric matrix (which was reduced to tridiagonal form)
+     !> are desired.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**T ) Q**T(in) = Q(out) * D(out) * Q**T(out)
+     !> where Z = Q**T*u, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine SLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine SLAED4 (as called by SLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_slaed1( n, d, q, ldq, indxq, rho, cutpnt, work, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, n
+           integer(ilp), intent(out) :: info
+           real(sp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: indxq(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), q(ldq,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, cpp1, i, idlmda, indx, indxc, indxp, iq2, is, iw, iz, k, n1, &
+                     n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ldq<max( 1, n ) ) then
+              info = -4
+           else if( min( 1, n / 2 )>cutpnt .or. ( n / 2 )<cutpnt ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED1', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are integer pointers which indicate
+           ! the portion of the workspace
+           ! used by a particular array in stdlib_slaed2 and stdlib_slaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq2 = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           call stdlib_scopy( cutpnt, q( cutpnt, 1 ), ldq, work( iz ), 1 )
+           cpp1 = cutpnt + 1
+           call stdlib_scopy( n-cutpnt, q( cpp1, cpp1 ), ldq, work( iz+cutpnt ), 1 )
+           ! deflate eigenvalues.
+           call stdlib_slaed2( k, n, cutpnt, d, q, ldq, indxq, rho, work( iz ),work( idlmda ), &
+           work( iw ), work( iq2 ),iwork( indx ), iwork( indxc ), iwork( indxp ),iwork( coltyp ), &
+                     info )
+           if( info/=0 )go to 20
+           ! solve secular equation.
+           if( k/=0 ) then
+              is = ( iwork( coltyp )+iwork( coltyp+1 ) )*cutpnt +( iwork( coltyp+1 )+iwork( &
+                        coltyp+2 ) )*( n-cutpnt ) + iq2
+              call stdlib_slaed3( k, n, cutpnt, d, q, ldq, rho, work( idlmda ),work( iq2 ), iwork(&
+                         indxc ), iwork( coltyp ),work( iw ), work( is ), info )
+              if( info/=0 )go to 20
+           ! prepare the indxq sorting permutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_slamrg( n1, n2, d, 1, -1, indxq )
+           else
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           20 continue
+           return
+     end subroutine stdlib_slaed1
+
+     !> SLAED0: computes all eigenvalues and corresponding eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+
+     pure subroutine stdlib_slaed0( icompq, qsiz, n, d, e, q, ldq, qstore, ldqs,work, iwork, info &
+               )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldq, ldqs, n, qsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*), q(ldq,*)
+           real(sp), intent(out) :: qstore(ldqs,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, subpbs, &
+                     tlvls
+           real(sp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( icompq<0 .or. icompq>2 ) then
+              info = -1
+           else if( ( icompq==1 ) .and. ( qsiz<max( 0, n ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -7
+           else if( ldqs<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'SLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           if( icompq/=2 ) then
+              ! set up workspaces for eigenvalues only/accumulate new vectors
+              ! routine
+              temp = log( real( n,KIND=sp) ) / log( two )
+              lgn = int( temp,KIND=ilp)
+              if( 2**lgn<n )lgn = lgn + 1
+              if( 2**lgn<n )lgn = lgn + 1
+              iprmpt = indxq + n + 1
+              iperm = iprmpt + n*lgn
+              iqptr = iperm + n*lgn
+              igivpt = iqptr + n + 2
+              igivcl = igivpt + n*lgn
+              igivnm = 1
+              iq = igivnm + 2*n*lgn
+              iwrem = iq + n**2 + 1
+              ! initialize pointers
+              do i = 0, subpbs
+                 iwork( iprmpt+i ) = 1
+                 iwork( igivpt+i ) = 1
+              end do
+              iwork( iqptr ) = 1
+           end if
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           loop_70: do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              if( icompq==2 ) then
+                 call stdlib_ssteqr( 'I', matsiz, d( submat ), e( submat ),q( submat, submat ), &
+                           ldq, work, info )
+                 if( info/=0 )go to 130
+              else
+                 call stdlib_ssteqr( 'I', matsiz, d( submat ), e( submat ),work( iq-1+iwork( &
+                           iqptr+curr ) ), matsiz, work,info )
+                 if( info/=0 )go to 130
+                 if( icompq==1 ) then
+                    call stdlib_sgemm( 'N', 'N', qsiz, matsiz, matsiz, one,q( 1, submat ), ldq, &
+                    work( iq-1+iwork( iqptr+curr ) ), matsiz, zero, qstore( 1, submat ),ldqs )
+                              
+                 end if
+                 iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+                 curr = curr + 1
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do loop_70
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              loop_90: do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.
+           ! stdlib_slaed1 is used only for the full eigensystem of a tridiagonal
+           ! matrix.
+           ! stdlib_slaed7 handles the cases in which eigenvalues only or eigenvalues
+           ! and eigenvectors of a full symmetric matrix (which was reduced to
+           ! tridiagonal form) are desired.
+                 if( icompq==2 ) then
+                    call stdlib_slaed1( matsiz, d( submat ), q( submat, submat ),ldq, iwork( &
+                    indxq+submat ),e( submat+msd2-1 ), msd2, work,iwork( subpbs+1 ), info )
+                              
+                 else
+                    call stdlib_slaed7( icompq, matsiz, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                    qstore( 1, submat ), ldqs,iwork( indxq+submat ), e( submat+msd2-1 ),msd2, &
+                    work( iq ), iwork( iqptr ),iwork( iprmpt ), iwork( iperm ),iwork( igivpt ), &
+                    iwork( igivcl ),work( igivnm ), work( iwrem ),iwork( subpbs+1 ), info )
+                              
+                 end if
+                 if( info/=0 )go to 130
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do loop_90
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           if( icompq==1 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_scopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+              end do
+              call stdlib_scopy( n, work, 1, d, 1 )
+           else if( icompq==2 ) then
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+                 call stdlib_scopy( n, q( 1, j ), 1, work( n*i+1 ), 1 )
+              end do
+              call stdlib_scopy( n, work, 1, d, 1 )
+              call stdlib_slacpy( 'A', n, n, work( n+1 ), n, q, ldq )
+           else
+              do i = 1, n
+                 j = iwork( indxq+i )
+                 work( i ) = d( j )
+              end do
+              call stdlib_scopy( n, work, 1, d, 1 )
+           end if
+           go to 140
+           130 continue
+           info = submat*( n+1 ) + submat + matsiz - 1
+           140 continue
+           return
+     end subroutine stdlib_slaed0
+
+     !> SSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band real symmetric matrix can also be
+     !> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See SLAED3 for details.
+
+     pure subroutine stdlib_sstedc( compz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, lwmin, m, smlsiz, start, &
+                     storez, strtrw
+           real(sp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,max,mod,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'SSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else if( n<=smlsiz ) then
+                 liwmin = 1
+                 lwmin = 2*( n - 1 )
+              else
+                 lgn = int( log( real( n,KIND=sp) )/log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( icompz==1 ) then
+                    lwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                    liwmin = 6 + 6*n + 5*n*lgn
+                 else if( icompz==2 ) then
+                    lwmin = 1 + 4*n + n**2
+                    liwmin = 3 + 5*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not. lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not. lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEDC', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_ssterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_ssterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_ssterf( n, d, e, info )
+              go to 50
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_ssteqr( compz, n, d, e, z, ldz, work, info )
+           else
+              ! if compz = 'v', the z matrix must be stored elsewhere for later
+              ! use.
+              if( icompz==1 ) then
+                 storez = 1 + n*n
+              else
+                 storez = 1
+              end if
+              if( icompz==2 ) then
+                 call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+              end if
+              ! scale.
+              orgnrm = stdlib_slanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 50
+              eps = stdlib_slamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              10 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 20 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 20
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m==1 ) then
+                    start = finish + 1
+                    go to 10
+                 end if
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_slanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_slascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_slascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    if( icompz==1 ) then
+                       strtrw = 1
+                    else
+                       strtrw = start
+                    end if
+                    call stdlib_slaed0( icompz, n, m, d( start ), e( start ),z( strtrw, start ), &
+                              ldz, work( 1 ), n,work( storez ), iwork, info )
+                    if( info/=0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 50
+                    end if
+                    ! scale back.
+                    call stdlib_slascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    if( icompz==1 ) then
+                       ! since qr won't update a z matrix which is larger than
+                       ! the length of d, we must solve the sub-problem in a
+                       ! workspace and then multiply back into z.
+                       call stdlib_ssteqr( 'I', m, d( start ), e( start ), work, m,work( m*m+1 ), &
+                                 info )
+                       call stdlib_slacpy( 'A', n, m, z( 1, start ), ldz,work( storez ), n )
+                                 
+                       call stdlib_sgemm( 'N', 'N', n, m, m, one,work( storez ), n, work, m, zero,&
+                                 z( 1, start ), ldz )
+                    else if( icompz==2 ) then
+                       call stdlib_ssteqr( 'I', m, d( start ), e( start ),z( start, start ), ldz, &
+                                 work, info )
+                    else
+                       call stdlib_ssterf( m, d( start ), e( start ), info )
+                    end if
+                    if( info/=0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 50
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 10
+              end if
+              ! endwhile
+              if( icompz==0 ) then
+                ! use quick sort
+                call stdlib_slasrt( 'I', n, d, info )
+              else
+                ! use selection sort to minimize swaps of eigenvectors
+                do ii = 2, n
+                   i = ii - 1
+                   k = i
+                   p = d( i )
+                   do j = ii, n
+                      if( d( j )<p ) then
+                         k = j
+                         p = d( j )
+                      end if
+                   end do
+                   if( k/=i ) then
+                      d( k ) = d( i )
+                      d( i ) = p
+                      call stdlib_sswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                   end if
+                end do
+              end if
+           end if
+           50 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sstedc
+
+     !> SSTEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric tridiagonal matrix. If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_sstevd( jobz, n, d, e, z, ldz, work, lwork, iwork,liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iscale, liwmin, lwmin
+           real(sp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tnrm
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           liwmin = 1
+           lwmin = 1
+           if( n>1 .and. wantz ) then
+              lwmin = 1 + 4*n + n**2
+              liwmin = 3 + 5*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           tnrm = stdlib_slanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( n, sigma, d, 1 )
+              call stdlib_sscal( n-1, sigma, e( 1 ), 1 )
+           end if
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvalues and
+           ! eigenvectors, call stdlib_sstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, d, e, info )
+           else
+              call stdlib_sstedc( 'I', n, d, e, z, ldz, work, lwork, iwork, liwork,info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_sscal( n, one / sigma, d, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sstevd
+
+     !> SSYEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> real symmetric matrix A. If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+     !> Because of large use of BLAS of level 3, SSYEVD needs N**2 more
+     !> workspace than SSYEVX.
+
+     subroutine stdlib_ssyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwk2, indwrk, iscale, liopt, liwmin, llwork, &
+                     llwrk2, lopt, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+                 lopt = lwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n + 1
+                 end if
+                 lopt = max( lwmin, 2*n +stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 ) )
+                           
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -10
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = a( 1, 1 )
+              if( wantz )a( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansy( 'M', uplo, n, a, lda, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_slascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_ssytrd to reduce symmetric matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_ssytrd( uplo, n, a, lda, w, work( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_sstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_sormtr to multiply it by the
+           ! householder transformations stored in a.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_sstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_sormtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_slacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_sscal( n, one / sigma, w, 1 )
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_ssyevd
+
+     !> SSYGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_ssygvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 6*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n + 1
+           end if
+           lopt = lwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_spotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_ssygst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_ssyevd( jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork,info )
+           lopt = max( real( lopt,KIND=sp), real( work( 1 ),KIND=sp) )
+           liopt = max( real( liopt,KIND=sp), real( iwork( 1 ),KIND=sp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t*y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 call stdlib_strsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t*y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_strmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, one,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_ssygvd
+
+     !> SSBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a real symmetric band matrix A. If eigenvectors are desired, it uses
+     !> a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_ssbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, iwork, liwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, iscale, liwmin, llwrk2, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else
+              if( wantz ) then
+                 liwmin = 3 + 5*n
+                 lwmin = 1 + 5*n + 2*n**2
+              else
+                 liwmin = 1
+                 lwmin = 2*n
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ab( 1, 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansb( 'M', uplo, n, kd, ab, ldab, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_slascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_slascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_ssbtrd to reduce symmetric band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_ssbtrd( jobz, uplo, n, kd, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, call stdlib_sstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_sstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_sgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_slacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_sscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_ssbevd
+
+     !> SSBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite banded eigenproblem, of the
+     !> form A*x=(lambda)*B*x.  Here A and B are assumed to be symmetric and
+     !> banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_ssbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llwrk2, lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              liwmin = 1
+              lwmin = 1
+           else if( wantz ) then
+              liwmin = 3 + 5*n
+              lwmin = 1 + 5*n + 2*n**2
+           else
+              liwmin = 1
+              lwmin = 2*n
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_spbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = indwrk + n*n
+           llwrk2 = lwork - indwk2 + 1
+           call stdlib_ssbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, iinfo )
+                     
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_ssbtrd( vect, uplo, n, ka, ab, ldab, w, work( inde ), z, ldz,work( indwrk )&
+                     , iinfo )
+           ! for eigenvalues only, call stdlib_ssterf. for eigenvectors, call stdlib_sstedc.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              call stdlib_sstedc( 'I', n, w, work( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, iwork, liwork, info )
+              call stdlib_sgemm( 'N', 'N', n, n, n, one, z, ldz, work( indwrk ), n,zero, work( &
+                        indwk2 ), n )
+              call stdlib_slacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_ssbgvd
+
+     !> SSPEVD: computes all the eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A in packed storage. If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_sspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,iwork, liwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ap(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, inde, indtau, indwrk, iscale, liwmin, llwork, lwmin
+           real(sp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              iwork( 1 ) = liwmin
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = ap( 1 )
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_slansp( 'M', uplo, n, ap, work )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_ssptrd to reduce symmetric packed matrix to tridiagonal form.
+           inde = 1
+           indtau = inde + n
+           call stdlib_ssptrd( uplo, n, ap, w, work( inde ), work( indtau ), iinfo )
+           ! for eigenvalues only, call stdlib_ssterf.  for eigenvectors, first call
+           ! stdlib_sstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_sopmtr to multiply it by the
+           ! householder transformations represented in ap.
+           if( .not.wantz ) then
+              call stdlib_ssterf( n, w, work( inde ), info )
+           else
+              indwrk = indtau + n
+              llwork = lwork - indwrk + 1
+              call stdlib_sstedc( 'I', n, w, work( inde ), z, ldz, work( indwrk ),llwork, iwork, &
+                        liwork, info )
+              call stdlib_sopmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 )call stdlib_sscal( n, one / sigma, w, 1 )
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sspevd
+
+     !> SSPGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a real generalized symmetric-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be symmetric, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_sspgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: ap(*), bp(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 liwmin = 1
+                 lwmin = 1
+              else
+                 if( wantz ) then
+                    liwmin = 3 + 5*n
+                    lwmin = 1 + 6*n + 2*n**2
+                 else
+                    liwmin = 1
+                    lwmin = 2*n
+                 end if
+              end if
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of bp.
+           call stdlib_spptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_sspgst( itype, uplo, n, ap, bp, info )
+           call stdlib_sspevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, iwork,liwork, info )
+                     
+           lwmin = max( real( lwmin,KIND=sp), real( work( 1 ),KIND=sp) )
+           liwmin = max( real( liwmin,KIND=sp), real( iwork( 1 ),KIND=sp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**t *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'T'
+                 end if
+                 do j = 1, neig
+                    call stdlib_stpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**t *y
+                 if( upper ) then
+                    trans = 'T'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_stpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sspgvd
+
+     !> SBDSDC: computes the singular value decomposition (SVD) of a real
+     !> N-by-N (upper or lower) bidiagonal matrix B:  B = U * S * VT,
+     !> using a divide and conquer method, where S is a diagonal matrix
+     !> with non-negative diagonal elements (the singular values of B), and
+     !> U and VT are orthogonal matrices of left and right singular vectors,
+     !> respectively. SBDSDC can be used to compute all singular values,
+     !> and optionally, singular vectors or singular vectors in compact form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See SLASD3 for details.
+     !> The code currently calls SLASDQ if singular values only are desired.
+     !> However, it can be slightly modified to compute singular values
+     !> using the divide and conquer method.
+
+     pure subroutine stdlib_sbdsdc( uplo, compq, n, d, e, u, ldu, vt, ldvt, q, iq,work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iq(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: q(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+        ! changed dimension statement in comment describing e from (n) to
+        ! (n-1).  sven, 17 feb 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: difl, difr, givcol, givnum, givptr, i, ic, icompq, ierr, ii, is, iu, &
+           iuplo, ivt, j, k, kk, mlvl, nm1, nsize, perm, poles, qstart, smlsiz, smlszp, sqre, &
+                     start, wstart, z
+           real(sp) :: cs, eps, orgnrm, p, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: real,abs,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( stdlib_lsame( compq, 'N' ) ) then
+              icompq = 0
+           else if( stdlib_lsame( compq, 'P' ) ) then
+              icompq = 1
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              icompq = 2
+           else
+              icompq = -1
+           end if
+           if( iuplo==0 ) then
+              info = -1
+           else if( icompq<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( ldu<1 ) .or. ( ( icompq==2 ) .and. ( ldu<n ) ) ) then
+              info = -7
+           else if( ( ldvt<1 ) .or. ( ( icompq==2 ) .and. ( ldvt<n ) ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SBDSDC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'SBDSDC', ' ', 0, 0, 0, 0 )
+           if( n==1 ) then
+              if( icompq==1 ) then
+                 q( 1 ) = sign( one, d( 1 ) )
+                 q( 1+smlsiz*n ) = one
+              else if( icompq==2 ) then
+                 u( 1, 1 ) = sign( one, d( 1 ) )
+                 vt( 1, 1 ) = one
+              end if
+              d( 1 ) = abs( d( 1 ) )
+              return
+           end if
+           nm1 = n - 1
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           wstart = 1
+           qstart = 3
+           if( icompq==1 ) then
+              call stdlib_scopy( n, d, 1, q( 1 ), 1 )
+              call stdlib_scopy( n-1, e, 1, q( n+1 ), 1 )
+           end if
+           if( iuplo==2 ) then
+              qstart = 5
+              if( icompq == 2 ) wstart = 2*n - 1
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( icompq==1 ) then
+                    q( i+2*n ) = cs
+                    q( i+3*n ) = sn
+                 else if( icompq==2 ) then
+                    work( i ) = cs
+                    work( nm1+i ) = -sn
+                 end if
+              end do
+           end if
+           ! if icompq = 0, use stdlib_slasdq to compute the singular values.
+           if( icompq==0 ) then
+              ! ignore wstart, instead using work( 1 ), since the two vectors
+              ! for cs and -sn above are added only if icompq == 2,
+              ! and adding them exceeds documented work size of 4*n.
+              call stdlib_slasdq( 'U', 0, n, 0, 0, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( 1 ), &
+                        info )
+              go to 40
+           end if
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              if( icompq==2 ) then
+                 call stdlib_slaset( 'A', n, n, zero, one, u, ldu )
+                 call stdlib_slaset( 'A', n, n, zero, one, vt, ldvt )
+                 call stdlib_slasdq( 'U', 0, n, n, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work( &
+                           wstart ), info )
+              else if( icompq==1 ) then
+                 iu = 1
+                 ivt = iu + n
+                 call stdlib_slaset( 'A', n, n, zero, one, q( iu+( qstart-1 )*n ),n )
+                 call stdlib_slaset( 'A', n, n, zero, one, q( ivt+( qstart-1 )*n ),n )
+                 call stdlib_slasdq( 'U', 0, n, n, n, 0, d, e,q( ivt+( qstart-1 )*n ), n,q( iu+( &
+                           qstart-1 )*n ), n,q( iu+( qstart-1 )*n ), n, work( wstart ),info )
+              end if
+              go to 40
+           end if
+           if( icompq==2 ) then
+              call stdlib_slaset( 'A', n, n, zero, one, u, ldu )
+              call stdlib_slaset( 'A', n, n, zero, one, vt, ldvt )
+           end if
+           ! scale.
+           orgnrm = stdlib_slanst( 'M', n, d, e )
+           if( orgnrm==zero )return
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, ierr )
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, ierr )
+           eps = stdlib_slamch( 'EPSILON' )
+           mlvl = int( log( real( n,KIND=sp) / real( smlsiz+1,KIND=sp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           if( icompq==1 ) then
+              iu = 1
+              ivt = 1 + smlsiz
+              difl = ivt + smlszp
+              difr = difl + mlvl
+              z = difr + mlvl*2
+              ic = z + mlvl
+              is = ic + 1
+              poles = is + 1
+              givnum = poles + 2*mlvl
+              k = 1
+              givptr = 2
+              perm = 3
+              givcol = perm + mlvl
+           end if
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           start = 1
+           sqre = 0
+           loop_30: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+              ! subproblem found. first determine its size and then
+              ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+              ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - start + 1
+                 else if( abs( e( i ) )>=eps ) then
+              ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - start + 1
+                 else
+              ! a subproblem with e(nm1) small. this implies an
+              ! 1-by-1 subproblem at d(n). solve this 1-by-1 problem
+              ! first.
+                    nsize = i - start + 1
+                    if( icompq==2 ) then
+                       u( n, n ) = sign( one, d( n ) )
+                       vt( n, n ) = one
+                    else if( icompq==1 ) then
+                       q( n+( qstart-1 )*n ) = sign( one, d( n ) )
+                       q( n+( smlsiz+qstart-1 )*n ) = one
+                    end if
+                    d( n ) = abs( d( n ) )
+                 end if
+                 if( icompq==2 ) then
+                    call stdlib_slasd0( nsize, sqre, d( start ), e( start ),u( start, start ), &
+                              ldu, vt( start, start ),ldvt, smlsiz, iwork, work( wstart ), info )
+                 else
+                    call stdlib_slasda( icompq, smlsiz, nsize, sqre, d( start ),e( start ), q( &
+                    start+( iu+qstart-2 )*n ), n,q( start+( ivt+qstart-2 )*n ),iq( start+k*n ), q(&
+                     start+( difl+qstart-2 )*n ), q( start+( difr+qstart-2 )*n ),q( start+( z+&
+                     qstart-2 )*n ),q( start+( poles+qstart-2 )*n ),iq( start+givptr*n ), iq( &
+                     start+givcol*n ),n, iq( start+perm*n ),q( start+( givnum+qstart-2 )*n ),q( &
+                     start+( ic+qstart-2 )*n ),q( start+( is+qstart-2 )*n ),work( wstart ), iwork,&
+                                info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+                 start = i + 1
+              end if
+           end do loop_30
+           ! unscale
+           call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, ierr )
+           40 continue
+           ! use selection sort to minimize swaps of singular vectors
+           do ii = 2, n
+              i = ii - 1
+              kk = i
+              p = d( i )
+              do j = ii, n
+                 if( d( j )>p ) then
+                    kk = j
+                    p = d( j )
+                 end if
+              end do
+              if( kk/=i ) then
+                 d( kk ) = d( i )
+                 d( i ) = p
+                 if( icompq==1 ) then
+                    iq( i ) = kk
+                 else if( icompq==2 ) then
+                    call stdlib_sswap( n, u( 1, i ), 1, u( 1, kk ), 1 )
+                    call stdlib_sswap( n, vt( i, 1 ), ldvt, vt( kk, 1 ), ldvt )
+                 end if
+              else if( icompq==1 ) then
+                 iq( i ) = i
+              end if
+           end do
+           ! if icompq = 1, use iq(n,1) as the indicator for uplo
+           if( icompq==1 ) then
+              if( iuplo==1 ) then
+                 iq( n ) = 1
+              else
+                 iq( n ) = 0
+              end if
+           end if
+           ! if b is lower bidiagonal, update u by those givens rotations
+           ! which rotated b to be upper bidiagonal
+           if( ( iuplo==2 ) .and. ( icompq==2 ) )call stdlib_slasr( 'L', 'V', 'B', n, n, work( 1 )&
+                     , work( n ), u, ldu )
+           return
+     end subroutine stdlib_sbdsdc
+
+     !> SBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**T
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**T*VT instead of
+     !> P**T, for given real input matrices U and VT.  When U and VT are the
+     !> orthogonal matrices that reduce a general matrix A to bidiagonal
+     !> form:  A = U*B*VT, as computed by SGEBRD, then
+     !> A = (U*Q) * S * (P**T*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**T*C
+     !> for a given real input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_sbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, work, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: hndrth = 0.01_sp
+           real(sp), parameter :: hndrd = 100.0_sp
+           real(sp), parameter :: meigth = -0.125_sp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, iterdivn, j, ll, lll, m, maxitdivn, nm1, nm12, &
+                     nm13, oldll, oldm
+           real(sp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_slasq1( n, d, e, work, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_slamch( 'EPSILON' )
+           unfl = stdlib_slamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 work( i ) = cs
+                 work( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_slasr( 'R', 'V', 'F', nru, n, work( 1 ), work( n ), u,ldu )
+                        
+              if( ncc>0 )call stdlib_slasr( 'L', 'V', 'F', n, ncc, work( 1 ), work( n ), c,ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=sp) )
+              thresh = max( tol*sminoa, maxitr*(n*(n*unfl)) )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*(n*(n*unfl)) )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxitdivn = maxitr*n
+           iterdivn = 0
+           iter = -1
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>=n ) then
+              iter = iter - n
+              iterdivn = iterdivn + 1
+              if( iterdivn>=maxitdivn )go to 200
+           end if
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_slasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_srot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt, cosr,sinr &
+                        )
+              if( nru>0 )call stdlib_srot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+              if( ncc>0 )call stdlib_srot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_slas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_slas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_slartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_slartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll+1 ) = cs
+                    work( i-ll+1+nm1 ) = sn
+                    work( i-ll+1+nm12 ) = oldcs
+                    work( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_slasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_slasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_slasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_slartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_slartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    work( i-ll ) = cs
+                    work( i-ll+nm1 ) = -sn
+                    work( i-ll+nm12 ) = oldcs
+                    work( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_slasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_slasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_slasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_slartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_slartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    work( i-ll+1 ) = cosr
+                    work( i-ll+1+nm1 ) = sinr
+                    work( i-ll+1+nm12 ) = cosl
+                    work( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_slasr( 'L', 'V', 'F', m-ll+1, ncvt, work( 1 ),work( n ), &
+                           vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_slasr( 'R', 'V', 'F', nru, m-ll+1, work( nm12+1 ),work( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_slasr( 'L', 'V', 'F', m-ll+1, ncc, work( nm12+1 ),work( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_slartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_slartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    work( i-ll ) = cosr
+                    work( i-ll+nm1 ) = -sinr
+                    work( i-ll+nm12 ) = cosl
+                    work( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_slasr( 'L', 'V', 'B', m-ll+1, ncvt, work( nm12+1 ),work( &
+                           nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_slasr( 'R', 'V', 'B', nru, m-ll+1, work( 1 ),work( n ), u(&
+                            1, ll ), ldu )
+                 if( ncc>0 )call stdlib_slasr( 'L', 'V', 'B', m-ll+1, ncc, work( 1 ),work( n ), c(&
+                            ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_sscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_sswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_sswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_sswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_sbdsqr
+
+     !> SGEES: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A matrix is in real Schur form if it is upper quasi-triangular with
+     !> 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in the
+     !> form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_sgees( jobvs, sort, select, n, a, lda, sdim, wr, wi,vs, ldvs, work, lwork, &
+               bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_s) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, maxwrk, minwrk
+           real(sp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_shseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'SGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_shseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'SORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_slascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_sgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_sgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_slacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (workspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_shseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (workspace: none needed)
+              call stdlib_strsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, s, sep, &
+                        work( iwrk ), lwork-iwrk+1, idum, 1,icond )
+              if( icond>0 )info = n + icond
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (workspace: need n)
+              call stdlib_sgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_slascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_scopy( n, a, lda+1, wr, 1 )
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi,max( ilo-1, 1 ), &
+                              ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_sswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_sswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                             call stdlib_sswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              ! undo scaling for the imaginary part of the eigenvalues
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgees
+
+     !> SGEESX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues, the real Schur form T, and, optionally, the matrix of
+     !> Schur vectors Z.  This gives the Schur factorization A = Z*T*(Z**T).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> real Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_sp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A real matrix is in real Schur form if it is upper quasi-triangular
+     !> with 1-by-1 and 2-by-2 blocks. 2-by-2 blocks will be standardized in
+     !> the form
+     !> [  a  b  ]
+     !> [  c  a  ]
+     !> where b*c < 0. The eigenvalues of such a block are a +- sqrt(bc).
+
+     subroutine stdlib_sgeesx( jobvs, sort, select, sense, n, a, lda, sdim,wr, wi, vs, ldvs, &
+               rconde, rcondv, work, lwork,iwork, liwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, liwork, lwork, n
+           real(sp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: vs(ldvs,*), wi(*), work(*), wr(*)
+           ! Function Arguments 
+           procedure(stdlib_select_s) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, lastsl, lquery, lst2sl, scalea, wantsb, wantse, wantsn, wantst, &
+                     wantsv, wantvs
+           integer(ilp) :: hswork, i, i1, i2, ibal, icond, ierr, ieval, ihi, ilo, inxt, ip, itau, &
+                     iwrk, lwrk, liwrk, maxwrk, minwrk
+           real(sp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "rworkspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! iworkspace refers to integer workspace.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_shseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_strsen later
+             ! in the code.)
+           if( info==0 ) then
+              liwrk = 1
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'SGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 3*n
+                 call stdlib_shseqr( 'S', jobvs, n, 1, n, a, lda, wr, wi, vs, ldvs,work, -1, &
+                           ieval )
+                 hswork = work( 1 )
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, n + hswork )
+                 else
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'SORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    maxwrk = max( maxwrk, n + hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, n + ( n*n )/2 )
+                 if( wantsv .or. wantsb )liwrk = ( n*n )/4
+              end if
+              iwork( 1 ) = liwrk
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<1 .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_slascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_sgebal( 'P', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (rworkspace: need 3*n, prefer 2*n+n*nb)
+           itau = n + ibal
+           iwrk = n + itau
+           call stdlib_sgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_slacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate orthogonal matrix in vs
+              ! (rworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (rworkspace: need n+1, prefer n+hswork (see comments) )
+           iwrk = itau
+           call stdlib_shseqr( 'S', jobvs, n, ilo, ihi, a, lda, wr, wi, vs, ldvs,work( iwrk ), &
+                     lwork-iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea ) then
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, n, 1, wr, n, ierr )
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, n, 1, wi, n, ierr )
+              end if
+              do i = 1, n
+                 bwork( i ) = select( wr( i ), wi( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (rworkspace: if sense is not 'n', need n+2*sdim*(n-sdim)
+                           ! otherwise, need n )
+              ! (iworkspace: if sense is 'v' or 'b', need sdim*(n-sdim)
+                           ! otherwise, need 0 )
+              call stdlib_strsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, wr, wi,sdim, rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,iwork, liwork, icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, n+2*sdim*( n-sdim ) )
+              if( icond==-15 ) then
+                 ! not enough real workspace
+                 info = -16
+              else if( icond==-17 ) then
+                 ! not enough integer workspace
+                 info = -18
+              else if( icond>0 ) then
+                 ! stdlib_strsen failed to reorder or to restore standard schur form
+                 info = icond + n
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (rworkspace: need n)
+              call stdlib_sgebak( 'P', 'R', n, ilo, ihi, work( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_slascl( 'H', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_scopy( n, a, lda+1, wr, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+              if( cscale==smlnum ) then
+                 ! if scaling back towards underflow, adjust wi if an
+                 ! offdiagonal element of a 2-by-2 block in the schur form
+                 ! underflows.
+                 if( ieval>0 ) then
+                    i1 = ieval + 1
+                    i2 = ihi - 1
+                    call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+                 else if( wantst ) then
+                    i1 = 1
+                    i2 = n - 1
+                 else
+                    i1 = ilo
+                    i2 = ihi - 1
+                 end if
+                 inxt = i1 - 1
+                 loop_20: do i = i1, i2
+                    if( i<inxt )cycle loop_20
+                    if( wi( i )==zero ) then
+                       inxt = i + 1
+                    else
+                       if( a( i+1, i )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                       else if( a( i+1, i )/=zero .and. a( i, i+1 )==zero ) then
+                          wi( i ) = zero
+                          wi( i+1 ) = zero
+                          if( i>1 )call stdlib_sswap( i-1, a( 1, i ), 1, a( 1, i+1 ), 1 )
+                          if( n>i+1 )call stdlib_sswap( n-i-1, a( i, i+2 ), lda,a( i+1, i+2 ), &
+                                    lda )
+                          if( wantvs ) then
+                            call stdlib_sswap( n, vs( 1, i ), 1, vs( 1, i+1 ), 1 )
+                          end if
+                          a( i, i+1 ) = a( i+1, i )
+                          a( i+1, i ) = zero
+                       end if
+                       inxt = i + 2
+                    end if
+                 end do loop_20
+              end if
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-ieval, 1,wi( ieval+1 ), max( n-ieval,&
+                         1 ), ierr )
+           end if
+           if( wantst .and. info==0 ) then
+              ! check if reordering successful
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = select( wr( i ), wi( i ) )
+                 if( wi( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           work( 1 ) = maxwrk
+           if( wantsv .or. wantsb ) then
+              iwork( 1 ) = sdim*(n-sdim)
+           else
+              iwork( 1 ) = 1
+           end if
+           return
+     end subroutine stdlib_sgeesx
+
+     !> SGEEV: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_sgeev( jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr,ldvr, work, lwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: vl(ldvl,*), vr(ldvr,*), wi(*), work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, itau, iwrk, k, lwork_trevc, maxwrk, &
+                     minwrk, nout
+           real(sp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -9
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_shseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = 2*n + n*stdlib_ilaenv( 1, 'SGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'SORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_shseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_strevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else if( wantvr ) then
+                    minwrk = 4*n
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'SORGHR', ' ', n, 1, n,&
+                               -1 ) )
+                    call stdlib_shseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                    call stdlib_strevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr, n, nout,&
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    maxwrk = max( maxwrk, 4*n )
+                 else
+                    minwrk = 3*n
+                    call stdlib_shseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                    hswork = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + 1, n + hswork )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_slascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (workspace: need n)
+           ibal = 1
+           call stdlib_sgebal( 'B', n, a, lda, ilo, ihi, work( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (workspace: need 3*n, prefer 2*n+n*nb)
+           itau = ibal + n
+           iwrk = itau + n
+           call stdlib_sgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_slacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_slacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_slacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (workspace: need n+1, prefer n+hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( 'E', 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_shseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 4*n, prefer n + n + 2*n*nb)
+              call stdlib_strevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (workspace: need n)
+              call stdlib_sgebak( 'B', 'L', n, ilo, ihi, work( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_snrm2( n, vl( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_slapy2( stdlib_snrm2( n, vl( 1, i ), 1 ),stdlib_snrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_sscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_isamax( n, work( iwrk ), 1 )
+                    call stdlib_slartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_srot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (workspace: need n)
+              call stdlib_sgebak( 'B', 'R', n, ilo, ihi, work( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_snrm2( n, vr( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_slapy2( stdlib_snrm2( n, vr( 1, i ), 1 ),stdlib_snrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_sscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( iwrk+k-1 ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_isamax( n, work( iwrk ), 1 )
+                    call stdlib_slartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_srot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info>0 ) then
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgeev
+
+     !> SGEEVX: computes for an N-by-N real nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate-transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_sp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_sgeevx( balanc, jobvl, jobvr, sense, n, a, lda, wr, wi,vl, ldvl, vr, ldvr, &
+               ilo, ihi, scale, abnrm,rconde, rcondv, work, lwork, iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(sp), intent(out) :: abnrm
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: rconde(*), rcondv(*), scale(*), vl(ldvl,*), vr(ldvr,*), wi(*),&
+                      work(*), wr(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(sp) :: anrm, bignum, cs, cscale, eps, r, scl, smlnum, sn
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ).or. &
+                     stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) )then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_shseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'SGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_strevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_shseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vl, ldvl,work, -1, &
+                              info )
+                 else if( wantvr ) then
+                    call stdlib_strevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_shseqr( 'S', 'V', n, 1, n, a, lda, wr, wi, vr, ldvr,work, -1, &
+                              info )
+                 else
+                    if( wntsnn ) then
+                       call stdlib_shseqr( 'E', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    else
+                       call stdlib_shseqr( 'S', 'N', n, 1, n, a, lda, wr, wi, vr,ldvr, work, -1, &
+                                 info )
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.wntsnn )minwrk = max( minwrk, n*n+6*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.wntsnn )maxwrk = max( maxwrk, n*n + 6*n )
+                 else
+                    minwrk = 3*n
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )minwrk = max( minwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'SORGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( ( .not.wntsnn ) .and. ( .not.wntsne ) )maxwrk = max( maxwrk, n*n + 6*n )
+                              
+                    maxwrk = max( maxwrk, 3*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_slange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_slascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_sgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_slange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (workspace: need 2*n, prefer n+n*nb)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_sgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_slacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate orthogonal matrix in vl
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vl, ldvl,work( iwrk ), &
+                        lwork-iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_slacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_slacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate orthogonal matrix in vr
+              ! (workspace: need 2*n-1, prefer n+(n-1)*nb)
+              call stdlib_sorghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( 'S', 'V', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (workspace: need 1, prefer hswork (see comments) )
+              iwrk = itau
+              call stdlib_shseqr( job, 'N', n, ilo, ihi, a, lda, wr, wi, vr, ldvr,work( iwrk ), &
+                        lwork-iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_shseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (workspace: need 3*n, prefer n + 2*n*nb)
+              call stdlib_strevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (workspace: need n*n+6*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_strsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, iwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_sgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_snrm2( n, vl( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vl( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_slapy2( stdlib_snrm2( n, vl( 1, i ), 1 ),stdlib_snrm2( n, &
+                              vl( 1, i+1 ), 1 ) )
+                    call stdlib_sscal( n, scl, vl( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vl( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vl( k, i )**2 + vl( k, i+1 )**2
+                    end do
+                    k = stdlib_isamax( n, work, 1 )
+                    call stdlib_slartg( vl( k, i ), vl( k, i+1 ), cs, sn, r )
+                    call stdlib_srot( n, vl( 1, i ), 1, vl( 1, i+1 ), 1, cs, sn )
+                    vl( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_sgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 if( wi( i )==zero ) then
+                    scl = one / stdlib_snrm2( n, vr( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vr( 1, i ), 1 )
+                 else if( wi( i )>zero ) then
+                    scl = one / stdlib_slapy2( stdlib_snrm2( n, vr( 1, i ), 1 ),stdlib_snrm2( n, &
+                              vr( 1, i+1 ), 1 ) )
+                    call stdlib_sscal( n, scl, vr( 1, i ), 1 )
+                    call stdlib_sscal( n, scl, vr( 1, i+1 ), 1 )
+                    do k = 1, n
+                       work( k ) = vr( k, i )**2 + vr( k, i+1 )**2
+                    end do
+                    k = stdlib_isamax( n, work, 1 )
+                    call stdlib_slartg( vr( k, i ), vr( k, i+1 ), cs, sn, r )
+                    call stdlib_srot( n, vr( 1, i ), 1, vr( 1, i+1 ), 1, cs, sn )
+                    vr( k, i+1 ) = zero
+                 end if
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-info, 1, wr( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              call stdlib_slascl( 'G', 0, 0, cscale, anrm, n-info, 1, wi( info+1 ),max( n-info, 1 &
+                        ), ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_slascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wr, n,ierr )
+                 call stdlib_slascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, wi, n,ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgeevx
+
+     !> SGEJSV: computes the singular value decomposition (SVD) of a real M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^t,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+     !> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+     !> SGEJSV can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_sgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,work, lwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: sva(n), u(ldu,*), v(ldv,*), work(lwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           ! Local Scalars 
+           real(sp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lsvec, l2aber, l2kill, &
+                     l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           ! Intrinsic Functions 
+           intrinsic :: abs,log,max,min,float,nint,sign,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec  .or. stdlib_lsame( jobu, 'N' ) .or.stdlib_lsame( jobu, 'W' )) )&
+                      then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.stdlib_lsame( jobv, 'W' )) &
+                     .or. ( jracc .and. (.not.lsvec) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( l2tran .or. stdlib_lsame( jobt, 'N' ) ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else if ( (.not.(lsvec .or. rsvec .or. errest).and.(lwork < max(7,4*n+1,2*m+n))) .or.(&
+           .not.(lsvec .or. rsvec) .and. errest .and.(lwork < max(7,4*n+n*n,2*m+n))) .or.(lsvec &
+           .and. (.not.rsvec) .and. (lwork < max(7,2*m+n,4*n+1))).or.(rsvec .and. (.not.lsvec) &
+           .and. (lwork < max(7,2*m+n,4*n+1))).or.(lsvec .and. rsvec .and. (.not.jracc) .and.(&
+           lwork<max(2*m+n,6*n+2*n*n))).or. (lsvec .and. rsvec .and. jracc .and.lwork<max(2*m+n,&
+                     4*n+n*n,2*n+n*n+6)))then
+              info = - 17
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'SGEJSV', - info )
+              return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:3) = 0
+              work(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_slamch() does not fail on the target architecture.
+           epsln = stdlib_slamch('EPSILON')
+           sfmin = stdlib_slamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_slamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=sp)*real(n,KIND=sp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_slassq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'SGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_sscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_slaset( 'G', m, n1, zero, one, u, ldu )
+              if ( rsvec ) call stdlib_slaset( 'G', n, n,  zero, one, v, ldv )
+              work(1) = one
+              work(2) = one
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_slascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_slacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                    call stdlib_sgeqrf( m, n, u,ldu, work, work(n+1),lwork-n,ierr )
+                    call stdlib_sorgqr( m,n1,1, u,ldu,work,work(n+1),lwork-n,ierr )
+                    call stdlib_scopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = one
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              work(1) = one / scalem
+              work(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              if ( errest ) work(3) = one
+              if ( lsvec .and. rsvec ) then
+                 work(4) = one
+                 work(5) = one
+              end if
+              if ( l2tran ) then
+                 work(6) = zero
+                 work(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           l2tran = l2tran .and. ( m == n )
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^t and a^t * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_slassq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_slassq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    work(m+n+p)  = xsc * scalem
+                    work(n+p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, work(n+p) )
+                    if (work(n+p) /= zero) aatmin = min(aatmin,work(n+p))
+                 end do
+              else
+                 do p = 1, m
+                    work(m+n+p) = scalem*abs( a(p,stdlib_isamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, work(m+n+p) )
+                    aatmin = min( aatmin, work(m+n+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^t  would be  better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_slassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=sp))
+              ! now, sva().^2/trace(a^t * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^t * a.  do the same with the
+              ! diagonal of a * a^t, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^t and a^t * a have the
+              ! same trace.
+              entrat = zero
+              do p = n+1, n+m
+                 big1 = ( ( work(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=sp))
+              ! analyze the entropies and decide a or a^t. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^t is better than a, transpose a.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    do q = p + 1, n
+                        temp1 = a(q,p)
+                       a(q,p) = a(p,q)
+                       a(p,q) = temp1
+                    end do
+                 end do
+                 do p = 1, n
+                    work(m+n+p) = sva(p)
+                    sva(p)      = work(n+p)
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_sgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_sgesvj will compute them. so, in that case,
+           ! one should use stdlib_sgesvj instead of stdlib_sgejsv.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=sp) )
+           call stdlib_slascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_slascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_sgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_slaset( 'A', m, 1, zero, zero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              do p = 1, m - 1
+                 q = stdlib_isamax( m-p+1, work(m+n+p), 1 ) + p - 1
+                 iwork(2*n+p) = q
+                 if ( p /= q ) then
+                    temp1       = work(m+n+p)
+                    work(m+n+p) = work(m+n+q)
+                    work(m+n+q) = temp1
+                 end if
+              end do
+              call stdlib_slaswp( n, a, lda, 1, m-1, iwork(2*n+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_sgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in sgeqpx from toms # 782). good results will be obtained using
+           ! sgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_sgeqp3 improves overall performance of stdlib_sgejsv.
+           ! a * p1 = q1 * [ r1^t 0]^t:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_sgeqp3( m,n,a,lda, iwork,work, work(n+1),lwork-n, ierr )
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_sgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=sp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=sp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_slacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_sscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    call stdlib_spocon( 'U', n, v, ldv, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_slacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_sscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_spocon( 'U', n, u, ldu, one, temp1,work(n+1), iwork(2*n+m+1), &
+                              ierr )
+                 else
+                    call stdlib_slacpy( 'U', n, n, a, lda, work(n+1), n )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_sscal( p, one/temp1, work(n+(p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+                    call stdlib_spocon( 'U', n, work(n+1), n, one, temp1,work(n+n*n+1), iwork(2*n+&
+                              m+1), ierr )
+                 end if
+                 sconda = one / sqrt(temp1)
+                 ! sconda is an estimate of sqrt(||(r^t * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_scopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+              end do
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=sp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'U', nr-1,nr-1, zero,zero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_sgeqrf( n,nr, a,lda, work, work(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_scopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 end do
+              end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=sp)
+                    do q = 1, nr
+                       temp1 = xsc*abs(a(q,q))
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = sign( &
+                                    temp1, a(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'U', nr-1, nr-1, zero, zero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_sgesvj( 'L', 'NOU', 'NOV', nr, nr, a, lda, sva,n, v, ldv, work, &
+                           lwork, info )
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_scopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 call stdlib_slaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_sgesvj( 'L','U','N', n, nr, v,ldv, sva, nr, a,lda,work, lwork, info )
+                           
+                 scalem  = work(1)
+                 numrank = nint(work(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_slaset( 'LOWER', nr-1, nr-1, zero, zero, a(2,1), lda )
+                 call stdlib_sgelqf( nr, n, a, lda, work, work(n+1), lwork-n, ierr)
+                 call stdlib_slacpy( 'LOWER', nr, nr, a, lda, v, ldv )
+                 call stdlib_slaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_sgeqrf( nr, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_scopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                 end do
+                 call stdlib_slaset( 'UPPER', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 call stdlib_sgesvj( 'LOWER', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, work(n+1), &
+                           lwork-n, info )
+                 scalem  = work(n+1)
+                 numrank = nint(work(n+2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_slaset( 'A',n-nr, nr, zero,zero, v(nr+1,1),   ldv )
+                    call stdlib_slaset( 'A',nr, n-nr, zero,zero, v(1,nr+1),   ldv )
+                    call stdlib_slaset( 'A',n-nr,n-nr,zero,one, v(nr+1,nr+1), ldv )
+                 end if
+              call stdlib_sormlq( 'LEFT', 'TRANSPOSE', n, n, nr, a, lda, work,v, ldv, work(n+1), &
+                        lwork-n, ierr )
+              end if
+              do p = 1, n
+                 call stdlib_scopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+              end do
+              call stdlib_slacpy( 'ALL', n, n, a, lda, v, ldv )
+              if ( transp ) then
+                 call stdlib_slacpy( 'ALL', n, n, v, ldv, u, ldu )
+              end if
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_scopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+              end do
+              call stdlib_slaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_sgeqrf( n, nr, u, ldu, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              do p = 1, nr - 1
+                 call stdlib_scopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+              end do
+              call stdlib_slaset( 'UPPER', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              call stdlib_sgesvj( 'LOWER', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, work(n+1), &
+                        lwork-n, info )
+              scalem  = work(n+1)
+              numrank = nint(work(n+2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_slaset( 'A',  m-nr, nr,zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_slaset( 'A',nr, n1-nr, zero, zero, u(1,nr+1), ldu )
+                    call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                 end if
+              end if
+              call stdlib_sormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+              if ( rowpiv )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_snrm2( m, u(1,p), 1 )
+                 call stdlib_sscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_slacpy( 'ALL', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_sgejsv.
+                 do p = 1, nr
+                    call stdlib_scopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       temp1 = xsc*abs( v(q,q) )
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    sign( temp1, v(p,q) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_slacpy( 'L', nr, nr, v, ldv, work(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_snrm2(nr-p+1,work(2*n+(p-1)*nr+p),1)
+                    call stdlib_sscal(nr-p+1,one/temp1,work(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_spocon('LOWER',nr,work(2*n+1),nr,one,temp1,work(2*n+nr*nr+1),iwork(m+&
+                           2*n+1),ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=sp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=sp))
+                 cond_ok = sqrt(real(nr,KIND=sp))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^t = q2 * r2
+                    call stdlib_sgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_slacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_scopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                    end do
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_sgeqp3
+                    ! should be replaced with eg. call sgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^t * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_sgeqp3( n, nr, v, ldv, iwork(n+1), work(n+1),work(2*n+1), lwork-&
+                              2*n, ierr )
+      ! *               call stdlib_sgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_slacpy( 'A', n, nr, v, ldv, work(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             temp1 = xsc * min(abs(v(p,p)),abs(v(q,q)))
+                             v(p,q) = - sign( temp1, v(q,p) )
+                          end do
+                       end do
+                    else
+                       call stdlib_slaset( 'L',nr-1,nr-1,zero,zero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_sgelqf( nr, nr, v, ldv, work(2*n+n*nr+1),work(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_slacpy( 'L',nr,nr,v,ldv,work(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_snrm2( p, work(2*n+n*nr+nr+p), nr )
+                       call stdlib_sscal( p, one/temp1, work(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_spocon( 'L',nr,work(2*n+n*nr+nr+1),nr,one,temp1,work(2*n+n*nr+nr+&
+                              nr*nr+1),iwork(m+2*n+1),ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_slacpy( 'U', nr, nr, v, ldv, work(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       temp1 = xsc * v(q,q)
+                       do p = 1, q - 1
+                          ! v(p,q) = - sign( temp1, v(q,p) )
+                          v(p,q) = - sign( temp1, v(p,q) )
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_sgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u,ldu,work(2*n+n*nr+nr+1),&
+                              lwork-2*n-n*nr-nr,info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_scopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_sscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_sgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_strsm( 'L','U','N','N', nr,nr,one, a,lda, v,ldv )
+                    else
+                       ! .. r1 is well conditioned, but non-square. transpose(r2)
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_sgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_strsm('L','U','T','N',nr,nr,one,work(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                         call stdlib_slaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                         call stdlib_slaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                         call stdlib_slaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_sormqr('L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                                 n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+       ! :)           .. the input matrix a is very likely a relative of
+                    ! the kahan matrix :)
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^t*v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_sgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_scopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_sscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_strsm('L','U','N','N',nr,nr,one,work(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_slaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_slaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_slaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_sormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_sgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_sgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_sgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, work(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr, info )
+                    scalem  = work(2*n+n*nr+nr+1)
+                    numrank = nint(work(2*n+n*nr+nr+2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_slaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                       call stdlib_slaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                       call stdlib_slaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+                    end if
+                    call stdlib_sormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_sormlq( 'L', 'T', nr, nr, nr, work(2*n+1), n,work(2*n+n*nr+1), u, &
+                              ldu, work(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          work(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = work(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=sp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_snrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_sscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_slaset( 'A', m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                    if ( nr < n1 ) then
+                       call stdlib_slaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                       call stdlib_slaset('A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_sormqr( 'LEFT', 'NO_TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=sp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_snrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_sscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_slacpy( 'UPPER', n, n, a, lda, work(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       temp1 = xsc * work( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                          work(n+(q-1)*n+p)=-sign(temp1,work(n+(p-1)*n+q))
+                       end do
+                    end do
+                 else
+                    call stdlib_slaset( 'LOWER',n-1,n-1,zero,zero,work(n+2),n )
+                 end if
+                 call stdlib_sgesvj( 'UPPER', 'U', 'N', n, n, work(n+1), n, sva,n, u, ldu, work(n+&
+                           n*n+1), lwork-n-n*n, info )
+                 scalem  = work(n+n*n+1)
+                 numrank = nint(work(n+n*n+2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_scopy( n, work(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_sscal( n, sva(p), work(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_strsm( 'LEFT', 'UPPER', 'NOTRANS', 'NO UD', n, n,one, a, lda, work(n+&
+                           1), n )
+                 do p = 1, n
+                    call stdlib_scopy( n, work(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=sp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_snrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_sscal( n, xsc, &
+                              v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_slaset( 'A',  m-n, n, zero, zero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_slaset( 'A',n,  n1-n, zero, zero,  u(1,n+1),ldu )
+                       call stdlib_slaset( 'A',m-n,n1-n, zero, one,u(n+1,n+1),ldu )
+                    end if
+                 end if
+                 call stdlib_sormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                           lwork-n, ierr )
+                 temp1 = sqrt(real(m,KIND=sp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_snrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_sscal( m, xsc, &
+                              u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values. since that is not always the case, ...
+              do p = 1, nr
+                 call stdlib_scopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    temp1 = xsc*abs( v(q,q) )
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = sign(&
+                                  temp1, v(p,q) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_slaset( 'U', nr-1, nr-1, zero, zero, v(1,2), ldv )
+              end if
+              call stdlib_sgeqrf( n, nr, v, ldv, work(n+1), work(2*n+1),lwork-2*n, ierr )
+              call stdlib_slacpy( 'L', n, nr, v, ldv, work(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_scopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       temp1 = xsc * min(abs(u(p,p)),abs(u(q,q)))
+                       u(p,q) = - sign( temp1, u(q,p) )
+                    end do
+                 end do
+              else
+                 call stdlib_slaset('U', nr-1, nr-1, zero, zero, u(1,2), ldu )
+              end if
+              call stdlib_sgesvj( 'L', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, work(2*n+n*nr+1), &
+                        lwork-2*n-n*nr, info )
+              scalem  = work(2*n+n*nr+1)
+              numrank = nint(work(2*n+n*nr+2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_slaset( 'A',n-nr,nr,zero,zero,v(nr+1,1),ldv )
+                 call stdlib_slaset( 'A',nr,n-nr,zero,zero,v(1,nr+1),ldv )
+                 call stdlib_slaset( 'A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_sormqr( 'L','N',n,n,nr,work(2*n+1),n,work(n+1),v,ldv,work(2*n+n*nr+nr+1)&
+                        ,lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=sp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       work(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = work(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_snrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_sscal( n, xsc, &
+                              v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_slaset( 'A',  m-nr, nr, zero, zero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_slaset( 'A',nr,  n1-nr, zero, zero,  u(1,nr+1),ldu )
+                    call stdlib_slaset( 'A',m-nr,n1-nr, zero, one,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_sormqr( 'LEFT', 'NO TR', m, n1, n, a, lda, work, u,ldu, work(n+1), &
+                        lwork-n, ierr )
+                 if ( rowpiv )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(2*n+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^t
+                 do p = 1, n
+                    call stdlib_sswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_slascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           work(1) = uscal2 * scalem
+           work(2) = uscal1
+           if ( errest ) work(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              work(4) = condr1
+              work(5) = condr2
+           end if
+           if ( l2tran ) then
+              work(6) = entra
+              work(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           return
+     end subroutine stdlib_sgejsv
+
+     !> SGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_sgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond,rank, work, lwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, maxmn, &
+                     maxwrk, minmn, minwrk, mm, mnthr, nlvl, nwork, smlsiz, wlalsd
+           real(sp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: int,log,max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              liwork = 1
+              if( minmn>0 ) then
+                 smlsiz = stdlib_ilaenv( 9, 'SGELSD', ' ', 0, 0, 0, 0 )
+                 mnthr = stdlib_ilaenv( 6, 'SGELSD', ' ', m, n, nrhs, -1 )
+                 nlvl = max( int( log( real( minmn,KIND=sp) / real( smlsiz + 1,KIND=sp) ) /log( &
+                           two ),KIND=ilp) + 1, 0 )
+                 liwork = 3*minmn*nlvl + 11*minmn
+                 mm = m
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns.
+                    mm = n
+                    maxwrk = max( maxwrk, n + n*stdlib_ilaenv( 1, 'SGEQRF', ' ', m,n, -1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, n + nrhs*stdlib_ilaenv( 1, 'SORMQR', 'LT',m, nrhs, n, -&
+                              1 ) )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined.
+                    maxwrk = max( maxwrk, 3*n + ( mm + n )*stdlib_ilaenv( 1,'SGEBRD', ' ', mm, n, &
+                              -1, -1 ) )
+                    maxwrk = max( maxwrk, 3*n + nrhs*stdlib_ilaenv( 1, 'SORMBR','QLT', mm, nrhs, &
+                              n, -1 ) )
+                    maxwrk = max( maxwrk, 3*n + ( n - 1 )*stdlib_ilaenv( 1,'SORMBR', 'PLN', n, &
+                              nrhs, n, -1 ) )
+                    wlalsd = 9*n + 2*n*smlsiz + 8*n*nlvl + n*nrhs +( smlsiz + 1 )**2
+                    maxwrk = max( maxwrk, 3*n + wlalsd )
+                    minwrk = max( 3*n + mm, 3*n + nrhs, 3*n + wlalsd )
+                 end if
+                 if( n>m ) then
+                    wlalsd = 9*m + 2*m*smlsiz + 8*m*nlvl + m*nrhs +( smlsiz + 1 )**2
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                                 ! than rows.
+                       maxwrk = m + m*stdlib_ilaenv( 1, 'SGELQF', ' ', m, n, -1,-1 )
+                       maxwrk = max( maxwrk, m*m + 4*m + 2*m*stdlib_ilaenv( 1,'SGEBRD', ' ', m, m,&
+                                  -1, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + nrhs*stdlib_ilaenv( 1,'SORMBR', 'QLT', m,&
+                                  nrhs, m, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + ( m - 1 )*stdlib_ilaenv( 1,'SORMBR', &
+                                 'PLN', m, nrhs, m, -1 ) )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + nrhs*stdlib_ilaenv( 1, 'SORMLQ','LT', n, nrhs, m,&
+                                  -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + wlalsd )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                       maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                    else
+                       ! path 2 - remaining underdetermined cases.
+                       maxwrk = 3*m + ( n + m )*stdlib_ilaenv( 1, 'SGEBRD', ' ', m,n, -1, -1 )
+                                 
+                       maxwrk = max( maxwrk, 3*m + nrhs*stdlib_ilaenv( 1, 'SORMBR','QLT', m, nrhs,&
+                                  n, -1 ) )
+                       maxwrk = max( maxwrk, 3*m + m*stdlib_ilaenv( 1, 'SORMBR','PLN', n, nrhs, m,&
+                                  -1 ) )
+                       maxwrk = max( maxwrk, 3*m + wlalsd )
+                    end if
+                    minwrk = max( 3*m + nrhs, 3*m + m, 3*m + wlalsd )
+                 end if
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_slamch( 'P' )
+           sfmin = stdlib_slamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_slange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_slaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_slaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_slange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_slascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_slascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure certain entries of b are zero.
+           if( m<n )call stdlib_slaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns.
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_sormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_slaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                 end if
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              nwork = itaup + n
+              ! bidiagonalize r in a.
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_sgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_slalsd( 'U', smlsiz, n, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_sormbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m, wlalsd ) ) &
+                     then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs, 4*m+m*lda+&
+                        wlalsd ) )ldwork = lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_slacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_slaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize l in work(il).
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_sgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_slalsd( 'U', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_sormbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_slaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_sormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              nwork = itaup + m
+              ! bidiagonalize a.
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_slalsd( 'L', smlsiz, m, nrhs, s, work( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_sormbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           return
+     end subroutine stdlib_sgelsd
+
+     !> SGELSS: computes the minimum norm solution to a real linear least
+     !> squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_sgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: s(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bdspac, bl, chunk, i, iascl, ibscl, ie, il, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_sgeqrf, lwork_sormqr, lwork_sgebrd, lwork_sormbr, lwork_sorgbr, &
+                     lwork_sormlq
+           real(sp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'SGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_sgeqrf
+                    call stdlib_sgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_sgeqrf=dum(1)
+                    ! compute space needed for stdlib_sormqr
+                    call stdlib_sormqr( 'L', 'T', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_sormqr=dum(1)
+                    mm = n
+                    maxwrk = max( maxwrk, n + lwork_sgeqrf )
+                    maxwrk = max( maxwrk, n + lwork_sormqr )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute workspace needed for stdlib_sbdsqr
+                    bdspac = max( 1, 5*n )
+                    ! compute space needed for stdlib_sgebrd
+                    call stdlib_sgebrd( mm, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, info &
+                              )
+                    lwork_sgebrd=dum(1)
+                    ! compute space needed for stdlib_sormbr
+                    call stdlib_sormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_sormbr=dum(1)
+                    ! compute space needed for stdlib_sorgbr
+                    call stdlib_sorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_sorgbr=dum(1)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 3*n + lwork_sgebrd )
+                    maxwrk = max( maxwrk, 3*n + lwork_sormbr )
+                    maxwrk = max( maxwrk, 3*n + lwork_sorgbr )
+                    maxwrk = max( maxwrk, bdspac )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = max( 3*n + mm, 3*n + nrhs, bdspac )
+                    maxwrk = max( minwrk, maxwrk )
+                 end if
+                 if( n>m ) then
+                    ! compute workspace needed for stdlib_sbdsqr
+                    bdspac = max( 1, 5*m )
+                    minwrk = max( 3*m+nrhs, 3*m+n, bdspac )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_sgebrd
+                       call stdlib_sgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_sgebrd=dum(1)
+                       ! compute space needed for stdlib_sormbr
+                       call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_sormbr=dum(1)
+                       ! compute space needed for stdlib_sorgbr
+                       call stdlib_sorgbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_sorgbr=dum(1)
+                       ! compute space needed for stdlib_sormlq
+                       call stdlib_sormlq( 'L', 'T', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_sormlq=dum(1)
+                       ! compute total workspace needed
+                       maxwrk = m + m*stdlib_ilaenv( 1, 'SGELQF', ' ', m, n, -1,-1 )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_sgebrd )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_sormbr )
+                       maxwrk = max( maxwrk, m*m + 4*m + lwork_sorgbr )
+                       maxwrk = max( maxwrk, m*m + m + bdspac )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_sormlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_sgebrd
+                       call stdlib_sgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, &
+                                 info )
+                       lwork_sgebrd=dum(1)
+                       ! compute space needed for stdlib_sormbr
+                       call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_sormbr=dum(1)
+                       ! compute space needed for stdlib_sorgbr
+                       call stdlib_sorgbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_sorgbr=dum(1)
+                       maxwrk = 3*m + lwork_sgebrd
+                       maxwrk = max( maxwrk, 3*m + lwork_sormbr )
+                       maxwrk = max( maxwrk, 3*m + lwork_sorgbr )
+                       maxwrk = max( maxwrk, bdspac )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_slamch( 'P' )
+           sfmin = stdlib_slamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, work )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_slaset( 'F', max( m, n ), nrhs, zero, zero, b, ldb )
+              call stdlib_slaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', m, nrhs, b, ldb, work )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_slascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_slascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (workspace: need 2*n, prefer n+n*nb)
+                 call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (workspace: need n+nrhs, prefer n+nrhs*nb)
+                 call stdlib_sormqr( 'L', 'T', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_slaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+              end if
+              ie = 1
+              itauq = ie + n
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (workspace: need 3*n+mm, prefer 3*n+(mm+n)*nb)
+              call stdlib_sgebrd( mm, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (workspace: need 3*n+nrhs, prefer 3*n+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+              call stdlib_sorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (workspace: need bdspac)
+              call stdlib_sbdsqr( 'U', n, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_srscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_slaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_slacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_sgemm( 'T', 'N', n, bl, n, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_slacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_sgemv( 'T', n, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_scopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (workspace: need 2*m, prefer m+m*nb)
+              call stdlib_sgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_slacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_slaset( 'U', m-1, m-1, zero, zero, work( il+ldwork ),ldwork )
+              ie = il + ldwork*m
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (workspace: need m*m+5*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_sgebrd( m, m, work( il ), ldwork, s, work( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (workspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (workspace: need m*m+5*m-1, prefer m*m+4*m+(m-1)*nb)
+              call stdlib_sorgbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of l in work(il) and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need m*m+m+bdspac)
+              call stdlib_sbdsqr( 'U', m, m, 0, nrhs, s, work( ie ), work( il ),ldwork, a, lda, b,&
+                         ldb, work( iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_srscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_slaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = ie
+              ! multiply b by right singular vectors of l in work(il)
+              ! (workspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_sgemm( 'T', 'N', m, nrhs, m, one, work( il ), ldwork,b, ldb, zero, &
+                           work( iwork ), ldb )
+                 call stdlib_slacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_sgemm( 'T', 'N', m, bl, m, one, work( il ), ldwork,b( 1, i ), ldb,&
+                               zero, work( iwork ), m )
+                    call stdlib_slacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_sgemv( 'T', m, m, one, work( il ), ldwork, b( 1, 1 ),1, zero, work( &
+                           iwork ), 1 )
+                 call stdlib_scopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_slaset( 'F', n-m, nrhs, zero, zero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (workspace: need m+nrhs, prefer m+nrhs*nb)
+              call stdlib_sormlq( 'L', 'T', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = ie + m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+              call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), work( &
+                        iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (workspace: need 3*m+nrhs, prefer 3*m+nrhs*nb)
+              call stdlib_sormbr( 'Q', 'L', 'T', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (workspace: need 4*m, prefer 3*m+m*nb)
+              call stdlib_sorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              iwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (workspace: need bdspac)
+              call stdlib_sbdsqr( 'L', m, n, 0, nrhs, s, work( ie ), a, lda, dum,1, b, ldb, work( &
+                        iwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_srscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_slaset( 'F', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (workspace: need n, prefer n*nrhs)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_sgemm( 'T', 'N', n, nrhs, m, one, a, lda, b, ldb, zero,work, ldb )
+                           
+                 call stdlib_slacpy( 'F', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_sgemm( 'T', 'N', n, bl, m, one, a, lda, b( 1, i ),ldb, zero, work,&
+                               n )
+                    call stdlib_slacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_sgemv( 'T', m, n, one, a, lda, b, 1, zero, work, 1 )
+                 call stdlib_scopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_slascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_slascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgelss
+
+     !> SGESDD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and right singular
+     !> vectors.  If singular vectors are desired, it uses a
+     !> divide-and-conquer algorithm.
+     !> The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**T, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_sgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
+                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
+           integer(ilp) :: lwork_sgebrd_mn, lwork_sgebrd_mm, lwork_sgebrd_nn, lwork_sgelqf_mn, &
+           lwork_sgeqrf_mn, lwork_sorgbr_p_mm, lwork_sorgbr_q_nn, lwork_sorglq_mn, &
+           lwork_sorglq_nn, lwork_sorgqr_mm, lwork_sorgqr_mn, lwork_sormbr_prt_mm, &
+           lwork_sormbr_qln_mm, lwork_sormbr_prt_mn, lwork_sormbr_qln_mn, lwork_sormbr_prt_nn, &
+                     lwork_sormbr_qln_nn
+           real(sp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              bdspac = 0
+              mnthr  = int( minmn*11.0_sp / 6.0_sp,KIND=ilp)
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_sbdsdc
+                 if( wntqn ) then
+                    ! stdlib_sbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_sp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*n
+                 else
+                    bdspac = 3*n*n + 4*n
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_sgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_sgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sgebrd( n, n, dum(1), n, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_sgebrd_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sgeqrf( m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_sgeqrf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sorgbr( 'Q', n, n, n, dum(1), n, dum(1), dum(1), -1,ierr )
+                 lwork_sorgbr_q_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sorgqr( m, m, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_sorgqr_mm = int( dum(1),KIND=ilp)
+                 call stdlib_sorgqr( m, n, n, dum(1), m, dum(1), dum(1), -1, ierr )
+                 lwork_sorgqr_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'P', 'R', 'T', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'Q', 'L', 'N', n, n, n, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_qln_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'Q', 'L', 'N', m, n, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_qln_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'Q', 'L', 'N', m, m, n, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       wrkbl = n + lwork_sgeqrf_mn
+                       wrkbl = max( wrkbl, 3*n + lwork_sgebrd_nn )
+                       maxwrk = max( wrkbl, bdspac + n )
+                       minwrk = bdspac + n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_sgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + 2*n*n
+                       minwrk = bdspac + 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_sgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = bdspac + n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_sgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_sgebrd_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + n*n
+                       minwrk = n*n + max( 3*n + bdspac, n + m )
+                    end if
+                 else
+                    ! path 5 (m >= n, but not much larger)
+                    wrkbl = 3*n + lwork_sgebrd_mn
+                    if( wntqn ) then
+                       ! path 5n (m >= n, jobz='n')
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqo ) then
+                       ! path 5o (m >= n, jobz='o')
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*n + max( m, n*n + bdspac )
+                    else if( wntqs ) then
+                       ! path 5s (m >= n, jobz='s')
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_mn )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    else if( wntqa ) then
+                       ! path 5a (m >= n, jobz='a')
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*n + lwork_sormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*n + bdspac )
+                       minwrk = 3*n + max( m, bdspac )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_sbdsdc
+                 if( wntqn ) then
+                    ! stdlib_sbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_sp)
+                    ! keep 7*n for backwards compatibility.
+                    bdspac = 7*m
+                 else
+                    bdspac = 3*m*m + 4*m
+                 end if
+                 ! compute space preferred for each routine
+                 call stdlib_sgebrd( m, n, dum(1), m, dum(1), dum(1), dum(1),dum(1), dum(1), -1, &
+                           ierr )
+                 lwork_sgebrd_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sgebrd( m, m, a, m, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_sgebrd_mm = int( dum(1),KIND=ilp)
+                 call stdlib_sgelqf( m, n, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_sgelqf_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_sorglq_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sorglq( m, n, m, a, m, dum(1), dum(1), -1, ierr )
+                 lwork_sorglq_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sorgbr( 'P', m, m, m, a, n, dum(1), dum(1), -1, ierr )
+                 lwork_sorgbr_p_mm = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'P', 'R', 'T', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_prt_mm = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'P', 'R', 'T', m, n, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_prt_mn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'P', 'R', 'T', n, n, m, dum(1), n,dum(1), dum(1), n, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_prt_nn = int( dum(1),KIND=ilp)
+                 call stdlib_sormbr( 'Q', 'L', 'N', m, m, m, dum(1), m,dum(1), dum(1), m, dum(1), &
+                           -1, ierr )
+                 lwork_sormbr_qln_mm = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       wrkbl = m + lwork_sgelqf_mn
+                       wrkbl = max( wrkbl, 3*m + lwork_sgebrd_mm )
+                       maxwrk = max( wrkbl, bdspac + m )
+                       minwrk = bdspac + m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_sgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_sorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_sgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + 2*m*m
+                       minwrk = bdspac + 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_sgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_sorglq_mn )
+                       wrkbl = max( wrkbl, 3*m + lwork_sgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = bdspac + m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_sgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_sorglq_nn )
+                       wrkbl = max( wrkbl, 3*m + lwork_sgebrd_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_mm )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*m
+                       minwrk = m*m + max( 3*m + bdspac, m + n )
+                    end if
+                 else
+                    ! path 5t (n > m, but not much larger)
+                    wrkbl = 3*m + lwork_sgebrd_mn
+                    if( wntqn ) then
+                       ! path 5tn (n > m, jobz='n')
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqo ) then
+                       ! path 5to (n > m, jobz='o')
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_mn )
+                       wrkbl = max( wrkbl, 3*m + bdspac )
+                       maxwrk = wrkbl + m*n
+                       minwrk = 3*m + max( n, m*m + bdspac )
+                    else if( wntqs ) then
+                       ! path 5ts (n > m, jobz='s')
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_mn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    else if( wntqa ) then
+                       ! path 5ta (n > m, jobz='a')
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_qln_mm )
+                       wrkbl = max( wrkbl, 3*m + lwork_sormbr_prt_nn )
+                       maxwrk = max( wrkbl, 3*m + bdspac )
+                       minwrk = 3*m + max( n, bdspac )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = stdlib_sroundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = sqrt( stdlib_slamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, dum )
+           if( stdlib_sisnan( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n [tau] + n    [work]
+                    ! workspace: prefer n [tau] + n*nb [work]
+                    call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out below r
+                    call stdlib_slaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_sgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + n
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   n [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz = 'o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is ldwrkr by n
+                    if( lwork >= lda*n + n*n + 3*n + bdspac ) then
+                       ldwrkr = lda
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n - bdspac ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_slacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_slaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_sgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! work(iu) is n by n
+                    iu = nwork
+                    nwork = iu + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! and vt by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), n, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    ! workspace: prefer m*n [r] + 3*n [e, tauq, taup] + n*n [u]
+                    do i = 1, m, ldwrkr
+                       chunk = min( m - i + 1, ldwrkr )
+                       call stdlib_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu ), &
+                                 n, zero, work( ir ),ldwrkr )
+                       call stdlib_slacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_slacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_slaset( 'L', n - 1, n - 1, zero, zero, work(ir+1),ldwrkr )
+                    ! generate q in a
+                    ! workspace: need   n*n [r] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_sgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagoal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! workspace: need   n*n [r]
+                    call stdlib_slacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_sgemm( 'N', 'N', m, n, n, one, a, lda, work( ir ),ldwrkr, zero, u,&
+                               ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! workspace: need   n*n [u] + n [tau] + n    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! workspace: need   n*n [u] + n [tau] + m    [work]
+                    ! workspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce r in a, zeroing out other entries
+                    call stdlib_slaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ), lda )
+                    ie = itau
+                    itauq = ie + n
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n      [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + 2*n*nb [work]
+                    call stdlib_sgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
+                              idum, work( nwork ), iwork,info )
+                    ! overwrite work(iu) by left singular vectors of r and vt
+                    ! by right singular vectors of r
+                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! workspace: need   n*n [u]
+                    call stdlib_sgemm( 'N', 'N', m, n, n, one, u, ldu, work( iu ),ldwrku, zero, a,&
+                               lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_slacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 5 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! workspace: need   3*n [e, tauq, taup] + m        [work]
+                 ! workspace: prefer 3*n [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >= n, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', n, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5o (m >= n, jobz='o')
+                    iu = nwork
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                       nwork = iu + ldwrku*n
+                       call stdlib_slaset( 'F', m, n, zero, zero, work( iu ),ldwrku )
+                       ! ir is unused; silence compile warnings
+                       ir = -1
+                    else
+                       ! work( iu ) is n by n
+                       ldwrku = n
+                       nwork = iu + ldwrku*n
+                       ! work(ir) is ldwrkr by n
+                       ir = nwork
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in work(iu) and computing right
+                    ! singular vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ),ldwrku, vt, ldvt, &
+                              dum, idum, work( nwork ),iwork, info )
+                    ! overwrite vt by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*n + bdspac ) then
+                       ! path 5o-fast
+                       ! overwrite work(iu) by left singular vectors of a
+                       ! workspace: need   3*n [e, tauq, taup] + m*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + m*n [u] + n*nb [work]
+                       call stdlib_sormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy left singular vectors of a from work(iu) to a
+                       call stdlib_slacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 5o-slow
+                       ! generate q in a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
+                       call stdlib_sorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by left singular vectors of
+                       ! bidiagonal matrix in work(iu), storing result in
+                       ! work(ir) and copying to a
+                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + nb*n [r]
+                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + m*n  [r]
+                       do i = 1, m, ldwrkr
+                          chunk = min( m - i + 1, ldwrkr )
+                          call stdlib_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( iu )&
+                                    , ldwrku, zero,work( ir ), ldwrkr )
+                          call stdlib_slacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_slaset( 'F', m, n, zero, zero, u, ldu )
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + n*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*n [e, tauq, taup] + bdspac
+                    call stdlib_slaset( 'F', m, m, zero, zero, u, ldu )
+                    call stdlib_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of u to identity matrix
+                    if( m>n ) then
+                       call stdlib_slaset( 'F', m - n, m - n, zero, one, u(n+1,n+1),ldu )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*n [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*n [e, tauq, taup] + m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m [tau] + m [work]
+                    ! workspace: prefer m [tau] + m*nb [work]
+                    call stdlib_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! zero out above l
+                    call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_sgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    nwork = ie + m
+                    ! perform bidiagonal svd, computing singular values only
+                    ! workspace: need   m [e] + bdspac
+                    call stdlib_sbdsdc( 'U', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ! work(il)  is m by m; it is later resized to m by chunk for gemm
+                    il = ivt + m*m
+                    if( lwork >= m*n + m*m + 3*m + bdspac ) then
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ldwrkl = m
+                       chunk = ( lwork - m*m ) / m
+                    end if
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_slacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_slaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_sgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u, and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), m, dum, &
+                              idum, work( nwork ),iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), m,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! workspace: need   m*m [vt] + m*m [l]
+                    ! workspace: prefer m*m [vt] + m*n [l]
+                    ! at this point, l is resized as m by chunk.
+                    do i = 1, n, chunk
+                       blk = min( n - i + 1, chunk )
+                       call stdlib_sgemm( 'N', 'N', m, blk, m, one, work( ivt ), m,a( 1, i ), lda,&
+                                  zero, work( il ), ldwrkl )
+                       call stdlib_slacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_slacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_slaset( 'U', m - 1, m - 1, zero, zero,work( il + ldwrkl ), ldwrkl &
+                              )
+                    ! generate q in a
+                    ! workspace: need   m*m [l] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(iu).
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_sgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork - nwork + 1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of l and vt
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(il) by
+                    ! q in a, storing result in vt
+                    ! workspace: need   m*m [l]
+                    call stdlib_slacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_sgemm( 'N', 'N', m, n, m, one, work( il ), ldwrkl,a, lda, zero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! workspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    call stdlib_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
+                              1, ierr )
+                    call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! workspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! workspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork - &
+                              nwork + 1, ierr )
+                    ! produce l in a, zeroing out other entries
+                    call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = itau
+                    itauq = ie + m
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + m      [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup] + 2*m*nb [work]
+                    call stdlib_sgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of l and work(ivt)
+                    ! by right singular vectors of l
+                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup]+ m    [work]
+                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup]+ m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! workspace: need   m*m [vt]
+                    call stdlib_sgemm( 'N', 'N', m, n, m, one, work( ivt ), ldwkvt,vt, ldvt, zero,&
+                               a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_slacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 5t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! workspace: need   3*m [e, tauq, taup] + n        [work]
+                 ! workspace: prefer 3*m [e, tauq, taup] + (m+n)*nb [work]
+                 call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n > m, jobz='n')
+                    ! perform bidiagonal svd, only computing singular values
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_sbdsdc( 'L', 'N', m, s, work( ie ), dum, 1, dum, 1,dum, idum, &
+                              work( nwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 5to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_slaset( 'F', m, n, zero, zero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                       ! il is unused; silence compile warnings
+                       il = -1
+                    else
+                       ! work( ivt ) is m by m
+                       nwork = ivt + ldwkvt*m
+                       il = nwork
+                       ! work(il) is m by chunk
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in work(ivt)
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + bdspac
+                    call stdlib_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
+                              dum, idum,work( nwork ), iwork, info )
+                    ! overwrite u by left singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    if( lwork >= m*n + 3*m + bdspac ) then
+                       ! path 5to-fast
+                       ! overwrite work(ivt) by left singular vectors of a
+                       ! workspace: need   3*m [e, tauq, taup] + m*n [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*n [vt] + m*nb [work]
+                       call stdlib_sormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork - nwork + 1, ierr )
+                       ! copy right singular vectors of a from work(ivt) to a
+                       call stdlib_slacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 5to-slow
+                       ! generate p**t in a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
+                       call stdlib_sorgbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork - nwork + 1, ierr )
+                       ! multiply q in a by right singular vectors of
+                       ! bidiagonal matrix in work(ivt), storing result in
+                       ! work(il) and copying to a
+                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m*nb [l]
+                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*n  [l]
+                       do i = 1, n, chunk
+                          blk = min( n - i + 1, chunk )
+                          call stdlib_sgemm( 'N', 'N', m, blk, m, one, work( ivt ),ldwkvt, a( 1, &
+                                    i ), lda, zero,work( il ), m )
+                          call stdlib_slacpy( 'F', m, blk, work( il ), m, a( 1, i ),lda )
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 5ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_slaset( 'F', m, n, zero, zero, vt, ldvt )
+                    call stdlib_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + m    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + m*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 else if( wntqa ) then
+                    ! path 5ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in u and computing right singular
+                    ! vectors of bidiagonal matrix in vt
+                    ! workspace: need   3*m [e, tauq, taup] + bdspac
+                    call stdlib_slaset( 'F', n, n, zero, zero, vt, ldvt )
+                    call stdlib_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
+                              work( nwork ), iwork,info )
+                    ! set the right corner of vt to identity matrix
+                    if( n>m ) then
+                       call stdlib_slaset( 'F', n-m, n-m, zero, one, vt(m+1,m+1),ldvt )
+                    end if
+                    ! overwrite u by left singular vectors of a and vt
+                    ! by right singular vectors of a
+                    ! workspace: need   3*m [e, tauq, taup] + n    [work]
+                    ! workspace: prefer 3*m [e, tauq, taup] + n*nb [work]
+                    call stdlib_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                    call stdlib_sormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork - nwork + 1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_sroundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_sgesdd
+
+     !> SGESVD: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+     !> V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**T, not V.
+
+     subroutine stdlib_sgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: bdspac, blk, chunk, i, ie, ierr, ir, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_sgeqrf, lwork_sorgqr_n, lwork_sorgqr_m, lwork_sgebrd, &
+                     lwork_sorgbr_p, lwork_sorgbr_q, lwork_sgelqf, lwork_sorglq_n, lwork_sorglq_m
+           real(sp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(sp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! compute space needed for stdlib_sbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'SGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*n
+                 ! compute space needed for stdlib_sgeqrf
+                 call stdlib_sgeqrf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_sgeqrf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sorgqr
+                 call stdlib_sorgqr( m, n, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_sorgqr_n = int( dum(1),KIND=ilp)
+                 call stdlib_sorgqr( m, m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_sorgqr_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sgebrd
+                 call stdlib_sgebrd( n, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_sgebrd = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sorgbr p
+                 call stdlib_sorgbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_sorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sorgbr q
+                 call stdlib_sorgbr( 'Q', n, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                 lwork_sorgbr_q = int( dum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_sgeqrf
+                       maxwrk = max( maxwrk, 3*n+lwork_sgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 3*n+lwork_sorgbr_p )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*n, bdspac )
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_n )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n+n )
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_n )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n+n )
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_n )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_n )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_n )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_m )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_m )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_sgeqrf
+                       wrkbl = max( wrkbl, n+lwork_sorgqr_m )
+                       wrkbl = max( wrkbl, 3*n+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, 3*n+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = n*n + wrkbl
+                       minwrk = max( 3*n+m, bdspac )
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_sgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_sgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*n + lwork_sgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_sorgbr( 'Q', m, n, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_sorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n+lwork_sorgbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_sorgbr( 'Q', m, m, n, a, lda, dum(1),dum(1), -1, ierr )
+                       lwork_sorgbr_q = int( dum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 3*n+lwork_sorgbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                      maxwrk = max( maxwrk, 3*n+lwork_sorgbr_p )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*n+m, bdspac )
+                 end if
+              else if( minmn>0 ) then
+                 ! compute space needed for stdlib_sbdsqr
+                 mnthr = stdlib_ilaenv( 6, 'SGESVD', jobu // jobvt, m, n, 0, 0 )
+                 bdspac = 5*m
+                 ! compute space needed for stdlib_sgelqf
+                 call stdlib_sgelqf( m, n, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_sgelqf = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sorglq
+                 call stdlib_sorglq( n, n, m, dum(1), n, dum(1), dum(1), -1, ierr )
+                 lwork_sorglq_n = int( dum(1),KIND=ilp)
+                 call stdlib_sorglq( m, n, m, a, lda, dum(1), dum(1), -1, ierr )
+                 lwork_sorglq_m = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sgebrd
+                 call stdlib_sgebrd( m, m, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                           
+                 lwork_sgebrd = int( dum(1),KIND=ilp)
+                  ! compute space needed for stdlib_sorgbr p
+                 call stdlib_sorgbr( 'P', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_sorgbr_p = int( dum(1),KIND=ilp)
+                 ! compute space needed for stdlib_sorgbr q
+                 call stdlib_sorgbr( 'Q', m, m, m, a, n, dum(1),dum(1), -1, ierr )
+                 lwork_sorgbr_q = int( dum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_sgelqf
+                       maxwrk = max( maxwrk, 3*m+lwork_sgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 3*m+lwork_sorgbr_q )
+                       maxwrk = max( maxwrk, bdspac )
+                       minwrk = max( 4*m, bdspac )
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_m )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n+m )
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_m )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n+m )
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_m )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_m )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                       maxwrk = max( maxwrk, minwrk )
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_m )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_n )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_n )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_sgelqf
+                       wrkbl = max( wrkbl, m+lwork_sorglq_n )
+                       wrkbl = max( wrkbl, 3*m+lwork_sgebrd )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_p )
+                       wrkbl = max( wrkbl, 3*m+lwork_sorgbr_q )
+                       wrkbl = max( wrkbl, bdspac )
+                       maxwrk = m*m + wrkbl
+                       minwrk = max( 3*m+n, bdspac )
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_sgebrd( m, n, a, lda, s, dum(1), dum(1),dum(1), dum(1), -1, ierr )
+                              
+                    lwork_sgebrd = int( dum(1),KIND=ilp)
+                    maxwrk = 3*m + lwork_sgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_sorgbr p
+                      call stdlib_sorgbr( 'P', m, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_sorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m+lwork_sorgbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_sorgbr( 'P', n, n, m, a, n, dum(1),dum(1), -1, ierr )
+                      lwork_sorgbr_p = int( dum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 3*m+lwork_sorgbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 3*m+lwork_sorgbr_q )
+                    end if
+                    maxwrk = max( maxwrk, bdspac )
+                    minwrk = max( 3*m+n, bdspac )
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = sqrt( stdlib_slamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_slascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_slascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (workspace: need 2*n, prefer n+n*nb)
+                    call stdlib_sgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_slaset( 'L', n-1, n-1, zero, zero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = ie + n
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                    call stdlib_sgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                       call stdlib_sorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    iwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'U', n, ncvt, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                              work( iwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_slacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n+n )+lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n+n )+n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_slacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_slaset( 'L', n-1, n-1, zero, zero, work( ir+1 ),ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                       call stdlib_sgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                       call stdlib_sorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (workspace: need n*n+bdspac)
+                       call stdlib_sbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum, 1,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n+2*n, prefer n*n+m*n+n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_slacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (workspace: need 3*n+m, prefer 3*n+(m+n)*nb)
+                       call stdlib_sgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (workspace: need 4*n, prefer 3*n+n*nb)
+                       call stdlib_sorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_sbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum, 1,a, lda, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n+n )+lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n+n )+n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n-n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_slacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_slaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                       call stdlib_sgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_slacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                       call stdlib_sorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need n*n+4*n-1, prefer n*n+3*n+(n-1)*nb)
+                       call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (workspace: need n*n+bdspac)
+                       call stdlib_sbdsqr( 'U', n, n, n, 0, s, work( ie ), vt, ldvt,work( ir ), &
+                                 ldwrkr, dum, 1,work( iwork ), info )
+                       iu = ie + n
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (workspace: need n*n+2*n, prefer n*n+m*n+n)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1 ),lda, work( ir )&
+                                    , ldwrkr, zero,work( iu ), ldwrku )
+                          call stdlib_slacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (workspace: need 2*n, prefer n+n*nb)
+                       call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_slacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_slaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (workspace: need 2*n, prefer n+n*nb)
+                       call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + n
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                       call stdlib_sgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                       call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                       call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (workspace: need bdspac)
+                       call stdlib_sbdsqr( 'U', n, n, m, 0, s, work( ie ), vt, ldvt,a, lda, dum, &
+                                 1, work( iwork ), info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, a, lda,work( ir ), ldwrkr, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_slaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n+4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n+4*n, prefer 2*n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n+4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (workspace: need n*n)
+                          call stdlib_slacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_slaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sorgqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n+4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, a, lda,work( iu ), ldwrku, &
+                                    zero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sorgqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_slaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (workspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (workspace: need n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, 0, n, 0, s, work( ie ), dum,1, work( ir ), &
+                                    ldwrkr, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, u, ldu,work( ir ), ldwrkr, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_slacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n+m, prefer n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_slaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, 0, m, 0, s, work( ie ), dum,1, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need 2*n*n+n+m, prefer 2*n*n+n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*n*n+4*n,
+                                      ! prefer 2*n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*n*n+4*n, prefer 2*n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*n*n+4*n-1,
+                                      ! prefer 2*n*n+3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (workspace: need 2*n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, n, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, dum, 1, work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_slacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_slacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n+m, prefer n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_slaset( 'L', n-1, n-1, zero, zero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, m, 0, s, work( ie ), a,lda, u, ldu, dum, &
+                                    1, work( iwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 4*n, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'L', n-1, n-1, zero, zero,work( iu+1 ), ldwrku )
+                                    
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (workspace: need n*n+4*n, prefer n*n+3*n+n*nb)
+                          call stdlib_sorgbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need n*n+4*n-1,
+                                      ! prefer n*n+3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (workspace: need n*n+bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, n, 0, s, work( ie ), vt,ldvt, work( iu ),&
+                                     ldwrku, dum, 1,work( iwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (workspace: need n*n)
+                          call stdlib_sgemm( 'N', 'N', m, n, n, one, u, ldu,work( iu ), ldwrku, &
+                                    zero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_slacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (workspace: need 2*n, prefer n+n*nb)
+                          call stdlib_sgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (workspace: need n+m, prefer n+m*nb)
+                          call stdlib_sorgqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_slacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_slaset( 'L', n-1, n-1, zero, zero,vt( 2, 1 ), ldvt &
+                                    )
+                          ie = itau
+                          itauq = ie + n
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (workspace: need 4*n, prefer 3*n+2*n*nb)
+                          call stdlib_sgebrd( n, n, vt, ldvt, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (workspace: need 3*n+m, prefer 3*n+m*nb)
+                          call stdlib_sormbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                          call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          iwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', n, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = ie + n
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (workspace: need 3*n+m, prefer 3*n+(m+n)*nb)
+                 call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 3*n+ncu, prefer 3*n+ncu*nb)
+                    call stdlib_slacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_sorgbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                    call stdlib_slacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_sorgbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n, prefer 3*n+n*nb)
+                    call stdlib_sorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*n-1, prefer 3*n+(n-1)*nb)
+                    call stdlib_sorgbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'U', n, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (workspace: need 2*m, prefer m+m*nb)
+                    call stdlib_sgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ), lda )
+                    ie = 1
+                    itauq = ie + m
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                    call stdlib_sgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                              work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (workspace: need 4*m, prefer 3*m+m*nb)
+                       call stdlib_sorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    iwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'U', m, 0, nru, 0, s, work( ie ), dum, 1, a,lda, dum, 1, &
+                              work( iwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_slacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n+m )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n+m )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_slacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                       call stdlib_sgebrd( m, m, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
+                                 work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (workspace: need m*m+4*m-1, prefer m*m+3*m+(m-1)*nb)
+                       call stdlib_sorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m+bdspac)
+                       call stdlib_sbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, dum,&
+                                  1, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m+2*m, prefer m*m+m*n+m)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_sgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_slacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+                       call stdlib_sgebrd( m, n, a, lda, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (workspace: need 4*m, prefer 3*m+m*nb)
+                       call stdlib_sorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_sbdsqr( 'L', m, n, 0, 0, s, work( ie ), a, lda,dum, 1, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n+m )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n+m )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m-m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_slacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_slaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                       call stdlib_sgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_slacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (workspace: need m*m+4*m-1, prefer m*m+3*m+(m-1)*nb)
+                       call stdlib_sorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need m*m+4*m, prefer m*m+3*m+m*nb)
+                       call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (workspace: need m*m+bdspac)
+                       call stdlib_sbdsqr( 'U', m, m, m, 0, s, work( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, dum, 1,work( iwork ), info )
+                       iu = ie + m
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (workspace: need m*m+2*m, prefer m*m+m*n+m))
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_sgemm( 'N', 'N', m, blk, m, one, work( ir ),ldwrkr, a( 1, i &
+                                    ), lda, zero,work( iu ), ldwrku )
+                          call stdlib_slacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (workspace: need 2*m, prefer m+m*nb)
+                       call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_slacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_slaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (workspace: need 2*m, prefer m+m*nb)
+                       call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = itau
+                       itauq = ie + m
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                       call stdlib_sgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( itaup &
+                                 ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                       call stdlib_sormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (workspace: need 4*m, prefer 3*m+m*nb)
+                       call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       iwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (workspace: need bdspac)
+                       call stdlib_sbdsqr( 'U', m, n, m, 0, s, work( ie ), a, lda,u, ldu, dum, 1, &
+                                 work( iwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m+4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m+4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m+4*m, prefer 2*m*m+3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (workspace: need m*m)
+                          call stdlib_slacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (workspace: need 4*m, prefer 3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, compute left
+                          ! singular vectors of a in a and compute right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ! generate q in a
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sorglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m+4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, a, lda, &
+                                    zero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sorglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n+m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( ir+ldwrkr ), ldwrkr &
+                                    )
+                          ! generate q in vt
+                          ! (workspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( ir ), ldwrkr, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (workspace: need m*m+4*m-1,
+                                      ! prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (workspace: need m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, 0, 0, s, work( ie ),work( ir ), ldwrkr, &
+                                    dum, 1, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( ir ),ldwrkr, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_slacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m+n, prefer m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, 0, 0, s, work( ie ), vt,ldvt, dum, 1, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n+m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need 2*m*m+m+n, prefer 2*m*m+m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (workspace: need 2*m*m+4*m,
+                                      ! prefer 2*m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need 2*m*m+4*m-1,
+                                      ! prefer 2*m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (workspace: need 2*m*m+4*m, prefer 2*m*m+3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (workspace: need 2*m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, dum, 1, work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_slacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_slacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m+n, prefer m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, a( 1, 2 ),lda )
+                          ! bidiagonalize l in a
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, a, lda, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (workspace: need 4*m, prefer 3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, a, lda, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n+m, 4*m, bdspac ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero,work( iu+ldwrku ), ldwrku &
+                                    )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, work( iu ), ldwrku, s,work( ie ), work( itauq &
+                                    ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_slacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+(m-1)*nb)
+                          call stdlib_sorgbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need m*m+4*m, prefer m*m+3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (workspace: need m*m+bdspac)
+                          call stdlib_sbdsqr( 'U', m, m, m, 0, s, work( ie ),work( iu ), ldwrku, &
+                                    u, ldu, dum, 1,work( iwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (workspace: need m*m)
+                          call stdlib_sgemm( 'N', 'N', m, n, m, one, work( iu ),ldwrku, vt, ldvt, &
+                                    zero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_slacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (workspace: need 2*m, prefer m+m*nb)
+                          call stdlib_sgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (workspace: need m+n, prefer m+n*nb)
+                          call stdlib_sorglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_slacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_slaset( 'U', m-1, m-1, zero, zero, u( 1, 2 ),ldu )
+                          ie = itau
+                          itauq = ie + m
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (workspace: need 4*m, prefer 3*m+2*m*nb)
+                          call stdlib_sgebrd( m, m, u, ldu, s, work( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (workspace: need 3*m+n, prefer 3*m+n*nb)
+                          call stdlib_sormbr( 'P', 'L', 'T', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (workspace: need 4*m, prefer 3*m+m*nb)
+                          call stdlib_sorgbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          iwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (workspace: need bdspac)
+                          call stdlib_sbdsqr( 'U', m, n, m, 0, s, work( ie ), vt,ldvt, u, ldu, &
+                                    dum, 1, work( iwork ),info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = ie + m
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (workspace: need 3*m+n, prefer 3*m+(m+n)*nb)
+                 call stdlib_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (workspace: need 4*m-1, prefer 3*m+(m-1)*nb)
+                    call stdlib_slacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_sorgbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (workspace: need 3*m+nrvt, prefer 3*m+nrvt*nb)
+                    call stdlib_slacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_sorgbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m-1, prefer 3*m+(m-1)*nb)
+                    call stdlib_sorgbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (workspace: need 4*m, prefer 3*m+m*nb)
+                    call stdlib_sorgbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 iwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, u, ldu, dum,&
+                               1, work( iwork ), info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), a, lda,u, ldu, dum, &
+                              1, work( iwork ), info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (workspace: need bdspac)
+                    call stdlib_sbdsqr( 'L', m, ncvt, nru, 0, s, work( ie ), vt,ldvt, a, lda, dum,&
+                               1, work( iwork ), info )
+                 end if
+              end if
+           end if
+           ! if stdlib_sbdsqr failed to converge, copy unconverged superdiagonals
+           ! to work( 2:minmn )
+           if( info/=0 ) then
+              if( ie>2 ) then
+                 do i = 1, minmn - 1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+              if( ie<2 ) then
+                 do i = minmn - 1, 1, -1
+                    work( i+1 ) = work( i+ie-1 )
+                 end do
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_slascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+              if( anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_slascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1, work( 2 ),minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_sgesvd
+
+     !> SGESVDQ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_sgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,work, lwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lwork
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: u(ldu,*), v(ldv,*), work(*)
+           real(sp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, iwoff, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_sgelqf, lwrk_sgesvd, lwrk_sgesvd2, lwrk_sgeqp3, &
+           lwrk_sgeqrf, lwrk_sormlq, lwrk_sormqr, lwrk_sormqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lworq, lworq2, lwunlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(sp) :: big, epsln, rtmp, sconda, sfmin
+           ! Local Arrays
+           real(sp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              if ( conda ) then
+                 iminwrk = max( 1, n + m - 1 + n )
+              else
+                 iminwrk = max( 1, n + m - 1 )
+              end if
+              rminwrk = max( 2, m )
+           else
+              if ( conda ) then
+                 iminwrk = max( 1, n + n )
+              else
+                 iminwrk = max( 1, n )
+              end if
+              rminwrk = 2
+           end if
+           lquery = (liwork == -1 .or. lwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Sgeqp3 Of An M X N Matrix
+              lwqp3 = 3 * n + 1
+              ! Minimal Workspace Length For Stdlib_Sormqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lworq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lworq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Spocon Of An N X N Matrix
+              lwcon = 3 * n
+              ! Stdlib_Sgesvd Of An N X N Matrix
+              lwsvd = max( 5 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_sgeqp3( m, n, a, lda, iwork, rdummy, rdummy, -1,ierr )
+                  lwrk_sgeqp3 = int( rdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_sormqr( 'L', 'N', m, n, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_sormqr = int( rdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_sormqr( 'L', 'N', m, m, n, a, lda, rdummy, u,ldu, rdummy, -1, &
+                                ierr )
+                      lwrk_sormqr = int( rdummy(1),KIND=ilp)
+                  else
+                      lwrk_sormqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec )) then
+                  ! Minimal And Optimal Sizes Of The Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_sgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                ierr )
+                      lwrk_sgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_sgeqp3, n+lwcon, lwrk_sgesvd )
+                      else
+                         optwrk = max( n+lwrk_sgeqp3, lwrk_sgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lworq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lworq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_sgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     else
+                        call stdlib_sgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                     end if
+                     lwrk_sgesvd = int( rdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_sgeqp3, lwcon, lwrk_sgesvd,lwrk_sormqr )
+                     else
+                         optwrk = n + max( lwrk_sgeqp3, lwrk_sgesvd,lwrk_sormqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_sgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      else
+                          call stdlib_sgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -&
+                                    1, ierr )
+                      end if
+                      lwrk_sgesvd = int( rdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_sgeqp3, lwcon, lwrk_sgesvd )
+                      else
+                          optwrk = n + max( lwrk_sgeqp3, lwrk_sgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_sgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_sgesvd
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lworq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lworq2, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lworq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_sgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 5 * (n/2), 1 )
+                         lwunlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lwunlq, lworq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_sgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_sgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_sgeqp3,lwrk_sgesvd,lwrk_sormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_sgeqrf(n,n/2,u,ldu,rdummy,rdummy,-1,ierr)
+                            lwrk_sgeqrf = int( rdummy(1),KIND=ilp)
+                            call stdlib_sgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, rdummy,&
+                                       -1, ierr )
+                            lwrk_sgesvd2 = int( rdummy(1),KIND=ilp)
+                            call stdlib_sormqr( 'R', 'C', n, n, n/2, u, ldu, rdummy,v, ldv, &
+                                      rdummy, -1, ierr )
+                            lwrk_sormqr2 = int( rdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_sgeqp3, n/2+lwrk_sgeqrf,n/2+lwrk_sgesvd2, n/2+&
+                                      lwrk_sormqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_sgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, rdummy, -1, &
+                                  ierr )
+                        lwrk_sgesvd = int( rdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_sgeqp3,lwrk_sgesvd,lwrk_sormqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_sgelqf(n/2,n,u,ldu,rdummy,rdummy,-1,ierr)
+                           lwrk_sgelqf = int( rdummy(1),KIND=ilp)
+                           call stdlib_sgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, rdummy,&
+                                      -1, ierr )
+                           lwrk_sgesvd2 = int( rdummy(1),KIND=ilp)
+                           call stdlib_sormlq( 'R', 'N', n, n, n/2, u, ldu, rdummy,v, ldv, rdummy,&
+                                     -1,ierr )
+                           lwrk_sormlq = int( rdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_sgeqp3, n/2+lwrk_sgelqf,n/2+lwrk_sgesvd2, n/2+&
+                                     lwrk_sormlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               work(1) = optwrk
+               work(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_slamch('O')
+           ascaled = .false.
+           iwoff = 1
+           if ( rowprm ) then
+                 iwoff = m
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_icamax(n,a(p,1),lda)) )
+                     ! [[stdlib_slange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_slange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = -8
+                         call stdlib_xerbla( 'SGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_isamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_slaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_slaset('G', m, n, zero, one, u, ldu)
+                    if ( wntua ) call stdlib_slaset('G', m, m, zero, one, u, ldu)
+                    if ( wntva ) call stdlib_slaset('G', n, n, zero, one, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_slaset( 'G', n, 1, zero, zero, work, n )
+                        call stdlib_slaset( 'G', m, n, zero,  one, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=sp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_slascl('G',0,0,sqrt(real(m,KIND=sp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_slaswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_slange( 'M', m, n, a, lda, rdummy )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = -8
+                    call stdlib_xerbla( 'SGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=sp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_slascl('G',0,0, sqrt(real(m,KIND=sp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_sgeqp3( m, n, a, lda, iwork, work, work(n+1), lwork-n,ierr )
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_slamch('E')
+           sfmin = stdlib_slamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=sp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_slamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_slacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_snrm2( p, v(1,p), 1 )
+                       call stdlib_sscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_spocon( 'U', nr, v, ldv, one, rtmp,work, iwork(n+iwoff), ierr &
+                                  )
+                    else
+                        call stdlib_spocon( 'U', nr, v, ldv, one, rtmp,work(n+1), iwork(n+iwoff), &
+                                  ierr )
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**t = [a](1:nr,1:n)**t
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**t and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    do q = p + 1, n
+                       a(q,p) = a(p,q)
+                       if ( q <= nr ) a(p,q) = zero
+                    end do
+                 end do
+                 call stdlib_sgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_slaset( 'L', nr-1,nr-1, zero,zero, a(2,1), lda )
+                 call stdlib_sgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, work, lwork, info &
+                           )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_sgesvd to r**t
+                  ! .. copy r**t into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_slaset( 'U', nr-1,nr-1, zero,zero, u(1,2), ldu )
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_sgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Sgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_slacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_slaset( 'L', nr-1, nr-1, zero, zero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_sgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_slaset('A', m-nr, nr, zero, zero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_slaset( 'A',nr,n1-nr,zero,zero,u(1,nr+1), ldu )
+                     call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_sormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_sgesvd to r**t
+                  ! .. copy r**t into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = (a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_slaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_sgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_slapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_slaset('G', n, n-nr, zero, zero, v(1,nr+1), ldv)
+                     call stdlib_sgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, work(n+1), &
+                               lwork-n, info )
+                     do p = 1, n
+                        do q = p + 1, n
+                           rtmp   = v(q,p)
+                           v(q,p) = v(p,q)
+                           v(p,q) = rtmp
+                        end do
+                     end do
+                     call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Sgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_slacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_slaset( 'L', nr-1, nr-1, zero, zero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_sgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_slapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_slaset('G', n-nr, n, zero,zero, v(nr+1,1), ldv)
+                      call stdlib_sgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                lwork-n, info )
+                      call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the transposed matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_sgesvd to r**t [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = a(p,q)
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_slaset( 'U', nr-1,nr-1, zero,zero, v(1,2), ldv )
+                 ! .. the left singular vectors of r**t overwrite [v], the nr right
+                 ! singular vectors of r**t stored in [u](1:nr,1:nr) as transposed
+                    call stdlib_sgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                              lwork-n, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       do q = p + 1, nr
+                          rtmp   = v(q,p)
+                          v(q,p) = v(p,q)
+                          v(p,q) = rtmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = v(q,p)
+                           end do
+                        end do
+                    end if
+                    call stdlib_slapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        do q = p + 1, nr
+                           rtmp   = u(q,p)
+                           u(q,p) = u(p,q)
+                           u(p,q) = rtmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_slaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_slaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**t into [v] and overwrite [v] with the left singular
+                  ! vectors of r**t
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'sgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_slaset('U',nr-1,nr-1, zero,zero, v(1,2),ldv)
+                                  
+                        call stdlib_slaset('A',n,n-nr,zero,zero,v(1,nr+1),ldv)
+                        call stdlib_sgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, work(n+1), &
+                                  lwork-n, info )
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = v(q,p)
+                              v(q,p) = v(p,q)
+                              v(p,q) = rtmp
+                           end do
+                        end do
+                        call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           do q = p + 1, n
+                              rtmp   = u(q,p)
+                              u(q,p) = u(p,q)
+                              u(p,q) = rtmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_slaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_slaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_slaset('A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**t into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = a(p,q)
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_slaset('U',nr-1,nr-1,zero,zero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_sgeqrf( n, nr, u(1,nr+1), ldu, work(n+1),work(n+nr+1), lwork-&
+                                  n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = u(p,nr+q)
+                            end do
+                        end do
+                       call stdlib_slaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_sgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, work(n+nr+1)&
+                                 ,lwork-n-nr, info )
+                       call stdlib_slaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_slaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_slaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_sormqr('R','C', n, n, nr, u(1,nr+1), ldu,work(n+1),v,ldv,work(&
+                                 n+nr+1),lwork-n-nr,ierr)
+                       call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_slaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_slaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_sgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_slacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_slaset( 'L', nr-1,nr-1, zero,zero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_sgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                               lwork-n, info )
+                     call stdlib_slapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**t
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_slaset('A', m-nr,nr, zero,zero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_slaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'sgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_slacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_slaset('L', nr-1,nr-1, zero,zero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_slaset('A', n-nr,n, zero,zero, v(nr+1,1),ldv)
+                       call stdlib_sgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, work(n+1), &
+                                 lwork-n, info )
+                       call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the transposed matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_slaset('A',m-n,n,zero,zero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_slaset('A',n,n1-n,zero,zero,u(1,n+1),ldu)
+                             call stdlib_slaset( 'A',m-n,n1-n,zero,one,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_slacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_slaset('L',nr-1,nr-1,zero,zero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_sgelqf( nr, n, u(nr+1,1), ldu, work(n+1),work(n+nr+1), lwork-n-&
+                                 nr, ierr )
+                       call stdlib_slacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_slaset('U',nr-1,nr-1,zero,zero,v(1,2),ldv)
+                       call stdlib_sgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, work(n+nr+&
+                                 1), lwork-n-nr, info )
+                       call stdlib_slaset('A',n-nr,nr,zero,zero,v(nr+1,1),ldv)
+                       call stdlib_slaset('A',nr,n-nr,zero,zero,v(1,nr+1),ldv)
+                       call stdlib_slaset('A',n-nr,n-nr,zero,one,v(nr+1,nr+1),ldv)
+                       call stdlib_sormlq('R','N',n,n,nr,u(nr+1,1),ldu,work(n+1),v, ldv, work(n+&
+                                 nr+1),lwork-n-nr,ierr)
+                       call stdlib_slapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_slaset('A',m-nr,nr,zero,zero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_slaset('A',nr,n1-nr,zero,zero,u(1,nr+1),ldu)
+                          call stdlib_slaset( 'A',m-nr,n1-nr,zero,one,u(nr+1,nr+1), ldu )
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**t or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_sormqr( 'L', 'N', m, n1, n, a, lda, work, u,ldu, work(&
+                        n+1), lwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_slaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_slaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_slascl( 'G',0,0, one,sqrt(real(m,KIND=sp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_sgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_sgesvdq
+
+     !> SGESVJ: computes the singular value decomposition (SVD) of a real
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^t,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+     !> SGESVJ can sometimes compute tiny singular values and their singular vectors much
+     !> more accurately than other SVD routines, see below under Further Details.
+
+     pure subroutine stdlib_sgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), v(ldv,*), work(lwork)
+           real(sp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           ! Local Scalars 
+           real(sp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           large, mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, &
+                     temp1, theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Local Arrays 
+           real(sp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,float,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( work( 1 )<=one ) ) then
+              info = -12
+           else if( lwork<max( m+n, 6 ) ) then
+              info = -13
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGESVJ', -info )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^t * a(:,j)|/(||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = work( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=sp) )
+              else
+                 ctol = real( m,KIND=sp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_slamch() works properly on the target machine.)
+           epsln = stdlib_slamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_slamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+           big = stdlib_slamch( 'OVERFLOW' )
+           ! big         = one    / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=sp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=sp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'SGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_slaset( 'A', mvl, n, zero, one, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl = one / sqrt( real( m,KIND=sp)*real( n,KIND=sp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_slassq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'SGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_slassq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'SGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_slassq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'SGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl = one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_slaset( 'G', m, n, zero, one, a, lda )
+              work( 1 ) = one
+              work( 2 ) = zero
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_slascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              work( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 work( 2 ) = one
+              else
+                 work( 2 ) = zero
+              end if
+              work( 3 ) = zero
+              work( 4 ) = zero
+              work( 5 ) = zero
+              work( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=sp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( aapp*sqrt( real( n,KIND=sp) ) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=sp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_slascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl = temp1*skl
+           if( skl/=one ) then
+              call stdlib_slascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl = one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! a is represented in factored form a = a * diag(work), where diag(work)
+           ! is initialized to identity. work is updated during fast scaled
+           ! rotations.
+           do q = 1, n
+              work( q ) = one
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_sgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_sgesvj. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_sgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,work( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, work( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_sgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,work( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_sgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,work( n2+1 ), sva(&
+                  n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                            ierr )
+                 call stdlib_sgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,work( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_sgsvj0( jobv, m, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, work( n+1 ), lwork-n,ierr )
+                 call stdlib_sgsvj1( jobv, m, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, sfmin,&
+                            tol, 1, work( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_sgsvj0( jobv, n4, n4, a, lda, work, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, work( n+1 ), lwork-n,ierr )
+                 call stdlib_sgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, work( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, work( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_sgsvj1( jobv, n2, n2, n4, a, lda, work, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, work( n+1 ),lwork-n, ierr )
+                 call stdlib_sgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,work( n2+1 ), sva( n2+1 ),&
+                  mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,work( n+1 ), lwork-n, ierr )
+                            
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_sswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_sswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = work( p )
+                          work( p ) = work( q )
+                          work( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute stdlib_snrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_sdot(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_snrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_snrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_snrm2( m, a(1,p), 1 ) * work(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_snrm2( m, a( 1, p ), 1 )*work( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_slassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*work( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_sdot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_sdot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq ) / aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_saxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_saxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_scopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp, one, m,1, work( n+1 ), &
+                                                lda,ierr )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*work( p ) / work( q )
+                                      call stdlib_saxpy( m, temp1, work( n+1 ), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_slascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_snrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_slassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_snrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_slassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp,work( p ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_sdot( m, work( n+1 ), 1,a( 1, q ), 1 )*work( &
+                                                q ) / aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*work( &
+                                                p )*work( q ) /aaqq ) / aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq,work( q ), m, 1,work( n+&
+                                                1 ), lda, ierr )
+                                      aapq = stdlib_sdot( m, work( n+1 ), 1,a( 1, p ), 1 )*work( &
+                                                p ) / aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq ) / aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*work( p ) / work( q )
+                                         fastr( 4 ) = -t*work( q ) /work( p )
+                                         call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = work( p ) / work( q )
+                                         aqoap = work( q ) / work( p )
+                                         if( work( p )>=one ) then
+                                            if( work( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q )*cs
+                                               call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               work( p ) = work( p )*cs
+                                               work( q ) = work( q ) / cs
+                                            end if
+                                         else
+                                            if( work( q )>=one ) then
+                                               call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_saxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               work( p ) = work( p ) / cs
+                                               work( q ) = work( q )*cs
+                                            else
+                                               if( work( p )>=work( q ) )then
+                                                  call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  work( p ) = work( p )*cs
+                                                  work( q ) = work( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_saxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  work( p ) = work( p ) / cs
+                                                  work( q ) = work( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_scopy( m, a( 1, p ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( p ) / work( q )
+                                         call stdlib_saxpy( m, temp1, work( n+1 ),1, a( 1, q ), 1 &
+                                                   )
+                                         call stdlib_slascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_scopy( m, a( 1, q ), 1,work( n+1 ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, work( n+1 &
+                                                   ), lda,ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*work( q ) / work( p )
+                                         call stdlib_saxpy( m, temp1, work( n+1 ),1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_slascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_snrm2( m, a( 1, q ), 1 )*work( q )
+                                                   
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_slassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*work( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_snrm2( m, a( 1, p ), 1 )*work( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_slassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*work( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_snrm2( m, a( 1, n ), 1 )*work( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_slassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*work( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=sp) )*tol ) .and. ( real( n,&
+                        KIND=sp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = work( p )
+                 work( p ) = work( q )
+                 work( q ) = temp1
+                 call stdlib_sswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_sswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n2
+                 call stdlib_sscal( m, work( p ) / sva( p ), a( 1, p ), 1 )
+              end do
+           end if
+           ! scale the product of jacobi rotations (assemble the fast rotations).
+           if( rsvec ) then
+              if( applv ) then
+                 do p = 1, n
+                    call stdlib_sscal( mvl, work( p ), v( 1, p ), 1 )
+                 end do
+              else
+                 do p = 1, n
+                    temp1 = one / stdlib_snrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_sscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+              end if
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl ) ) ).or. ( ( skl<one ) .and. ( sva( &
+                     max( n2, 1 ) ) >( sfmin / skl ) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl = one
+           end if
+           work( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           work( 2 ) = real( n4,KIND=sp)
+           ! n4 is the number of computed nonzero singular values of a.
+           work( 3 ) = real( n2,KIND=sp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           work( 4 ) = real( i,KIND=sp)
+           ! i is the index of the last sweep before declaring convergence.
+           work( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           work( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_sgesvj
+
+     !> SGGES3: computes for a pair of N-by-N real nonsymmetric matrices (A,B),
+     !> the generalized eigenvalues, the generalized real Schur form (S,T),
+     !> optionally, the left and/or right matrices of Schur vectors (VSL and
+     !> VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**T, (VSL)*T*(VSR)**T )
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> quasi-triangular matrix S and the upper triangular matrix T.The
+     !> leading columns of VSL and VSR then form an orthonormal basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> SGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or both being zero.
+     !> A pair of matrices (S,T) is in generalized real Schur form if T is
+     !> upper triangular with non-negative diagonal and S is block upper
+     !> triangular with 1-by-1 and 2-by-2 blocks.  1-by-1 blocks correspond
+     !> to real generalized eigenvalues, while 2-by-2 blocks of S will be
+     !> "standardized" by making the corresponding elements of T have the
+     !> form:
+     !> [  a  0  ]
+     !> [  0  b  ]
+     !> and the pair of corresponding 2-by-2 blocks in S and T will have a
+     !> complex conjugate pair of generalized eigenvalues.
+
+     subroutine stdlib_sgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alphar, &
+               alphai, beta, vsl, ldvsl,vsr, ldvsr, work, lwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), &
+                     work(*)
+           ! Function Arguments 
+           procedure(stdlib_selctg_s) :: selctg
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, lst2sl, &
+                     wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, ip, iright, irows, &
+                     itau, iwrk, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, safmax, safmin, &
+                     smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(sp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           else if( lwork<6*n+16 .and. .not.lquery ) then
+              info = -19
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_sgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 6*n+16, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_sormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_sorgqr( n, n, n, vsl, ldvsl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+              end if
+              call stdlib_sgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_slaqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                        beta, vsl, ldvsl, vsr, ldvsr,work, -1, 0, ierr )
+              lwkopt = max( lwkopt, 2*n+int( work( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_stgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alphar, alphai, &
+                 beta, vsl, ldvsl, vsr, ldvsr,sdim, pvsl, pvsr, dif, work, -1, idum, 1,ierr )
+                           
+                 lwkopt = max( lwkopt, 2*n+int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           safmin = stdlib_slamch( 'S' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           smlnum = sqrt( safmin ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_sggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_slaset( 'FULL', n, n, zero, one, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_sgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk, ierr )
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_slaqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vsl, ldvsl, vsr, ldvsr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 40
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           sdim = 0
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl ) then
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n,ierr )
+                 call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n,ierr )
+              end if
+              if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alphar( i ), alphai( i ), beta( i ) )
+              end do
+              call stdlib_stgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alphar,alphai, beta, &
+              vsl, ldvsl, vsr, ldvsr, sdim, pvsl,pvsr, dif, work( iwrk ), lwork-iwrk+1, idum, 1,&
+                        ierr )
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_sggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_sggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, &
+                     vsr, ldvsr, ierr )
+           ! check if unscaling would cause over/underflow, if so, rescale
+           ! (alphar(i),alphai(i),beta(i)) so beta(i) is on the order of
+           ! b(i,i) and alphar(i) and alphai(i) are on the order of a(i,i)
+           if( ilascl )then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                    if( ( alphar( i )/safmax )>( anrmto/anrm ) .or.( safmin/alphar( i ) )>( &
+                              anrm/anrmto ) ) then
+                       work( 1 ) = abs( a( i, i )/alphar( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    else if( ( alphai( i )/safmax )>( anrmto/anrm ) .or.( safmin/alphai( i ) )>( &
+                              anrm/anrmto ) ) then
+                       work( 1 ) = abs( a( i, i+1 )/alphai( i ) )
+                       beta( i ) = beta( i )*work( 1 )
+                       alphar( i ) = alphar( i )*work( 1 )
+                       alphai( i ) = alphai( i )*work( 1 )
+                    end if
+                 end if
+              end do
+           end if
+           if( ilbscl )then
+              do i = 1, n
+                 if( alphai( i )/=zero ) then
+                     if( ( beta( i )/safmax )>( bnrmto/bnrm ) .or.( safmin/beta( i ) )>( &
+                               bnrm/bnrmto ) ) then
+                        work( 1 ) = abs(b( i, i )/beta( i ))
+                        beta( i ) = beta( i )*work( 1 )
+                        alphar( i ) = alphar( i )*work( 1 )
+                        alphai( i ) = alphai( i )*work( 1 )
+                     end if
+                  end if
+              end do
+           end if
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_slascl( 'H', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              lst2sl = .true.
+              sdim = 0
+              ip = 0
+              do i = 1, n
+                 cursl = selctg( alphar( i ), alphai( i ), beta( i ) )
+                 if( alphai( i )==zero ) then
+                    if( cursl )sdim = sdim + 1
+                    ip = 0
+                    if( cursl .and. .not.lastsl )info = n + 2
+                 else
+                    if( ip==1 ) then
+                       ! last eigenvalue of conjugate pair
+                       cursl = cursl .or. lastsl
+                       lastsl = cursl
+                       if( cursl )sdim = sdim + 2
+                       ip = -1
+                       if( cursl .and. .not.lst2sl )info = n + 2
+                    else
+                       ! first eigenvalue of conjugate pair
+                       ip = 1
+                    end if
+                 end if
+                 lst2sl = lastsl
+                 lastsl = cursl
+              end do
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_sgges3
+
+     !> SGGEV3: computes for a pair of N-by-N real nonsymmetric matrices (A,B)
+     !> the generalized eigenvalues, and optionally, the left and/or right
+     !> generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B .
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_sggev3( jobvl, jobvr, n, a, lda, b, ldb, alphar,alphai, beta, vl, ldvl, vr,&
+                ldvr, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: alphai(*), alphar(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+                     
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, itau, &
+                     iwrk, jc, jr, lwkopt
+           real(sp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -12
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -14
+           else if( lwork<max( 1, 8*n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_sgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1, 8*n, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_sormqr( 'L', 'T', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+              call stdlib_sgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, work, &
+                        -1, ierr )
+              lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_sorgqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, 3*n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_slaqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work( 1 ),KIND=ilp) )
+              else
+                 call stdlib_slaqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alphar, alphai, &
+                           beta, vl, ldvl, vr, ldvr,work, -1, 0, ierr )
+                 lwkopt = max( lwkopt, 2*n+int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = real( lwkopt,KIND=sp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_slamch( 'P' )
+           smlnum = stdlib_slamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_slabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_slange( 'M', n, n, a, lda, work )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_slascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_slange( 'M', n, n, b, ldb, work )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_slascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           iwrk = iright + n
+           call stdlib_sggbal( 'P', n, a, lda, b, ldb, ilo, ihi, work( ileft ),work( iright ), &
+                     work( iwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = iwrk
+           iwrk = itau + irows
+           call stdlib_sgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_sormqr( 'L', 'T', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_slaset( 'FULL', n, n, zero, one, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_slacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_sorgqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_slaset( 'FULL', n, n, zero, one, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_sgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk, ierr )
+           else
+              call stdlib_sgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_slaqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                     beta, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 110
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_stgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 110
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_sggbak( 'P', 'L', n, ilo, ihi, work( ileft ),work( iright ), n, vl, &
+                           ldvl, ierr )
+                 loop_50: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_50
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vl( jr, jc ) )+abs( vl( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_50
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vl( jr, jc ) = vl( jr, jc )*temp
+                          vl( jr, jc+1 ) = vl( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_50
+              end if
+              if( ilvr ) then
+                 call stdlib_sggbak( 'P', 'R', n, ilo, ihi, work( ileft ),work( iright ), n, vr, &
+                           ldvr, ierr )
+                 loop_100: do jc = 1, n
+                    if( alphai( jc )<zero )cycle loop_100
+                    temp = zero
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) ) )
+                       end do
+                    else
+                       do jr = 1, n
+                          temp = max( temp, abs( vr( jr, jc ) )+abs( vr( jr, jc+1 ) ) )
+                       end do
+                    end if
+                    if( temp<smlnum )cycle loop_100
+                    temp = one / temp
+                    if( alphai( jc )==zero ) then
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                       end do
+                    else
+                       do jr = 1, n
+                          vr( jr, jc ) = vr( jr, jc )*temp
+                          vr( jr, jc+1 ) = vr( jr, jc+1 )*temp
+                       end do
+                    end if
+                 end do loop_100
+              end if
+              ! end of eigenvector calculation
+           end if
+           ! undo scaling if necessary
+           110 continue
+           if( ilascl ) then
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphar, n, ierr )
+              call stdlib_slascl( 'G', 0, 0, anrmto, anrm, n, 1, alphai, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_slascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           work( 1 ) = real( lwkopt,KIND=sp)
+           return
+     end subroutine stdlib_sggev3
+
+     !> SGSVJ0: is called from SGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as SGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_sgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(sp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), sva(n), d(n), v(ldv,*)
+           real(sp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(sp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,float,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! .. row-cyclic jacobi svd algorithm with column pivoting ..
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           fastr( 1 ) = zero
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_sgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_sgesvj. for sweeps i=1:swband the procedure
+           ! ......
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           swband = 0
+           pskipped = 0
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_sswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_sswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          temp1 = d( p )
+                          d( p ) = d( q )
+                          d( q ) = temp1
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! some blas implementations compute stdlib_snrm2(m,a(1,p),1)
+              ! as sqrt(stdlib_sdot(m,a(1,p),1,a(1,p),1)), which may result in
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_snrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_snrm2 is available, the if-then-else
+              ! below should read "aapp = stdlib_snrm2( m, a(1,p), 1 ) * d(p)".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_snrm2( m, a( 1, p ), 1 )*d( p )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_slassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )*d( p )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                 ! Rotate
+                 ! rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq ) / aapq
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_saxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_saxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_scopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp, one, m,1, work, lda, &
+                                                ierr )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      temp1 = -aapq*d( p ) / d( q )
+                                      call stdlib_saxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                      call stdlib_slascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_snrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_slassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_snrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_slassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! Safe Gram Matrix Computation 
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq ) / aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_saxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_saxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_scopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_saxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_scopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_saxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_snrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_slassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_snrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_slassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_snrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_slassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=sp)*tol ) .and.( real( n,KIND=sp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of iterations.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given tolerance, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d.
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_sswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_sswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_sgsvj0
+
+     !> SGSVJ1: is called from SGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as SGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> SGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_sgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(sp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           real(sp), intent(inout) :: a(lda,*), d(n), sva(n), v(ldv,*)
+           real(sp), intent(out) :: work(lwork)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(sp) :: aapp, aapp0, aapq, aaqq, apoaq, aqoap, big, bigtheta, cs, large, mxaapq, &
+           mxsinj, rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, &
+                     thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Local Arrays 
+           real(sp) :: fastr(5)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,float,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           large = big / sqrt( real( m*n,KIND=sp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           fastr( 1 ) = zero
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_sgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_sgesvj.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+      ! ........................................................
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = 1, nblc
+                    jgl = n1 + ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! Safe Gram Matrix Computation 
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, p ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aapp, d( p ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, q ),1 )*d( q ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_sdot( m, a( 1, p ), 1, a( 1,q ), 1 )*d( p )&
+                                                *d( q ) / aaqq )/ aapp
+                                   else
+                                      call stdlib_scopy( m, a( 1, q ), 1, work, 1 )
+                                      call stdlib_slascl( 'G', 0, 0, aaqq, d( q ),m, 1, work, lda,&
+                                                 ierr )
+                                      aapq = stdlib_sdot( m, work, 1, a( 1, p ),1 )*d( p ) / &
+                                                aapp
+                                   end if
+                                end if
+                                mxaapq = max( mxaapq, abs( aapq ) )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq )>tol ) then
+                                   notrot = 0
+                 ! rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq ) / aapq
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t = half / theta
+                                         fastr( 3 ) = t*d( p ) / d( q )
+                                         fastr( 4 ) = -t*d( q ) / d( p )
+                                         call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1, fastr )
+                                                   
+                                         if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1,v( 1, q ),&
+                                                    1,fastr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq ) )
+                                         apoaq = d( p ) / d( q )
+                                         aqoap = d( q ) / d( p )
+                                         if( d( p )>=one ) then
+                                            if( d( q )>=one ) then
+                                               fastr( 3 ) = t*apoaq
+                                               fastr( 4 ) = -t*aqoap
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q )*cs
+                                               call stdlib_srotm( m, a( 1, p ), 1,a( 1, q ), 1,&
+                                                         fastr )
+                                               if( rsvec )call stdlib_srotm( mvl,v( 1, p ), 1, v( &
+                                                         1, q ),1, fastr )
+                                            else
+                                               call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( 1, &
+                                                         p ), 1 )
+                                               call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,a( &
+                                                         1, q ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, -t*aqoap,v( 1, q ), 1,v(&
+                                                             1, p ), 1 )
+                                                  call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ), 1,&
+                                                            v( 1, q ), 1 )
+                                               end if
+                                               d( p ) = d( p )*cs
+                                               d( q ) = d( q ) / cs
+                                            end if
+                                         else
+                                            if( d( q )>=one ) then
+                                               call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1, q &
+                                                         ), 1 )
+                                               call stdlib_saxpy( m, -cs*sn*aqoap,a( 1, q ), 1,a( &
+                                                         1, p ), 1 )
+                                               if( rsvec ) then
+                                                  call stdlib_saxpy( mvl, t*apoaq,v( 1, p ), 1,v( &
+                                                            1, q ), 1 )
+                                                  call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q ), &
+                                                            1,v( 1, p ), 1 )
+                                               end if
+                                               d( p ) = d( p ) / cs
+                                               d( q ) = d( q )*cs
+                                            else
+                                               if( d( p )>=d( q ) ) then
+                                                  call stdlib_saxpy( m, -t*aqoap,a( 1, q ), 1,a( &
+                                                            1, p ), 1 )
+                                                  call stdlib_saxpy( m, cs*sn*apoaq,a( 1, p ), 1,&
+                                                            a( 1, q ), 1 )
+                                                  d( p ) = d( p )*cs
+                                                  d( q ) = d( q ) / cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,-t*aqoap,v( 1, q ), 1,&
+                                                               v( 1, p ), 1 )
+                                                     call stdlib_saxpy( mvl,cs*sn*apoaq,v( 1, p ),&
+                                                                1,v( 1, q ), 1 )
+                                                  end if
+                                               else
+                                                  call stdlib_saxpy( m, t*apoaq,a( 1, p ), 1,a( 1,&
+                                                             q ), 1 )
+                                                  call stdlib_saxpy( m,-cs*sn*aqoap,a( 1, q ), 1,&
+                                                            a( 1, p ), 1 )
+                                                  d( p ) = d( p ) / cs
+                                                  d( q ) = d( q )*cs
+                                                  if( rsvec ) then
+                                                     call stdlib_saxpy( mvl,t*apoaq, v( 1, p ),1, &
+                                                               v( 1, q ), 1 )
+                                                     call stdlib_saxpy( mvl,-cs*sn*aqoap,v( 1, q )&
+                                                               , 1,v( 1, p ), 1 )
+                                                  end if
+                                               end if
+                                            end if
+                                         end if
+                                      end if
+                                   else
+                                      if( aapp>aaqq ) then
+                                         call stdlib_scopy( m, a( 1, p ), 1, work,1 )
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( p ) / d( q )
+                                         call stdlib_saxpy( m, temp1, work, 1,a( 1, q ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      else
+                                         call stdlib_scopy( m, a( 1, q ), 1, work,1 )
+                                         call stdlib_slascl( 'G', 0, 0, aaqq, one,m, 1, work, lda,&
+                                                    ierr )
+                                         call stdlib_slascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         temp1 = -aapq*d( q ) / d( p )
+                                         call stdlib_saxpy( m, temp1, work, 1,a( 1, p ), 1 )
+                                                   
+                                         call stdlib_slascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq*aapq ) )
+                                         mxsinj = max( mxsinj, sfmin )
+                                      end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q)
+                 ! .. recompute sva(q)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_snrm2( m, a( 1, q ), 1 )*d( q )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_slassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )*d( q )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_snrm2( m, a( 1, p ), 1 )*d( p )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_slassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )*d( p )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+                 ! skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+            ! if ( notrot >= emptsw )  go to 2011
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+      ! **      if ( notrot >= emptsw )  go to 2011
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **   if ( notrot >= emptsw ) go to 1994
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_snrm2( m, a( 1, n ), 1 )*d( n )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_slassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )*d( n )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<real( n,KIND=sp)*tol ) .and.( real( n,KIND=sp)&
+                        *mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:) reaching this point means that the procedure has completed the given
+           ! number of sweeps.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means that during the i-th sweep all pivots were
+           ! below the given threshold, causing early exit.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector d
+           do p = 1, n - 1
+              q = stdlib_isamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 temp1 = d( p )
+                 d( p ) = d( q )
+                 d( q ) = temp1
+                 call stdlib_sswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_sswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_sgsvj1
+
+     !> SHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_shseqr( job, compz, n, ilo, ihi, h, ldh, wr, wi, z,ldz, work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: wi(*), work(*), wr(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_slahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_slahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           ! Local Arrays 
+           real(sp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: i, kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: max,min,real
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = real( max( 1, n ),KIND=sp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'SHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_slaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                        work, lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=sp), work( 1 ) )
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_sgebal ====
+              do i = 1, ilo - 1
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              do i = ihi + 1, n
+                 wr( i ) = h( i, i )
+                 wi( i ) = zero
+              end do
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_slaset( 'A', n, n, zero, one, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 wr( ilo ) = h( ilo, ilo )
+                 wi( ilo ) = zero
+                 return
+              end if
+              ! ==== stdlib_slahqr/stdlib_slaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'SHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_slaqr0 for big matrices; stdlib_slahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_slaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           work, lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_slahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi, ilo,ihi, z, ldz, &
+                           info )
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_slahqr failure!  stdlib_slaqr0 sometimes succeeds
+                    ! .    when stdlib_slahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_slaqr0 directly. ====
+                       call stdlib_slaqr0( wantt, wantz, n, ilo, kbot, h, ldh, wr,wi, ilo, ihi, z,&
+                                  ldz, work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_slaqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_slaqr0. ====
+                       call stdlib_slacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = zero
+                       call stdlib_slaset( 'A', nl, nl-n, zero, zero, hl( 1, n+1 ),nl )
+                       call stdlib_slaqr0( wantt, wantz, nl, ilo, kbot, hl, nl, wr,wi, ilo, ihi, &
+                                 z, ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_slacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_slaset( 'L', n-2, n-2, zero, zero,&
+                         h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = max( real( max( 1, n ),KIND=sp), work( 1 ) )
+           end if
+     end subroutine stdlib_shseqr
+
+     !> SLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, SLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by SLALSA.
+
+     pure subroutine stdlib_slalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, work,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: b(ldb,*)
+           real(sp), intent(out) :: bx(ldbx,*), work(*)
+           real(sp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, lf, ll, lvl, lvl2, nd, ndb1, ndiml, ndimr, &
+                     nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_slasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 50.
+           if( icompq==1 ) then
+              go to 50
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_slasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_sgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,b( nlf, 1 ), ldb, &
+                        zero, bx( nlf, 1 ), ldbx )
+              call stdlib_sgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,b( nrf, 1 ), ldb, &
+                        zero, bx( nrf, 1 ), ldbx )
+           end do
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_scopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_slals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), work,info )
+              end do
+           end do
+           go to 90
+           ! icompq = 1: applying back the right singular vector factors.
+           50 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_70: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_slals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), work,info )
+              end do
+           end do loop_70
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_slasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              call stdlib_sgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,b( nlf, 1 ), &
+                        ldb, zero, bx( nlf, 1 ), ldbx )
+              call stdlib_sgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,b( nrf, 1 ), &
+                        ldb, zero, bx( nrf, 1 ), ldbx )
+           end do
+           90 continue
+           return
+     end subroutine stdlib_slalsa
+
+     !> SLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_slalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, iwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(sp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: b(ldb,*), d(*), e(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           iwk, j, k, nlvl, nm1, nsize, nsub, nwork, perm, poles, s, sizei, smlszp, sqre, st, st1,&
+                      u, vt, z
+           real(sp) :: cs, eps, orgnrm, r, rcnd, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,log,real,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLALSD', -info )
+              return
+           end if
+           eps = stdlib_slamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_slaset( 'A', 1, nrhs, zero, zero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_slascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_srot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    work( i*2-1 ) = cs
+                    work( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = work( j*2-1 )
+                       sn = work( j*2 )
+                       call stdlib_srot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_slanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_slaset( 'A', n, nrhs, zero, zero, b, ldb )
+              return
+           end if
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              nwork = 1 + n*n
+              call stdlib_slaset( 'A', n, n, zero, one, work, n )
+              call stdlib_slasdq( 'U', 0, n, n, 0, nrhs, d, e, work, n, work, n, b,ldb, work( &
+                        nwork ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              tol = rcnd*abs( d( stdlib_isamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_slaset( 'A', 1, nrhs, zero, zero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_slascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              call stdlib_sgemm( 'T', 'N', n, nrhs, n, one, work, n, b, ldb, zero,work( nwork ), &
+                        n )
+              call stdlib_slacpy( 'A', n, nrhs, work( nwork ), n, b, ldb )
+              ! unscale.
+              call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_slasrt( 'D', n, d, info )
+              call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=sp) / real( smlsiz+1,KIND=sp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           bx = givnum + 2*nlvl*n
+           nwork = bx + n*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_60: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_scopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_scopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_slasdq.
+                    call stdlib_slaset( 'A', nsize, nsize, zero, one,work( vt+st1 ), n )
+                    call stdlib_slasdq( 'U', 0, nsize, nsize, 0, nrhs, d( st ),e( st ), work( vt+&
+                              st1 ), n, work( nwork ),n, b( st, 1 ), ldb, work( nwork ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    call stdlib_slacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_slasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), work( u+st1 &
+                    ), n, work( vt+st1 ),iwork( k+st1 ), work( difl+st1 ),work( difr+st1 ), work( &
+                    z+st1 ),work( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), n, iwork( &
+                    perm+st1 ),work( givnum+st1 ), work( c+st1 ),work( s+st1 ), work( nwork ), &
+                              iwork( iwk ),info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_slalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( &
+                     difr+st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                     givcol+st1 ), n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+&
+                               st1 ), work( nwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_60
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_isamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_slaset( 'A', 1, nrhs, zero, zero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_slascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_scopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 call stdlib_sgemm( 'T', 'N', nsize, nrhs, nsize, one,work( vt+st1 ), n, work( &
+                           bxst ), n, zero,b( st, 1 ), ldb )
+              else
+                 call stdlib_slalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  work( u+st1 ), n,work( vt+st1 ), iwork( k+st1 ),work( difl+st1 ), work( difr+&
+                  st1 ),work( z+st1 ), work( poles+st1 ),iwork( givptr+st1 ), iwork( givcol+st1 ),&
+                   n,iwork( perm+st1 ), work( givnum+st1 ),work( c+st1 ), work( s+st1 ), work( &
+                             nwork ),iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do
+           ! unscale and sort the singular values.
+           call stdlib_slascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_slasrt( 'D', n, d, info )
+           call stdlib_slascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_slalsd
+
+     !> SLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_slaqr0( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(sp), parameter :: wilk1 = 0.75_sp
+           real(sp), parameter :: wilk2 = -0.4375_sp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_slahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(sp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,max,min,mod,real
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_slahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_slahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'SLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'SLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_slaqr3 ====
+              call stdlib_slaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_slaqr5, stdlib_slaqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=sp)
+                 return
+              end if
+              ! ==== stdlib_slahqr/stdlib_slaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'SLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'SLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'SLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_slaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_slaqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_slaqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_slanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_slaqr4 or
+                       ! .    stdlib_slahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_slacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_slaqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, work,lwork, inf )
+                          else
+                             call stdlib_slahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( &
+                                       ks ),wi( ks ), 1, 1, zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_slanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_slaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=sp)
+     end subroutine stdlib_slaqr0
+
+     !> Aggressive early deflation:
+     !> SLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an orthogonal similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an orthogonal similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     subroutine stdlib_slaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, ns, nd,&
+                sr, si, v, ldv, nh, t,ldt, nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: si(*), sr(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, beta, cc, cs, dd, evi, evk, foo, s, safmax, safmin, smlnum, sn, &
+                     tau, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, k, kcol, kend, kln, krow, kwtop, &
+                     ltop, lwk1, lwk2, lwk3, lwkopt, nmin
+           logical(lk) :: bulge, sorted
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,max,min,real,sqrt
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_sgehrd ====
+              call stdlib_sgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_sormhr ====
+              call stdlib_sormhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_slaqr4 ====
+              call stdlib_slaqr4( .true., .true., jw, 1, jw, t, ldt, sr, si, 1, jw,v, ldv, work, -&
+                        1, infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = real( lwkopt,KIND=sp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = one
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one / safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = zero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sr( kwtop ) = h( kwtop, kwtop )
+              si( kwtop ) = zero
+              ns = 1
+              nd = 0
+              if( abs( s )<=max( smlnum, ulp*abs( h( kwtop, kwtop ) ) ) )then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = zero
+              end if
+              work( 1 ) = one
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_slacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_scopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_slaset( 'A', jw, jw, zero, one, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'SLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_slaqr4( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, work, lwork, infqr )
+           else
+              call stdlib_slahqr( .true., .true., jw, 1, jw, t, ldt, sr( kwtop ),si( kwtop ), 1, &
+                        jw, v, ldv, infqr )
+           end if
+           ! ==== stdlib_strexc needs a clean margin near the diagonal ====
+           do j = 1, jw - 3
+              t( j+2, j ) = zero
+              t( j+3, j ) = zero
+           end do
+           if( jw>2 )t( jw, jw-2 ) = zero
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           20 continue
+           if( ilst<=ns ) then
+              if( ns==1 ) then
+                 bulge = .false.
+              else
+                 bulge = t( ns, ns-1 )/=zero
+              end if
+              ! ==== small spike tip test for deflation ====
+              if( .not. bulge ) then
+                 ! ==== real eigenvalue ====
+                 foo = abs( t( ns, ns ) )
+                 if( foo==zero )foo = abs( s )
+                 if( abs( s*v( 1, ns ) )<=max( smlnum, ulp*foo ) ) then
+                    ! ==== deflatable ====
+                    ns = ns - 1
+                 else
+                    ! ==== undeflatable.   move it up out of the way.
+                    ! .    (stdlib_strexc can not fail in this case.) ====
+                    ifst = ns
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 1
+                 end if
+              else
+                 ! ==== complex conjugate pair ====
+                 foo = abs( t( ns, ns ) ) + sqrt( abs( t( ns, ns-1 ) ) )*sqrt( abs( t( ns-1, ns ) &
+                           ) )
+                 if( foo==zero )foo = abs( s )
+                 if( max( abs( s*v( 1, ns ) ), abs( s*v( 1, ns-1 ) ) )<=max( smlnum, ulp*foo ) ) &
+                           then
+                    ! ==== deflatable ====
+                    ns = ns - 2
+                 else
+                    ! ==== undeflatable. move them up out of the way.
+                    ! .    fortunately, stdlib_strexc does the right thing with
+                    ! .    ilst in case of a rare exchange failure. ====
+                    ifst = ns
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    ilst = ilst + 2
+                 end if
+              end if
+              ! ==== end deflation detection loop ====
+              go to 20
+           end if
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = zero
+           if( ns<jw ) then
+              ! ==== sorting diagonal blocks of t improves accuracy for
+              ! .    graded matrices.  bubble sort deals well with
+              ! .    exchange failures. ====
+              sorted = .false.
+              i = ns + 1
+              30 continue
+              if( sorted )go to 50
+              sorted = .true.
+              kend = i - 1
+              i = infqr + 1
+              if( i==ns ) then
+                 k = i + 1
+              else if( t( i+1, i )==zero ) then
+                 k = i + 1
+              else
+                 k = i + 2
+              end if
+              40 continue
+              if( k<=kend ) then
+                 if( k==i+1 ) then
+                    evi = abs( t( i, i ) )
+                 else
+                    evi = abs( t( i, i ) ) + sqrt( abs( t( i+1, i ) ) )*sqrt( abs( t( i, i+1 ) ) )
+                              
+                 end if
+                 if( k==kend ) then
+                    evk = abs( t( k, k ) )
+                 else if( t( k+1, k )==zero ) then
+                    evk = abs( t( k, k ) )
+                 else
+                    evk = abs( t( k, k ) ) + sqrt( abs( t( k+1, k ) ) )*sqrt( abs( t( k, k+1 ) ) )
+                              
+                 end if
+                 if( evi>=evk ) then
+                    i = k
+                 else
+                    sorted = .false.
+                    ifst = i
+                    ilst = k
+                    call stdlib_strexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, work,info )
+                    if( info==0 ) then
+                       i = ilst
+                    else
+                       i = k
+                    end if
+                 end if
+                 if( i==kend ) then
+                    k = i + 1
+                 else if( t( i+1, i )==zero ) then
+                    k = i + 1
+                 else
+                    k = i + 2
+                 end if
+                 go to 40
+              end if
+              go to 30
+              50 continue
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           i = jw
+           60 continue
+           if( i>=infqr+1 ) then
+              if( i==infqr+1 ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else if( t( i, i-1 )==zero ) then
+                 sr( kwtop+i-1 ) = t( i, i )
+                 si( kwtop+i-1 ) = zero
+                 i = i - 1
+              else
+                 aa = t( i-1, i-1 )
+                 cc = t( i, i-1 )
+                 bb = t( i-1, i )
+                 dd = t( i, i )
+                 call stdlib_slanv2( aa, bb, cc, dd, sr( kwtop+i-2 ),si( kwtop+i-2 ), sr( kwtop+i-&
+                           1 ),si( kwtop+i-1 ), cs, sn )
+                 i = i - 2
+              end if
+              go to 60
+           end if
+           if( ns<jw .or. s==zero ) then
+              if( ns>1 .and. s/=zero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_scopy( ns, v, ldv, work, 1 )
+                 beta = work( 1 )
+                 call stdlib_slarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = one
+                 call stdlib_slaset( 'L', jw-2, jw-2, zero, zero, t( 3, 1 ), ldt )
+                 call stdlib_slarf( 'L', ns, jw, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_slarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_slarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_sgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*v( 1, 1 )
+              call stdlib_slacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_scopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=zero )call stdlib_sormhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_sgemm( 'N', 'N', kln, jw, jw, one, h( krow, kwtop ),ldh, v, ldv, &
+                           zero, wv, ldwv )
+                 call stdlib_slacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_sgemm( 'C', 'N', jw, kln, jw, one, v, ldv,h( kwtop, kcol ), ldh, &
+                              zero, t, ldt )
+                    call stdlib_slacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_sgemm( 'N', 'N', kln, jw, jw, one, z( krow, kwtop ),ldz, v, ldv, &
+                              zero, wv, ldwv )
+                    call stdlib_slacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = real( lwkopt,KIND=sp)
+     end subroutine stdlib_slaqr3
+
+     !> SLAQR4: implements one level of recursion for SLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by SLAQR0 and, for large enough
+     !> deflation window size, it may be called by SLAQR3.  This
+     !> subroutine is identical to SLAQR0 except that it calls SLAQR2
+     !> instead of SLAQR3.
+     !> SLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**T, where T is an upper quasi-triangular matrix (the
+     !> Schur form), and Z is the orthogonal matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input orthogonal
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the orthogonal matrix Q:  A = Q*H*Q**T = (QZ)*T*(QZ)**T.
+
+     subroutine stdlib_slaqr4( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           real(sp), intent(inout) :: h(ldh,*), z(ldz,*)
+           real(sp), intent(out) :: wi(*), work(*), wr(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(sp), parameter :: wilk1 = 0.75_sp
+           real(sp), parameter :: wilk2 = -0.4375_sp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_slahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constants wilk1 and wilk2 are used to form the
+           ! .    exceptional shifts. ====
+           
+           
+           ! Local Scalars 
+           real(sp) :: aa, bb, cc, cs, dd, sn, ss, swap
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           real(sp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,int,max,min,mod,real
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = one
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_slahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_slahqr( wantt, wantz, n, ilo, ihi, h, ldh, wr, wi,iloz, &
+                        ihiz, z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'SLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'SLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_slaqr2 ====
+              call stdlib_slaqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, wr, wi, h, ldh, n, h, ldh,n, h, ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_slaqr5, stdlib_slaqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = real( lwkopt,KIND=sp)
+                 return
+              end if
+              ! ==== stdlib_slahqr/stdlib_slaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'SLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'SLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'SLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_80: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 90
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==zero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( abs( h( kwtop, kwtop-1 ) )>abs( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_slaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, wr, wi, h( kv, 1 ), ldh,nho, h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh,&
+                           work, lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_slaqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_slaqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, max( ks+1, ktop+2 ), -2
+                          ss = abs( h( i, i-1 ) ) + abs( h( i-1, i-2 ) )
+                          aa = wilk1*ss + h( i, i )
+                          bb = ss
+                          cc = wilk2*ss
+                          dd = aa
+                          call stdlib_slanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),wr( i ), wi( i &
+                                    ), cs, sn )
+                       end do
+                       if( ks==ktop ) then
+                          wr( ks+1 ) = h( ks+1, ks+1 )
+                          wi( ks+1 ) = zero
+                          wr( ks ) = wr( ks+1 )
+                          wi( ks ) = wi( ks+1 )
+                       end if
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_slahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_slacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_slahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, wr( ks &
+                                    ), wi( ks ),1, 1, zdum, 1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  ====
+                          if( ks>=kbot ) then
+                             aa = h( kbot-1, kbot-1 )
+                             cc = h( kbot, kbot-1 )
+                             bb = h( kbot-1, kbot )
+                             dd = h( kbot, kbot )
+                             call stdlib_slanv2( aa, bb, cc, dd, wr( kbot-1 ),wi( kbot-1 ), wr( &
+                                       kbot ),wi( kbot ), cs, sn )
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little)
+                          ! .    bubble sort keeps complex conjugate
+                          ! .    pairs together. ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( abs( wr( i ) )+abs( wi( i ) )<abs( wr( i+1 ) )+abs( wi( i+1 ) &
+                                          ) ) then
+                                   sorted = .false.
+                                   swap = wr( i )
+                                   wr( i ) = wr( i+1 )
+                                   wr( i+1 ) = swap
+                                   swap = wi( i )
+                                   wi( i ) = wi( i+1 )
+                                   wi( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                       ! ==== shuffle shifts into pairs of real shifts
+                       ! .    and pairs of complex conjugate shifts
+                       ! .    assuming complex conjugate shifts are
+                       ! .    already adjacent to one another. (yes,
+                       ! .    they are.)  ====
+                       do i = kbot, ks + 2, -2
+                          if( wi( i )/=-wi( i-1 ) ) then
+                             swap = wr( i )
+                             wr( i ) = wr( i-1 )
+                             wr( i-1 ) = wr( i-2 )
+                             wr( i-2 ) = swap
+                             swap = wi( i )
+                             wi( i ) = wi( i-1 )
+                             wi( i-1 ) = wi( i-2 )
+                             wi( i-2 ) = swap
+                          end if
+                       end do
+                    end if
+                    ! ==== if there are only two shifts and both are
+                    ! .    real, then use only one.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( wi( kbot )==zero ) then
+                          if( abs( wr( kbot )-h( kbot, kbot ) )<abs( wr( kbot-1 )-h( kbot, kbot ) &
+                                    ) ) then
+                             wr( kbot-1 ) = wr( kbot )
+                          else
+                             wr( kbot ) = wr( kbot-1 )
+                          end if
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping one to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_slaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,wr( ks ), wi( ks )&
+                    , h, ldh, iloz, ihiz, z,ldz, work, 3, h( ku, 1 ), ldh, nve,h( kwv, 1 ), ldh, &
+                              nho, h( ku, kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_80
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              90 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = real( lwkopt,KIND=sp)
+     end subroutine stdlib_slaqr4
+
+     !> SLAQZ0: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**T,  B = Q1*T*Z1**T,
+     !> as computed by SGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**T,  T = Q*P*Z**T,
+     !> where Q and Z are orthogonal matrices, P is an upper triangular
+     !> matrix, and S is a quasi-triangular matrix with 1-by-1 and 2-by-2
+     !> diagonal blocks.
+     !> The 1-by-1 blocks correspond to real eigenvalues of the matrix pair
+     !> (H,T) and the 2-by-2 blocks correspond to complex conjugate pairs of
+     !> eigenvalues.
+     !> Additionally, the 2-by-2 upper triangular diagonal blocks of P
+     !> corresponding to 2-by-2 blocks of S are reduced to positive diagonal
+     !> form, i.e., if S(j+1,j) is non-zero, then P(j+1,j) = P(j,j+1) = 0,
+     !> P(j,j) > 0, and P(j+1,j+1) > 0.
+     !> Optionally, the orthogonal matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> orthogonal matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the orthogonal matrices from SGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the orthogonal factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**T,  B = (Q1*Q)*P*(Z1*Z)**T.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Real eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_slaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alphar, &
+               alphai, beta,q, ldq, z, ldz, work, lwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           real(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), alphar(&
+                      * ), alphai( * ), beta( * ), work( * )
+           
+           ! local scalars
+           real(sp) :: smlnum, ulp, eshift, safmin, safmax, c1, s1, temp, swap
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_deflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost, i
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=sp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'SLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'SLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'SLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'SLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'SLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=sp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_shgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai,&
+                         beta, q, ldq, z, ldz, work,lwork, info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_slaqz3
+           nw = max( nwr, nmin )
+           call stdlib_slaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_deflated, alphar,alphai, beta, work, nw, work, nw, work, -1, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_slaqz4
+           call stdlib_slaqz4( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alphar,alphai, beta, a, &
+                     lda, b, ldb, q, ldq, z, ldz, work,nbr, work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=sp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_slaset( 'FULL', n, n, zero, one, q, ldq )
+           if( iwantz==3 ) call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+           ! get machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 3*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop-1, istop-2 ) ) <= max( smlnum,ulp*( abs( a( istop-1, istop-1 ) )+&
+                        abs( a( istop-2,istop-2 ) ) ) ) ) then
+                 a( istop-1, istop-2 ) = zero
+                 istop = istop-2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+&
+                        abs( a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = zero
+                 istop = istop-1
+                 ld = 0
+                 eshift = zero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+2, istart+1 ) ) <= max( smlnum,ulp*( abs( a( istart+1, istart+1 &
+                        ) )+abs( a( istart+2,istart+2 ) ) ) ) ) then
+                 a( istart+2, istart+1 ) = zero
+                 istart = istart+2
+                 ld = 0
+                 eshift = zero
+              else if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart )&
+                         )+abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = zero
+                 istart = istart+1
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = zero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 temp = zero
+                 if( k < istop ) then
+                    temp = temp+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    temp = temp+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*temp ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_slartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = zero
+                       call stdlib_srot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_srot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_srot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_slartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = zero
+                          call stdlib_srot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_srot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_srot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, s1 )
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_slartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = zero
+                       call stdlib_srot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_srot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_srot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, s1 )
+                                    
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = zero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using qz_small because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_slaqz3( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_deflated,alphar, alphai, beta, work, nw, work( nw**2+1 ),&
+                         nw, work( 2*nw**2+1 ), lwork-2*nw**2, rec,aed_info )
+              if ( n_deflated > 0 ) then
+                 istop = istop-n_deflated
+                 ld = 0
+                 eshift = zero
+              end if
+              if ( 100*n_deflated > nibble*( n_deflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_deflated-n_undeflated+1
+              ! shuffle shifts to put double shifts in front
+              ! this ensures that we don't split up a double shift
+              do i = shiftpos, shiftpos+n_undeflated-1, 2
+                 if( alphai( i )/=-alphai( i+1 ) ) then
+                    swap = alphar( i )
+                    alphar( i ) = alphar( i+1 )
+                    alphar( i+1 ) = alphar( i+2 )
+                    alphar( i+2 ) = swap
+                    swap = alphai( i )
+                    alphai( i ) = alphai( i+1 )
+                    alphai( i+1 ) = alphai( i+2 )
+                    alphai( i+2 ) = swap
+                    swap = beta( i )
+                    beta( i ) = beta( i+1 )
+                    beta( i+1 ) = beta( i+2 )
+                    beta( i+2 ) = swap
+                 end if
+              end do
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=sp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+one/( safmin*real( maxit,KIND=sp) )
+                 end if
+                 alphar( shiftpos ) = one
+                 alphar( shiftpos+1 ) = zero
+                 alphai( shiftpos ) = zero
+                 alphai( shiftpos+1 ) = zero
+                 beta( shiftpos ) = eshift
+                 beta( shiftpos+1 ) = eshift
+                 ns = 2
+              end if
+              ! time for a qz sweep
+              call stdlib_slaqz4( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alphar( &
+              shiftpos ), alphai( shiftpos ),beta( shiftpos ), a, lda, b, ldb, q, ldq, z, ldz,&
+              work, nblock, work( nblock**2+1 ), nblock,work( 2*nblock**2+1 ), lwork-2*nblock**2,&
+                        sweep_info )
+           end do
+           ! call stdlib_shgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_shgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_shgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alphar, alphai, &
+                  beta, q, ldq, z, ldz, work, lwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_slaqz0
+
+     !> SLAQZ3: performs AED
+
+     recursive subroutine stdlib_slaqz3( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alphar, alphai, beta, qc, ldqc,zc, ldzc, work, lwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           real(sp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq, * ),z( ldz, * ), alphar(&
+                      * ), alphai( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           real(sp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           real(sp), intent(out) :: work(*)
+           
+           ! local scalars
+           logical(lk) :: bulge
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, stgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(sp) :: s, smlnum, ulp, safmin, safmax, c1, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = zero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_stgexc( .true., .true., jw, a, lda, b, ldb, qc, ldqc, zc,ldzc, ifst, ilst, &
+                     work, -1, stgexc_info )
+           lworkreq = int( work( 1 ),KIND=ilp)
+           call stdlib_slaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work, -1, rec+1, qz_small_info )
+           lworkreq = max( lworkreq, int( work( 1 ),KIND=ilp)+2*jw**2 )
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLAQZ3', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_slabad( safmin, safmax )
+           ulp = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=sp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alphar( kwtop ) = a( kwtop, kwtop )
+              alphai( kwtop ) = zero
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = zero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_slacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_slacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_slaset( 'FULL', jw, jw, zero, one, qc, ldqc )
+           call stdlib_slaset( 'FULL', jw, jw, zero, one, zc, ldzc )
+           call stdlib_slaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alphar, alphai, beta, qc,ldqc, zc, ldzc, work( 2*jw**2+1 ), lwork-2*jw**2,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_slacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_slacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == zero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                 bulge = .false.
+                 if ( kwbot-kwtop+1 >= 2 ) then
+                    bulge = a( kwbot, kwbot-1 ) /= zero
+                 end if
+                 if ( bulge ) then
+                    ! try to deflate complex conjugate eigenvalue pair
+                    temp = abs( a( kwbot, kwbot ) )+sqrt( abs( a( kwbot,kwbot-1 ) ) )*sqrt( abs( &
+                              a( kwbot-1, kwbot ) ) )
+                    if( temp == zero )then
+                       temp = abs( s )
+                    end if
+                    if ( max( abs( s*qc( 1, kwbot-kwtop ) ), abs( s*qc( 1,kwbot-kwtop+1 ) ) ) <= &
+                              max( smlnum,ulp*temp ) ) then
+                       ! deflatable
+                       kwbot = kwbot-2
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_stgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,stgexc_info )
+                                 
+                       k2 = k2+2
+                    end if
+                    k = k+2
+                 else
+                    ! try to deflate real eigenvalue
+                    temp = abs( a( kwbot, kwbot ) )
+                    if( temp == zero ) then
+                       temp = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*temp, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_stgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                       kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, work, lwork,stgexc_info )
+                                 
+                       k2 = k2+1
+                    end if
+                    k = k+1
+                 end if
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              bulge = .false.
+              if ( k < ihi ) then
+                 if ( a( k+1, k ) /= zero ) then
+                    bulge = .true.
+                 end if
+              end if
+              if ( bulge ) then
+                 ! 2x2 eigenvalue block
+                 call stdlib_slag2( a( k, k ), lda, b( k, k ), ldb, safmin,beta( k ), beta( k+1 ),&
+                            alphar( k ),alphar( k+1 ), alphai( k ) )
+                 alphai( k+1 ) = -alphai( k )
+                 k = k+2
+              else
+                 ! 1x1 eigenvalue block
+                 alphar( k ) = a( k, k )
+                 alphai( k ) = zero
+                 beta( k ) = b( k, k )
+                 k = k+1
+              end if
+           end do
+           if ( kwtop /= ilo .and. s /= zero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 )*qc( 1,1:jw-nd )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_slartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = zero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_srot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_srot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_srot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, s1 )
+                           
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 if ( ( k >= kwtop+1 ) .and. a( k+1, k-1 ) /= zero ) then
+                    ! move double pole block down and remove it
+                    do k2 = k-1, kwbot-2
+                       call stdlib_slaqz2( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b,&
+                                  ldb, jw, kwtop, qc,ldqc, jw, kwtop, zc, ldzc )
+                    end do
+                    k = k-2
+                 else
+                    ! k points to single shift
+                    do k2 = k, kwbot-2
+                       ! move shift down
+                       call stdlib_slartg( b( k2+1, k2+1 ), b( k2+1, k2 ), c1, s1,temp )
+                       b( k2+1, k2+1 ) = temp
+                       b( k2+1, k2 ) = zero
+                       call stdlib_srot( k2+2-istartm+1, a( istartm, k2+1 ), 1,a( istartm, k2 ), &
+                                 1, c1, s1 )
+                       call stdlib_srot( k2-istartm+1, b( istartm, k2+1 ), 1,b( istartm, k2 ), 1, &
+                                 c1, s1 )
+                       call stdlib_srot( jw, zc( 1, k2+1-kwtop+1 ), 1, zc( 1,k2-kwtop+1 ), 1, c1, &
+                                 s1 )
+                       call stdlib_slartg( a( k2+1, k2 ), a( k2+2, k2 ), c1, s1,temp )
+                       a( k2+1, k2 ) = temp
+                       a( k2+2, k2 ) = zero
+                       call stdlib_srot( istopm-k2, a( k2+1, k2+1 ), lda, a( k2+2,k2+1 ), lda, c1,&
+                                  s1 )
+                       call stdlib_srot( istopm-k2, b( k2+1, k2+1 ), ldb, b( k2+2,k2+1 ), ldb, c1,&
+                                  s1 )
+                       call stdlib_srot( jw, qc( 1, k2+1-kwtop+1 ), 1, qc( 1,k2+2-kwtop+1 ), 1, &
+                                 c1, s1 )
+                    end do
+                    ! remove the shift
+                    call stdlib_slartg( b( kwbot, kwbot ), b( kwbot, kwbot-1 ), c1,s1, temp )
+                              
+                    b( kwbot, kwbot ) = temp
+                    b( kwbot, kwbot-1 ) = zero
+                    call stdlib_srot( kwbot-istartm, b( istartm, kwbot ), 1,b( istartm, kwbot-1 ),&
+                               1, c1, s1 )
+                    call stdlib_srot( kwbot-istartm+1, a( istartm, kwbot ), 1,a( istartm, kwbot-1 &
+                              ), 1, c1, s1 )
+                    call stdlib_srot( jw, zc( 1, kwbot-kwtop+1 ), 1, zc( 1,kwbot-1-kwtop+1 ), 1, &
+                              c1, s1 )
+                    k = k-1
+                 end if
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_sgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, zero, work, jw )
+              call stdlib_slacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_sgemm( 'T', 'N', jw, istopm-ihi, jw, one, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, zero, work, jw )
+              call stdlib_slacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_sgemm( 'N', 'N', n, jw, jw, one, q( 1, kwtop ), ldq, qc,ldqc, zero, &
+                        work, n )
+              call stdlib_slacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_sgemm( 'N', 'N', kwtop-istartm, jw, jw, one, a( istartm,kwtop ), lda, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_slacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop &
+                        ), lda )
+              call stdlib_sgemm( 'N', 'N', kwtop-istartm, jw, jw, one, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, zero, work,kwtop-istartm )
+              call stdlib_slacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop &
+                        ), ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_sgemm( 'N', 'N', n, jw, jw, one, z( 1, kwtop ), ldz, zc,ldzc, zero, &
+                        work, n )
+              call stdlib_slacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_slaqz3
+
+     !> To find the desired eigenvalues of a given real symmetric
+     !> tridiagonal matrix T, SLARRE: sets any "small" off-diagonal
+     !> elements to zero, and for each unreduced block T_i, it finds
+     !> (a) a suitable shift at one end of the block's spectrum,
+     !> (b) the base representation, T_i - sigma_i I = L_i D_i L_i^T, and
+     !> (c) eigenvalues of each L_i D_i L_i^T.
+     !> The representations and eigenvalues found are then used by
+     !> SSTEMR to compute the eigenvectors of T.
+     !> The accuracy varies depending on whether bisection is used to
+     !> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to
+     !> conpute all and then discard any unwanted one.
+     !> As an added benefit, SLARRE also outputs the n
+     !> Gerschgorin intervals for the matrices L_i D_i L_i^T.
+
+     pure subroutine stdlib_slarre( range, n, vl, vu, il, iu, d, e, e2,rtol1, rtol2, spltol, &
+               nsplit, isplit, m,w, werr, wgap, iblock, indexw, gers, pivmin,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: range
+           integer(ilp), intent(in) :: il, iu, n
+           integer(ilp), intent(out) :: info, m, nsplit
+           real(sp), intent(out) :: pivmin
+           real(sp), intent(in) :: rtol1, rtol2, spltol
+           real(sp), intent(inout) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iblock(*), isplit(*), iwork(*), indexw(*)
+           real(sp), intent(inout) :: d(*), e(*), e2(*)
+           real(sp), intent(out) :: gers(*), w(*), werr(*), wgap(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: hndrd = 100.0_sp
+           real(sp), parameter :: pert = 4.0_sp
+           real(sp), parameter :: fourth = one/four
+           real(sp), parameter :: fac = half
+           real(sp), parameter :: maxgrowth = 64.0_sp
+           real(sp), parameter :: fudge = 2.0_sp
+           integer(ilp), parameter :: maxtry = 6
+           integer(ilp), parameter :: allrng = 1
+           integer(ilp), parameter :: indrng = 2
+           integer(ilp), parameter :: valrng = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: forceb, norep, usedqd
+           integer(ilp) :: cnt, cnt1, cnt2, i, ibegin, idum, iend, iinfo, in, indl, indu, irange, &
+                     j, jblk, mb, mm, wbegin, wend
+           real(sp) :: avgap, bsrtol, clwdth, dmax, dpivot, eabs, emax, eold, eps, gl, gu, isleft,&
+                      isrght, rtl, rtol, s1, s2, safmin, sgndef, sigma, spdiam, tau, tmp, tmp1
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           ! decode range
+           if( stdlib_lsame( range, 'A' ) ) then
+              irange = allrng
+           else if( stdlib_lsame( range, 'V' ) ) then
+              irange = valrng
+           else if( stdlib_lsame( range, 'I' ) ) then
+              irange = indrng
+           end if
+           m = 0
+           ! get machine constants
+           safmin = stdlib_slamch( 'S' )
+           eps = stdlib_slamch( 'P' )
+           ! set parameters
+           rtl = hndrd*eps
+           ! if one were ever to ask for less initial precision in bsrtol,
+           ! one should keep in mind that for the subset case, the extremal
+           ! eigenvalues must be at least as accurate as the current setting
+           ! (eigenvalues in the middle need not as much accuracy)
+           bsrtol = sqrt(eps)*(0.5e-3_sp)
+           ! treat case of 1x1 matrix for quick return
+           if( n==1 ) then
+              if( (irange==allrng).or.((irange==valrng).and.(d(1)>vl).and.(d(1)<=vu)).or.((&
+                        irange==indrng).and.(il==1).and.(iu==1)) ) then
+                 m = 1
+                 w(1) = d(1)
+                 ! the computation error of the eigenvalue is zero
+                 werr(1) = zero
+                 wgap(1) = zero
+                 iblock( 1 ) = 1
+                 indexw( 1 ) = 1
+                 gers(1) = d( 1 )
+                 gers(2) = d( 1 )
+              endif
+              ! store the shift for the initial rrr, which is zero in this case
+              e(1) = zero
+              return
+           end if
+           ! general case: tridiagonal matrix of order > 1
+           ! init werr, wgap. compute gerschgorin intervals and spectral diameter.
+           ! compute maximum off-diagonal entry and pivmin.
+           gl = d(1)
+           gu = d(1)
+           eold = zero
+           emax = zero
+           e(n) = zero
+           do i = 1,n
+              werr(i) = zero
+              wgap(i) = zero
+              eabs = abs( e(i) )
+              if( eabs >= emax ) then
+                 emax = eabs
+              end if
+              tmp1 = eabs + eold
+              gers( 2*i-1) = d(i) - tmp1
+              gl =  min( gl, gers( 2*i - 1))
+              gers( 2*i ) = d(i) + tmp1
+              gu = max( gu, gers(2*i) )
+              eold  = eabs
+           end do
+           ! the minimum pivot allowed in the sturm sequence for t
+           pivmin = safmin * max( one, emax**2 )
+           ! compute spectral diameter. the gerschgorin bounds give an
+           ! estimate that is wrong by at most a factor of sqrt(2)
+           spdiam = gu - gl
+           ! compute splitting points
+           call stdlib_slarra( n, d, e, e2, spltol, spdiam,nsplit, isplit, iinfo )
+           ! can force use of bisection instead of faster dqds.
+           ! option left in the code for future multisection work.
+           forceb = .false.
+           ! initialize usedqd, dqds should be used for allrng unless someone
+           ! explicitly wants bisection.
+           usedqd = (( irange==allrng ) .and. (.not.forceb))
+           if( (irange==allrng) .and. (.not. forceb) ) then
+              ! set interval [vl,vu] that contains all eigenvalues
+              vl = gl
+              vu = gu
+           else
+              ! we call stdlib_slarrd to find crude approximations to the eigenvalues
+              ! in the desired range. in case irange = indrng, we also obtain the
+              ! interval (vl,vu] that contains all the wanted eigenvalues.
+              ! an interval [left,right] has converged if
+              ! right-left<rtol*max(abs(left),abs(right))
+              ! stdlib_slarrd needs a work of size 4*n, iwork of size 3*n
+              call stdlib_slarrd( range, 'B', n, vl, vu, il, iu, gers,bsrtol, d, e, e2, pivmin, &
+                        nsplit, isplit,mm, w, werr, vl, vu, iblock, indexw,work, iwork, iinfo )
+              if( iinfo/=0 ) then
+                 info = -1
+                 return
+              endif
+              ! make sure that the entries m+1 to n in w, werr, iblock, indexw are 0
+              do i = mm+1,n
+                 w( i ) = zero
+                 werr( i ) = zero
+                 iblock( i ) = 0
+                 indexw( i ) = 0
+              end do
+           end if
+      ! **
+           ! loop over unreduced blocks
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, nsplit
+              iend = isplit( jblk )
+              in = iend - ibegin + 1
+              ! 1 x 1 block
+              if( in==1 ) then
+                 if( (irange==allrng).or.( (irange==valrng).and.( d( ibegin )>vl ).and.( d( &
+                           ibegin )<=vu ) ).or. ( (irange==indrng).and.(iblock(wbegin)==jblk))) then
+                    m = m + 1
+                    w( m ) = d( ibegin )
+                    werr(m) = zero
+                    ! the gap for a single block doesn't matter for the later
+                    ! algorithm and is assigned an arbitrary large value
+                    wgap(m) = zero
+                    iblock( m ) = jblk
+                    indexw( m ) = 1
+                    wbegin = wbegin + 1
+                 endif
+                 ! e( iend ) holds the shift for the initial rrr
+                 e( iend ) = zero
+                 ibegin = iend + 1
+                 cycle loop_170
+              end if
+              ! blocks of size larger than 1x1
+              ! e( iend ) will hold the shift for the initial rrr, for now set it =0
+              e( iend ) = zero
+              ! find local outer bounds gl,gu for the block
+              gl = d(ibegin)
+              gu = d(ibegin)
+              do i = ibegin , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              if(.not. ((irange==allrng).and.(.not.forceb)) ) then
+                 ! count the number of eigenvalues in the current block.
+                 mb = 0
+                 do i = wbegin,mm
+                    if( iblock(i)==jblk ) then
+                       mb = mb+1
+                    else
+                       goto 21
+                    endif
+                 end do
+                 21 continue
+                 if( mb==0) then
+                    ! no eigenvalue in the current block lies in the desired range
+                    ! e( iend ) holds the shift for the initial rrr
+                    e( iend ) = zero
+                    ibegin = iend + 1
+                    cycle loop_170
+                 else
+                    ! decide whether dqds or bisection is more efficient
+                    usedqd = ( (mb > fac*in) .and. (.not.forceb) )
+                    wend = wbegin + mb - 1
+                    ! calculate gaps for the current block
+                    ! in later stages, when representations for individual
+                    ! eigenvalues are different, we use sigma = e( iend ).
+                    sigma = zero
+                    do i = wbegin, wend - 1
+                       wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                    end do
+                    wgap( wend ) = max( zero,vu - sigma - (w( wend )+werr( wend )))
+                    ! find local index of the first and last desired evalue.
+                    indl = indexw(wbegin)
+                    indu = indexw( wend )
+                 endif
+              endif
+              if(( (irange==allrng) .and. (.not. forceb) ).or.usedqd) then
+                 ! case of dqds
+                 ! find approximations to the extremal eigenvalues of the block
+                 call stdlib_slarrk( in, 1, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1, &
+                           iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isleft = max(gl, tmp - tmp1- hndrd * eps* abs(tmp - tmp1))
+                 call stdlib_slarrk( in, in, gl, gu, d(ibegin),e2(ibegin), pivmin, rtl, tmp, tmp1,&
+                            iinfo )
+                 if( iinfo/=0 ) then
+                    info = -1
+                    return
+                 endif
+                 isrght = min(gu, tmp + tmp1+ hndrd * eps * abs(tmp + tmp1))
+                 ! improve the estimate of the spectral diameter
+                 spdiam = isrght - isleft
+              else
+                 ! case of bisection
+                 ! find approximations to the wanted extremal eigenvalues
+                 isleft = max(gl, w(wbegin) - werr(wbegin)- hndrd * eps*abs(w(wbegin)- werr(&
+                           wbegin) ))
+                 isrght = min(gu,w(wend) + werr(wend)+ hndrd * eps * abs(w(wend)+ werr(wend)))
+                           
+              endif
+              ! decide whether the base representation for the current block
+              ! l_jblk d_jblk l_jblk^t = t_jblk - sigma_jblk i
+              ! should be on the left or the right end of the current block.
+              ! the strategy is to shift to the end which is "more populated"
+              ! furthermore, decide whether to use dqds for the computation of
+              ! dqds is chosen if all eigenvalues are desired or the number of
+              ! eigenvalues to be computed is large compared to the blocksize.
+              if( ( irange==allrng ) .and. (.not.forceb) ) then
+                 ! if all the eigenvalues have to be computed, we use dqd
+                 usedqd = .true.
+                 ! indl is the local index of the first eigenvalue to compute
+                 indl = 1
+                 indu = in
+                 ! mb =  number of eigenvalues to compute
+                 mb = in
+                 wend = wbegin + mb - 1
+                 ! define 1/4 and 3/4 points of the spectrum
+                 s1 = isleft + fourth * spdiam
+                 s2 = isrght - fourth * spdiam
+              else
+                 ! stdlib_slarrd has computed iblock and indexw for each eigenvalue
+                 ! approximation.
+                 ! choose sigma
+                 if( usedqd ) then
+                    s1 = isleft + fourth * spdiam
+                    s2 = isrght - fourth * spdiam
+                 else
+                    tmp = min(isrght,vu) -  max(isleft,vl)
+                    s1 =  max(isleft,vl) + fourth * tmp
+                    s2 =  min(isrght,vu) - fourth * tmp
+                 endif
+              endif
+              ! compute the negcount at the 1/4 and 3/4 points
+              if(mb>1) then
+                 call stdlib_slarrc( 'T', in, s1, s2, d(ibegin),e(ibegin), pivmin, cnt, cnt1, &
+                           cnt2, iinfo)
+              endif
+              if(mb==1) then
+                 sigma = gl
+                 sgndef = one
+              elseif( cnt1 - indl >= indu - cnt2 ) then
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = max(isleft,gl)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get pos def matrix
+                    ! for dqds
+                    sigma = isleft
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = max(isleft,vl)
+                 endif
+                 sgndef = one
+              else
+                 if( ( irange==allrng ) .and. (.not.forceb) ) then
+                    sigma = min(isrght,gu)
+                 elseif( usedqd ) then
+                    ! use gerschgorin bound as shift to get neg def matrix
+                    ! for dqds
+                    sigma = isrght
+                 else
+                    ! use approximation of the first desired eigenvalue of the
+                    ! block as shift
+                    sigma = min(isrght,vu)
+                 endif
+                 sgndef = -one
+              endif
+              ! an initial sigma has been chosen that will be used for computing
+              ! t - sigma i = l d l^t
+              ! define the increment tau of the shift in case the initial shift
+              ! needs to be refined to obtain a factorization with not too much
+              ! element growth.
+              if( usedqd ) then
+                 ! the initial sigma was to the outer end of the spectrum
+                 ! the matrix is definite and we need not retreat.
+                 tau = spdiam*eps*n + two*pivmin
+                 tau = max( tau,two*eps*abs(sigma) )
+              else
+                 if(mb>1) then
+                    clwdth = w(wend) + werr(wend) - w(wbegin) - werr(wbegin)
+                    avgap = abs(clwdth / real(wend-wbegin,KIND=sp))
+                    if( sgndef==one ) then
+                       tau = half*max(wgap(wbegin),avgap)
+                       tau = max(tau,werr(wbegin))
+                    else
+                       tau = half*max(wgap(wend-1),avgap)
+                       tau = max(tau,werr(wend))
+                    endif
+                 else
+                    tau = werr(wbegin)
+                 endif
+              endif
+              loop_80: do idum = 1, maxtry
+                 ! compute l d l^t factorization of tridiagonal matrix t - sigma i.
+                 ! store d in work(1:in), l in work(in+1:2*in), and reciprocals of
+                 ! pivots in work(2*in+1:3*in)
+                 dpivot = d( ibegin ) - sigma
+                 work( 1 ) = dpivot
+                 dmax = abs( work(1) )
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*in+i ) = one / work( i )
+                    tmp = e( j )*work( 2*in+i )
+                    work( in+i ) = tmp
+                    dpivot = ( d( j+1 )-sigma ) - tmp*e( j )
+                    work( i+1 ) = dpivot
+                    dmax = max( dmax, abs(dpivot) )
+                    j = j + 1
+                 end do
+                 ! check for element growth
+                 if( dmax > maxgrowth*spdiam ) then
+                    norep = .true.
+                 else
+                    norep = .false.
+                 endif
+                 if( usedqd .and. .not.norep ) then
+                    ! ensure the definiteness of the representation
+                    ! all entries of d (of l d l^t) must have the same sign
+                    do i = 1, in
+                       tmp = sgndef*work( i )
+                       if( tmp<zero ) norep = .true.
+                    end do
+                 endif
+                 if(norep) then
+                    ! note that in the case of irange=allrng, we use the gerschgorin
+                    ! shift which makes the matrix definite. so we should end up
+                    ! here really only in the case of irange = valrng or indrng.
+                    if( idum==maxtry-1 ) then
+                       if( sgndef==one ) then
+                          ! the fudged gerschgorin shift should succeed
+                          sigma =gl - fudge*spdiam*eps*n - fudge*two*pivmin
+                       else
+                          sigma =gu + fudge*spdiam*eps*n + fudge*two*pivmin
+                       end if
+                    else
+                       sigma = sigma - sgndef * tau
+                       tau = two * tau
+                    end if
+                 else
+                    ! an initial rrr is found
+                    go to 83
+                 end if
+              end do loop_80
+              ! if the program reaches this point, no base representation could be
+              ! found in maxtry iterations.
+              info = 2
+              return
+              83 continue
+              ! at this point, we have found an initial base representation
+              ! t - sigma i = l d l^t with not too much element growth.
+              ! store the shift.
+              e( iend ) = sigma
+              ! store d and l.
+              call stdlib_scopy( in, work, 1, d( ibegin ), 1 )
+              call stdlib_scopy( in-1, work( in+1 ), 1, e( ibegin ), 1 )
+              if(mb>1 ) then
+                 ! perturb each entry of the base representation by a small
+                 ! (but random) relative amount to overcome difficulties with
+                 ! glued matrices.
+                 do i = 1, 4
+                    iseed( i ) = 1
+                 end do
+                 call stdlib_slarnv(2, iseed, 2*in-1, work(1))
+                 do i = 1,in-1
+                    d(ibegin+i-1) = d(ibegin+i-1)*(one+eps*pert*work(i))
+                    e(ibegin+i-1) = e(ibegin+i-1)*(one+eps*pert*work(in+i))
+                 end do
+                 d(iend) = d(iend)*(one+eps*four*work(in))
+              endif
+              ! don't update the gerschgorin intervals because keeping track
+              ! of the updates would be too much work in stdlib_slarrv.
+              ! we update w instead and use it to locate the proper gerschgorin
+              ! intervals.
+              ! compute the required eigenvalues of l d l' by bisection or dqds
+              if ( .not.usedqd ) then
+                 ! if stdlib_slarrd has been used, shift the eigenvalue approximations
+                 ! according to their representation. this is necessary for
+                 ! a uniform stdlib_slarrv since dqds computes eigenvalues of the
+                 ! shifted representation. in stdlib_slarrv, w will always hold the
+                 ! unshifted eigenvalue approximation.
+                 do j=wbegin,wend
+                    w(j) = w(j) - sigma
+                    werr(j) = werr(j) + abs(w(j)) * eps
+                 end do
+                 ! call stdlib_slarrb to reduce eigenvalue error of the approximations
+                 ! from stdlib_slarrd
+                 do i = ibegin, iend-1
+                    work( i ) = d( i ) * e( i )**2
+                 end do
+                 ! use bisection to find ev from indl to indu
+                 call stdlib_slarrb(in, d(ibegin), work(ibegin),indl, indu, rtol1, rtol2, indl-1,&
+                 w(wbegin), wgap(wbegin), werr(wbegin),work( 2*n+1 ), iwork, pivmin, spdiam,in, &
+                           iinfo )
+                 if( iinfo /= 0 ) then
+                    info = -4
+                    return
+                 end if
+                 ! stdlib_slarrb computes all gaps correctly except for the last one
+                 ! record distance to vu/gu
+                 wgap( wend ) = max( zero,( vu-sigma ) - ( w( wend ) + werr( wend ) ) )
+                 do i = indl, indu
+                    m = m + 1
+                    iblock(m) = jblk
+                    indexw(m) = i
+                 end do
+              else
+                 ! call dqds to get all eigs (and then possibly delete unwanted
+                 ! eigenvalues).
+                 ! note that dqds finds the eigenvalues of the l d l^t representation
+                 ! of t to high relative accuracy. high relative accuracy
+                 ! might be lost when the shift of the rrr is subtracted to obtain
+                 ! the eigenvalues of t. however, t is not guaranteed to define its
+                 ! eigenvalues to high relative accuracy anyway.
+                 ! set rtol to the order of the tolerance used in stdlib_slasq2
+                 ! this is an estimated error, the worst case bound is 4*n*eps
+                 ! which is usually too large and requires unnecessary work to be
+                 ! done by bisection when computing the eigenvectors
+                 rtol = log(real(in,KIND=sp)) * four * eps
+                 j = ibegin
+                 do i = 1, in - 1
+                    work( 2*i-1 ) = abs( d( j ) )
+                    work( 2*i ) = e( j )*e( j )*work( 2*i-1 )
+                    j = j + 1
+                 end do
+                 work( 2*in-1 ) = abs( d( iend ) )
+                 work( 2*in ) = zero
+                 call stdlib_slasq2( in, work, iinfo )
+                 if( iinfo /= 0 ) then
+                    ! if iinfo = -5 then an index is part of a tight cluster
+                    ! and should be changed. the index is in iwork(1) and the
+                    ! gap is in work(n+1)
+                    info = -5
+                    return
+                 else
+                    ! test that all eigenvalues are positive as expected
+                    do i = 1, in
+                       if( work( i )<zero ) then
+                          info = -6
+                          return
+                       endif
+                    end do
+                 end if
+                 if( sgndef>zero ) then
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = work( in-i+1 )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 else
+                    do i = indl, indu
+                       m = m + 1
+                       w( m ) = -work( i )
+                       iblock( m ) = jblk
+                       indexw( m ) = i
+                    end do
+                 end if
+                 do i = m - mb + 1, m
+                    ! the value of rtol below should be the tolerance in stdlib_slasq2
+                    werr( i ) = rtol * abs( w(i) )
+                 end do
+                 do i = m - mb + 1, m - 1
+                    ! compute the right gap between the intervals
+                    wgap( i ) = max( zero,w(i+1)-werr(i+1) - (w(i)+werr(i)) )
+                 end do
+                 wgap( m ) = max( zero,( vu-sigma ) - ( w( m ) + werr( m ) ) )
+              end if
+              ! proceed with next block
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_slarre
+
+     !> Using a divide and conquer approach, SLASD0: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M
+     !> matrix B with diagonal D and offdiagonal E, where M = N + SQRE.
+     !> The algorithm computes orthogonal matrices U and VT such that
+     !> B = U * S * VT. The singular values S are overwritten on D.
+     !> A related subroutine, SLASDA, computes only the singular values,
+     !> and optionally, the singular vectors in compact form.
+
+     pure subroutine stdlib_slasd0( n, sqre, d, e, u, ldu, vt, ldvt, smlsiz, iwork,work, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu, ldvt, n, smlsiz, sqre
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqc, im1, inode, itemp, iwk, j, lf, ll, lvl, m, ncc,&
+                      nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqrei
+           real(sp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           end if
+           m = n + sqre
+           if( ldu<n ) then
+              info = -6
+           else if( ldvt<m ) then
+              info = -8
+           else if( smlsiz<3 ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASD0', -info )
+              return
+           end if
+           ! if the input matrix is too small, call stdlib_slasdq to find the svd.
+           if( n<=smlsiz ) then
+              call stdlib_slasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldvt, u, ldu, u,ldu, work, &
+                        info )
+              return
+           end if
+           ! set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           call stdlib_slasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_slasdq.
+           ndb1 = ( nd+1 ) / 2
+           ncc = 0
+           loop_30: do i = ndb1, nd
+           ! ic : center row of each node
+           ! nl : number of rows of left  subproblem
+           ! nr : number of rows of right subproblem
+           ! nlf: starting row of the left   subproblem
+           ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nrp1 = nr + 1
+              nlf = ic - nl
+              nrf = ic + 1
+              sqrei = 1
+              call stdlib_slasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ), e( nlf ),vt( nlf, nlf )&
+                        , ldvt, u( nlf, nlf ), ldu,u( nlf, nlf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + nlf - 2
+              do j = 1, nl
+                 iwork( itemp+j ) = j
+              end do
+              if( i==nd ) then
+                 sqrei = sqre
+              else
+                 sqrei = 1
+              end if
+              nrp1 = nr + sqrei
+              call stdlib_slasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ), e( nrf ),vt( nrf, nrf )&
+                        , ldvt, u( nrf, nrf ), ldu,u( nrf, nrf ), ldu, work, info )
+              if( info/=0 ) then
+                 return
+              end if
+              itemp = idxq + ic
+              do j = 1, nr
+                 iwork( itemp+j-1 ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           loop_50: do lvl = nlvl, 1, -1
+              ! find the first node lf and last node ll on the
+              ! current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 if( ( sqre==0 ) .and. ( i==ll ) ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 idxqc = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 call stdlib_slasd1( nl, nr, sqrei, d( nlf ), alpha, beta,u( nlf, nlf ), ldu, vt( &
+                           nlf, nlf ), ldvt,iwork( idxqc ), iwork( iwk ), work, info )
+           ! report the possible convergence failure.
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do
+           end do loop_50
+           return
+     end subroutine stdlib_slasd0
+
+     !> Using a divide and conquer approach, SLASDA: computes the singular
+     !> value decomposition (SVD) of a real upper bidiagonal N-by-M matrix
+     !> B with diagonal D and offdiagonal E, where M = N + SQRE. The
+     !> algorithm computes the singular values in the SVD B = U * S * VT.
+     !> The orthogonal matrices U and VT are optionally computed in
+     !> compact form.
+     !> A related subroutine, SLASD0, computes the singular values and
+     !> the singular vectors in explicit form.
+
+     pure subroutine stdlib_slasda( icompq, smlsiz, n, sqre, d, e, u, ldu, vt, k,difl, difr, z, &
+               poles, givptr, givcol, ldgcol,perm, givnum, c, s, work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldgcol, ldu, n, smlsiz, sqre
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(ldgcol,*), givptr(*), iwork(*), k(*), perm(ldgcol,&
+                     *)
+           real(sp), intent(out) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), &
+                     s(*), u(ldu,*), vt(ldu,*), work(*), z(ldu,*)
+           real(sp), intent(inout) :: d(*), e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, idxq, idxqi, im1, inode, itemp, iwk, j, lf, ll, lvl, lvl2, &
+           m, ncc, nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, nru, nwork1, &
+                     nwork2, smlszp, sqrei, vf, vfi, vl, vli
+           real(sp) :: alpha, beta
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( ldu<( n+sqre ) ) then
+              info = -8
+           else if( ldgcol<n ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASDA', -info )
+              return
+           end if
+           m = n + sqre
+           ! if the input matrix is too small, call stdlib_slasdq to find the svd.
+           if( n<=smlsiz ) then
+              if( icompq==0 ) then
+                 call stdlib_slasdq( 'U', sqre, n, 0, 0, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              else
+                 call stdlib_slasdq( 'U', sqre, n, m, n, 0, d, e, vt, ldu, u, ldu,u, ldu, work, &
+                           info )
+              end if
+              return
+           end if
+           ! book-keeping and  set up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           idxq = ndimr + n
+           iwk = idxq + n
+           ncc = 0
+           nru = 0
+           smlszp = smlsiz + 1
+           vf = 1
+           vl = vf + m
+           nwork1 = vl + m
+           nwork2 = nwork1 + smlszp*smlszp
+           call stdlib_slasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! for the nodes on bottom level of the tree, solve
+           ! their subproblems by stdlib_slasdq.
+           ndb1 = ( nd+1 ) / 2
+           loop_30: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nlp1 = nl + 1
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              idxqi = idxq + nlf - 2
+              vfi = vf + nlf - 1
+              vli = vl + nlf - 1
+              sqrei = 1
+              if( icompq==0 ) then
+                 call stdlib_slaset( 'A', nlp1, nlp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_slasdq( 'U', sqrei, nl, nlp1, nru, ncc, d( nlf ),e( nlf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nl, work( nwork2 ), nl,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + nl*smlszp
+                 call stdlib_scopy( nlp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_scopy( nlp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_slaset( 'A', nl, nl, zero, one, u( nlf, 1 ), ldu )
+                 call stdlib_slaset( 'A', nlp1, nlp1, zero, one, vt( nlf, 1 ), ldu )
+                 call stdlib_slasdq( 'U', sqrei, nl, nlp1, nl, ncc, d( nlf ),e( nlf ), vt( nlf, 1 &
+                           ), ldu, u( nlf, 1 ), ldu,u( nlf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_scopy( nlp1, vt( nlf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_scopy( nlp1, vt( nlf, nlp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nl
+                 iwork( idxqi+j ) = j
+              end do
+              if( ( i==nd ) .and. ( sqre==0 ) ) then
+                 sqrei = 0
+              else
+                 sqrei = 1
+              end if
+              idxqi = idxqi + nlp1
+              vfi = vfi + nlp1
+              vli = vli + nlp1
+              nrp1 = nr + sqrei
+              if( icompq==0 ) then
+                 call stdlib_slaset( 'A', nrp1, nrp1, zero, one, work( nwork1 ),smlszp )
+                 call stdlib_slasdq( 'U', sqrei, nr, nrp1, nru, ncc, d( nrf ),e( nrf ), work( &
+                 nwork1 ), smlszp,work( nwork2 ), nr, work( nwork2 ), nr,work( nwork2 ), info )
+                           
+                 itemp = nwork1 + ( nrp1-1 )*smlszp
+                 call stdlib_scopy( nrp1, work( nwork1 ), 1, work( vfi ), 1 )
+                 call stdlib_scopy( nrp1, work( itemp ), 1, work( vli ), 1 )
+              else
+                 call stdlib_slaset( 'A', nr, nr, zero, one, u( nrf, 1 ), ldu )
+                 call stdlib_slaset( 'A', nrp1, nrp1, zero, one, vt( nrf, 1 ), ldu )
+                 call stdlib_slasdq( 'U', sqrei, nr, nrp1, nr, ncc, d( nrf ),e( nrf ), vt( nrf, 1 &
+                           ), ldu, u( nrf, 1 ), ldu,u( nrf, 1 ), ldu, work( nwork1 ), info )
+                 call stdlib_scopy( nrp1, vt( nrf, 1 ), 1, work( vfi ), 1 )
+                 call stdlib_scopy( nrp1, vt( nrf, nrp1 ), 1, work( vli ), 1 )
+              end if
+              if( info/=0 ) then
+                 return
+              end if
+              do j = 1, nr
+                 iwork( idxqi+j ) = j
+              end do
+           end do loop_30
+           ! now conquer each subproblem bottom-up.
+           j = 2**nlvl
+           loop_50: do lvl = nlvl, 1, -1
+              lvl2 = lvl*2 - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              loop_40: do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqrei = sqre
+                 else
+                    sqrei = 1
+                 end if
+                 vfi = vf + nlf - 1
+                 vli = vl + nlf - 1
+                 idxqi = idxq + nlf - 1
+                 alpha = d( ic )
+                 beta = e( ic )
+                 if( icompq==0 ) then
+                    call stdlib_slasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm, givptr( 1 ), givcol,ldgcol, givnum, ldu, &
+                    poles, difl, difr, z,k( 1 ), c( 1 ), s( 1 ), work( nwork1 ),iwork( iwk ), &
+                              info )
+                 else
+                    j = j - 1
+                    call stdlib_slasd6( icompq, nl, nr, sqrei, d( nlf ),work( vfi ), work( vli ), &
+                    alpha, beta,iwork( idxqi ), perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ),&
+                     ldgcol,givnum( nlf, lvl2 ), ldu,poles( nlf, lvl2 ), difl( nlf, lvl ),difr( &
+                     nlf, lvl2 ), z( nlf, lvl ), k( j ),c( j ), s( j ), work( nwork1 ),iwork( iwk &
+                               ), info )
+                 end if
+                 if( info/=0 ) then
+                    return
+                 end if
+              end do loop_40
+           end do loop_50
+           return
+     end subroutine stdlib_slasda
+
+     !> SLASDQ: computes the singular value decomposition (SVD) of a real
+     !> (upper or lower) bidiagonal matrix with diagonal D and offdiagonal
+     !> E, accumulating the transformations if desired. Letting B denote
+     !> the input bidiagonal matrix, the algorithm computes orthogonal
+     !> matrices Q and P such that B = Q * S * P**T (P**T denotes the transpose
+     !> of P). The singular values S are overwritten on D.
+     !> The input matrix U  is changed to U  * Q  if desired.
+     !> The input matrix VT is changed to P**T * VT if desired.
+     !> The input matrix C  is changed to Q**T * C  if desired.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3, for a detailed description of the algorithm.
+
+     pure subroutine stdlib_slasdq( uplo, sqre, n, ncvt, nru, ncc, d, e, vt, ldvt,u, ldu, c, ldc, &
+               work, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru, sqre
+           ! Array Arguments 
+           real(sp), intent(inout) :: c(ldc,*), d(*), e(*), u(ldu,*), vt(ldvt,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: rotate
+           integer(ilp) :: i, isub, iuplo, j, np1, sqre1
+           real(sp) :: cs, r, smin, sn
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           iuplo = 0
+           if( stdlib_lsame( uplo, 'U' ) )iuplo = 1
+           if( stdlib_lsame( uplo, 'L' ) )iuplo = 2
+           if( iuplo==0 ) then
+              info = -1
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncvt<0 ) then
+              info = -4
+           else if( nru<0 ) then
+              info = -5
+           else if( ncc<0 ) then
+              info = -6
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -10
+           else if( ldu<max( 1, nru ) ) then
+              info = -12
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SLASDQ', -info )
+              return
+           end if
+           if( n==0 )return
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           np1 = n + 1
+           sqre1 = sqre
+           ! if matrix non-square upper bidiagonal, rotate to be lower
+           ! bidiagonal.  the rotations are on the right.
+           if( ( iuplo==1 ) .and. ( sqre1==1 ) ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              call stdlib_slartg( d( n ), e( n ), cs, sn, r )
+              d( n ) = r
+              e( n ) = zero
+              if( rotate ) then
+                 work( n ) = cs
+                 work( n+n ) = sn
+              end if
+              iuplo = 2
+              sqre1 = 0
+              ! update singular vectors if desired.
+              if( ncvt>0 )call stdlib_slasr( 'L', 'V', 'F', np1, ncvt, work( 1 ),work( np1 ), vt, &
+                        ldvt )
+           end if
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left.
+           if( iuplo==2 ) then
+              do i = 1, n - 1
+                 call stdlib_slartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( rotate ) then
+                    work( i ) = cs
+                    work( n+i ) = sn
+                 end if
+              end do
+              ! if matrix (n+1)-by-n lower bidiagonal, one additional
+              ! rotation is needed.
+              if( sqre1==1 ) then
+                 call stdlib_slartg( d( n ), e( n ), cs, sn, r )
+                 d( n ) = r
+                 if( rotate ) then
+                    work( n ) = cs
+                    work( n+n ) = sn
+                 end if
+              end if
+              ! update singular vectors if desired.
+              if( nru>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_slasr( 'R', 'V', 'F', nru, n, work( 1 ),work( np1 ), u, ldu )
+                              
+                 else
+                    call stdlib_slasr( 'R', 'V', 'F', nru, np1, work( 1 ),work( np1 ), u, ldu )
+                              
+                 end if
+              end if
+              if( ncc>0 ) then
+                 if( sqre1==0 ) then
+                    call stdlib_slasr( 'L', 'V', 'F', n, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 else
+                    call stdlib_slasr( 'L', 'V', 'F', np1, ncc, work( 1 ),work( np1 ), c, ldc )
+                              
+                 end if
+              end if
+           end if
+           ! call stdlib_sbdsqr to compute the svd of the reduced real
+           ! n-by-n upper bidiagonal matrix.
+           call stdlib_sbdsqr( 'U', n, ncvt, nru, ncc, d, e, vt, ldvt, u, ldu, c,ldc, work, info )
+                     
+           ! sort the singular values into ascending order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n
+              ! scan for smallest d(i).
+              isub = i
+              smin = d( i )
+              do j = i + 1, n
+                 if( d( j )<smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=i ) then
+                 ! swap singular values and vectors.
+                 d( isub ) = d( i )
+                 d( i ) = smin
+                 if( ncvt>0 )call stdlib_sswap( ncvt, vt( isub, 1 ), ldvt, vt( i, 1 ), ldvt )
+                           
+                 if( nru>0 )call stdlib_sswap( nru, u( 1, isub ), 1, u( 1, i ), 1 )
+                 if( ncc>0 )call stdlib_sswap( ncc, c( isub, 1 ), ldc, c( i, 1 ), ldc )
+              end if
+           end do
+           return
+     end subroutine stdlib_slasdq
+
+     !> SLASQ1: computes the singular values of a real N-by-N bidiagonal
+     !> matrix with diagonal D and off-diagonal E. The singular values
+     !> are computed to high relative accuracy, in the absence of
+     !> denormalization, underflow and overflow. The algorithm was first
+     !> presented in
+     !> "Accurate singular values and differential qd algorithms" by K. V.
+     !> Fernando and B. N. Parlett, Numer. Math., Vol-67, No. 2, pp. 191-230,
+     !> 1994,
+     !> and the present implementation is described in "An implementation of
+     !> the dqds Algorithm (Positive Case)", LAPACK Working Note.
+
+     pure subroutine stdlib_slasq1( n, d, e, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo
+           real(sp) :: eps, scale, safmin, sigmn, sigmx
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SLASQ1', -info )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              d( 1 ) = abs( d( 1 ) )
+              return
+           else if( n==2 ) then
+              call stdlib_slas2( d( 1 ), e( 1 ), d( 2 ), sigmn, sigmx )
+              d( 1 ) = sigmx
+              d( 2 ) = sigmn
+              return
+           end if
+           ! estimate the largest singular value.
+           sigmx = zero
+           do i = 1, n - 1
+              d( i ) = abs( d( i ) )
+              sigmx = max( sigmx, abs( e( i ) ) )
+           end do
+           d( n ) = abs( d( n ) )
+           ! early return if sigmx is zero (matrix is already diagonal).
+           if( sigmx==zero ) then
+              call stdlib_slasrt( 'D', n, d, iinfo )
+              return
+           end if
+           do i = 1, n
+              sigmx = max( sigmx, d( i ) )
+           end do
+           ! copy d and e into work (in the z format) and scale (squaring the
+           ! input data makes scaling by a power of the radix pointless).
+           eps = stdlib_slamch( 'PRECISION' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           scale = sqrt( eps / safmin )
+           call stdlib_scopy( n, d, 1, work( 1 ), 2 )
+           call stdlib_scopy( n-1, e, 1, work( 2 ), 2 )
+           call stdlib_slascl( 'G', 0, 0, sigmx, scale, 2*n-1, 1, work, 2*n-1,iinfo )
+           ! compute the q's and e's.
+           do i = 1, 2*n - 1
+              work( i ) = work( i )**2
+           end do
+           work( 2*n ) = zero
+           call stdlib_slasq2( n, work, info )
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = sqrt( work( i ) )
+              end do
+              call stdlib_slascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+           else if( info==2 ) then
+           ! maximum number of iterations exceeded.  move data from work
+           ! into d and e so the calling subroutine can try to finish
+              do i = 1, n
+                 d( i ) = sqrt( work( 2*i-1 ) )
+                 e( i ) = sqrt( work( 2*i ) )
+              end do
+              call stdlib_slascl( 'G', 0, 0, scale, sigmx, n, 1, d, n, iinfo )
+              call stdlib_slascl( 'G', 0, 0, scale, sigmx, n, 1, e, n, iinfo )
+           end if
+           return
+     end subroutine stdlib_slasq1
+
+     !> SLASQ2: computes all the eigenvalues of the symmetric positive
+     !> definite tridiagonal matrix associated with the qd array Z to high
+     !> relative accuracy are computed to high relative accuracy, in the
+     !> absence of denormalization, underflow and overflow.
+     !> To see the relation of Z to the tridiagonal matrix, let L be a
+     !> unit lower bidiagonal matrix with subdiagonals Z(2,4,6,,..) and
+     !> let U be an upper bidiagonal matrix with 1's above and diagonal
+     !> Z(1,3,5,,..). The tridiagonal is L*U or, if you prefer, the
+     !> symmetric tridiagonal to which it is similar.
+     !> Note : SLASQ2 defines a logical variable, IEEE, which is true
+     !> on machines which follow ieee-754 floating-point standard in their
+     !> handling of infinities and NaNs, and false otherwise. This variable
+     !> is passed to SLASQ3.
+
+     pure subroutine stdlib_slasq2( n, z, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(sp), intent(inout) :: z(*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: cbias = 1.50_sp
+           real(sp), parameter :: hundrd = 100.0_sp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ieee
+           integer(ilp) :: i0, i4, iinfo, ipn4, iter, iwhila, iwhilb, k, kmin, n0, nbig, ndiv, &
+                     nfail, pp, splt, ttype, i1, n1
+           real(sp) :: d, dee, deemin, desig, dmin, dmin1, dmin2, dn, dn1, dn2, e, emax, emin, &
+           eps, g, oldemn, qmax, qmin, s, safmin, sigma, t, tau, temp, tol, tol2, trace, zmax, &
+                     tempe, tempq
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input arguments.
+           ! (in case stdlib_slasq2 is not called by stdlib_slasq1)
+           info = 0
+           eps = stdlib_slamch( 'PRECISION' )
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           tol = eps*hundrd
+           tol2 = tol**2
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'SLASQ2', 1 )
+              return
+           else if( n==0 ) then
+              return
+           else if( n==1 ) then
+              ! 1-by-1 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'SLASQ2', 2 )
+              end if
+              return
+           else if( n==2 ) then
+              ! 2-by-2 case.
+              if( z( 1 )<zero ) then
+                 info = -201
+                 call stdlib_xerbla( 'SLASQ2', 2 )
+                 return
+              else if( z( 2 )<zero ) then
+                 info = -202
+                 call stdlib_xerbla( 'SLASQ2', 2 )
+                 return
+              else if( z( 3 )<zero ) then
+                info = -203
+                call stdlib_xerbla( 'SLASQ2', 2 )
+                return
+              else if( z( 3 )>z( 1 ) ) then
+                 d = z( 3 )
+                 z( 3 ) = z( 1 )
+                 z( 1 ) = d
+              end if
+              z( 5 ) = z( 1 ) + z( 2 ) + z( 3 )
+              if( z( 2 )>z( 3 )*tol2 ) then
+                 t = half*( ( z( 1 )-z( 3 ) )+z( 2 ) )
+                 s = z( 3 )*( z( 2 ) / t )
+                 if( s<=t ) then
+                    s = z( 3 )*( z( 2 ) / ( t*( one+sqrt( one+s / t ) ) ) )
+                 else
+                    s = z( 3 )*( z( 2 ) / ( t+sqrt( t )*sqrt( t+s ) ) )
+                 end if
+                 t = z( 1 ) + ( s+z( 2 ) )
+                 z( 3 ) = z( 3 )*( z( 1 ) / t )
+                 z( 1 ) = t
+              end if
+              z( 2 ) = z( 3 )
+              z( 6 ) = z( 2 ) + z( 1 )
+              return
+           end if
+           ! check for negative data and compute sums of q's and e's.
+           z( 2*n ) = zero
+           emin = z( 2 )
+           qmax = zero
+           zmax = zero
+           d = zero
+           e = zero
+           do k = 1, 2*( n-1 ), 2
+              if( z( k )<zero ) then
+                 info = -( 200+k )
+                 call stdlib_xerbla( 'SLASQ2', 2 )
+                 return
+              else if( z( k+1 )<zero ) then
+                 info = -( 200+k+1 )
+                 call stdlib_xerbla( 'SLASQ2', 2 )
+                 return
+              end if
+              d = d + z( k )
+              e = e + z( k+1 )
+              qmax = max( qmax, z( k ) )
+              emin = min( emin, z( k+1 ) )
+              zmax = max( qmax, zmax, z( k+1 ) )
+           end do
+           if( z( 2*n-1 )<zero ) then
+              info = -( 200+2*n-1 )
+              call stdlib_xerbla( 'SLASQ2', 2 )
+              return
+           end if
+           d = d + z( 2*n-1 )
+           qmax = max( qmax, z( 2*n-1 ) )
+           zmax = max( qmax, zmax )
+           ! check for diagonality.
+           if( e==zero ) then
+              do k = 2, n
+                 z( k ) = z( 2*k-1 )
+              end do
+              call stdlib_slasrt( 'D', n, z, iinfo )
+              z( 2*n-1 ) = d
+              return
+           end if
+           trace = d + e
+           ! check for zero data.
+           if( trace==zero ) then
+              z( 2*n-1 ) = zero
+              return
+           end if
+           ! check whether the machine is ieee conformable.
+           ! ieee = ( stdlib_ilaenv( 10, 'slasq2', 'n', 1, 2, 3, 4 )==1 )
+           ! [11/15/2008] the case ieee=.true. has a problem in single precision with
+           ! some the test matrices of type 16. the double precision code is fine.
+           ieee = .false.
+           ! rearrange data for locality: z=(q1,qq1,e1,ee1,q2,qq2,e2,ee2,...).
+           do k = 2*n, 2, -2
+              z( 2*k ) = zero
+              z( 2*k-1 ) = z( k )
+              z( 2*k-2 ) = zero
+              z( 2*k-3 ) = z( k-1 )
+           end do
+           i0 = 1
+           n0 = n
+           ! reverse the qd-array, if warranted.
+           if( cbias*z( 4*i0-3 )<z( 4*n0-3 ) ) then
+              ipn4 = 4*( i0+n0 )
+              do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                 temp = z( i4-3 )
+                 z( i4-3 ) = z( ipn4-i4-3 )
+                 z( ipn4-i4-3 ) = temp
+                 temp = z( i4-1 )
+                 z( i4-1 ) = z( ipn4-i4-5 )
+                 z( ipn4-i4-5 ) = temp
+              end do
+           end if
+           ! initial split checking via dqd and li's test.
+           pp = 0
+           loop_80: do k = 1, 2
+              d = z( 4*n0+pp-3 )
+              do i4 = 4*( n0-1 ) + pp, 4*i0 + pp, -4
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    d = z( i4-3 )
+                 else
+                    d = z( i4-3 )*( d / ( d+z( i4-1 ) ) )
+                 end if
+              end do
+              ! dqd maps z to zz plus li's test.
+              emin = z( 4*i0+pp+1 )
+              d = z( 4*i0+pp-3 )
+              do i4 = 4*i0 + pp, 4*( n0-1 ) + pp, 4
+                 z( i4-2*pp-2 ) = d + z( i4-1 )
+                 if( z( i4-1 )<=tol2*d ) then
+                    z( i4-1 ) = -zero
+                    z( i4-2*pp-2 ) = d
+                    z( i4-2*pp ) = zero
+                    d = z( i4+1 )
+                 else if( safmin*z( i4+1 )<z( i4-2*pp-2 ) .and.safmin*z( i4-2*pp-2 )<z( i4+1 ) ) &
+                           then
+                    temp = z( i4+1 ) / z( i4-2*pp-2 )
+                    z( i4-2*pp ) = z( i4-1 )*temp
+                    d = d*temp
+                 else
+                    z( i4-2*pp ) = z( i4+1 )*( z( i4-1 ) / z( i4-2*pp-2 ) )
+                    d = z( i4+1 )*( d / z( i4-2*pp-2 ) )
+                 end if
+                 emin = min( emin, z( i4-2*pp ) )
+              end do
+              z( 4*n0-pp-2 ) = d
+              ! now find qmax.
+              qmax = z( 4*i0-pp-2 )
+              do i4 = 4*i0 - pp + 2, 4*n0 - pp - 2, 4
+                 qmax = max( qmax, z( i4 ) )
+              end do
+              ! prepare for the next iteration on k.
+              pp = 1 - pp
+           end do loop_80
+           ! initialise variables to pass to stdlib_slasq3.
+           ttype = 0
+           dmin1 = zero
+           dmin2 = zero
+           dn    = zero
+           dn1   = zero
+           dn2   = zero
+           g     = zero
+           tau   = zero
+           iter = 2
+           nfail = 0
+           ndiv = 2*( n0-i0 )
+           loop_160: do iwhila = 1, n + 1
+              if( n0<1 )go to 170
+              ! while array unfinished do
+              ! e(n0) holds the value of sigma when submatrix in i0:n0
+              ! splits from the rest of the array, but is negated.
+              desig = zero
+              if( n0==n ) then
+                 sigma = zero
+              else
+                 sigma = -z( 4*n0-1 )
+              end if
+              if( sigma<zero ) then
+                 info = 1
+                 return
+              end if
+              ! find last unreduced submatrix's top index i0, find qmax and
+              ! emin. find gershgorin-type bound if q's much greater than e's.
+              emax = zero
+              if( n0>i0 ) then
+                 emin = abs( z( 4*n0-5 ) )
+              else
+                 emin = zero
+              end if
+              qmin = z( 4*n0-3 )
+              qmax = qmin
+              do i4 = 4*n0, 8, -4
+                 if( z( i4-5 )<=zero )go to 100
+                 if( qmin>=four*emax ) then
+                    qmin = min( qmin, z( i4-3 ) )
+                    emax = max( emax, z( i4-5 ) )
+                 end if
+                 qmax = max( qmax, z( i4-7 )+z( i4-5 ) )
+                 emin = min( emin, z( i4-5 ) )
+              end do
+              i4 = 4
+              100 continue
+              i0 = i4 / 4
+              pp = 0
+              if( n0-i0>1 ) then
+                 dee = z( 4*i0-3 )
+                 deemin = dee
+                 kmin = i0
+                 do i4 = 4*i0+1, 4*n0-3, 4
+                    dee = z( i4 )*( dee /( dee+z( i4-2 ) ) )
+                    if( dee<=deemin ) then
+                       deemin = dee
+                       kmin = ( i4+3 )/4
+                    end if
+                 end do
+                 if( (kmin-i0)*2<n0-kmin .and.deemin<=half*z(4*n0-3) ) then
+                    ipn4 = 4*( i0+n0 )
+                    pp = 2
+                    do i4 = 4*i0, 2*( i0+n0-1 ), 4
+                       temp = z( i4-3 )
+                       z( i4-3 ) = z( ipn4-i4-3 )
+                       z( ipn4-i4-3 ) = temp
+                       temp = z( i4-2 )
+                       z( i4-2 ) = z( ipn4-i4-2 )
+                       z( ipn4-i4-2 ) = temp
+                       temp = z( i4-1 )
+                       z( i4-1 ) = z( ipn4-i4-5 )
+                       z( ipn4-i4-5 ) = temp
+                       temp = z( i4 )
+                       z( i4 ) = z( ipn4-i4-4 )
+                       z( ipn4-i4-4 ) = temp
+                    end do
+                 end if
+              end if
+              ! put -(initial shift) into dmin.
+              dmin = -max( zero, qmin-two*sqrt( qmin )*sqrt( emax ) )
+              ! now i0:n0 is unreduced.
+              ! pp = 0 for ping, pp = 1 for pong.
+              ! pp = 2 indicates that flipping was applied to the z array and
+                     ! and that the tests for deflation upon entry in stdlib_slasq3
+                     ! should not be performed.
+              nbig = 100*( n0-i0+1 )
+              loop_140: do iwhilb = 1, nbig
+                 if( i0>n0 )go to 150
+                 ! while submatrix unfinished take a good dqds step.
+                 call stdlib_slasq3( i0, n0, z, pp, dmin, sigma, desig, qmax, nfail,iter, ndiv, &
+                           ieee, ttype, dmin1, dmin2, dn, dn1,dn2, g, tau )
+                 pp = 1 - pp
+                 ! when emin is very small check for splits.
+                 if( pp==0 .and. n0-i0>=3 ) then
+                    if( z( 4*n0 )<=tol2*qmax .or.z( 4*n0-1 )<=tol2*sigma ) then
+                       splt = i0 - 1
+                       qmax = z( 4*i0-3 )
+                       emin = z( 4*i0-1 )
+                       oldemn = z( 4*i0 )
+                       do i4 = 4*i0, 4*( n0-3 ), 4
+                          if( z( i4 )<=tol2*z( i4-3 ) .or.z( i4-1 )<=tol2*sigma ) then
+                             z( i4-1 ) = -sigma
+                             splt = i4 / 4
+                             qmax = zero
+                             emin = z( i4+3 )
+                             oldemn = z( i4+4 )
+                          else
+                             qmax = max( qmax, z( i4+1 ) )
+                             emin = min( emin, z( i4-1 ) )
+                             oldemn = min( oldemn, z( i4 ) )
+                          end if
+                       end do
+                       z( 4*n0-1 ) = emin
+                       z( 4*n0 ) = oldemn
+                       i0 = splt + 1
+                    end if
+                 end if
+              end do loop_140
+              info = 2
+              ! maximum number of iterations exceeded, restore the shift
+              ! sigma and place the new d's and e's in a qd array.
+              ! this might need to be done for several blocks
+              i1 = i0
+              n1 = n0
+              145 continue
+              tempq = z( 4*i0-3 )
+              z( 4*i0-3 ) = z( 4*i0-3 ) + sigma
+              do k = i0+1, n0
+                 tempe = z( 4*k-5 )
+                 z( 4*k-5 ) = z( 4*k-5 ) * (tempq / z( 4*k-7 ))
+                 tempq = z( 4*k-3 )
+                 z( 4*k-3 ) = z( 4*k-3 ) + sigma + tempe - z( 4*k-5 )
+              end do
+              ! prepare to do this on the previous block if there is one
+              if( i1>1 ) then
+                 n1 = i1-1
+                 do while( ( i1>=2 ) .and. ( z(4*i1-5)>=zero ) )
+                    i1 = i1 - 1
+                 end do
+                 if( i1>=1 ) then
+                    sigma = -z(4*n1-1)
+                    go to 145
+                 end if
+              end if
+              do k = 1, n
+                 z( 2*k-1 ) = z( 4*k-3 )
+              ! only the block 1..n0 is unfinished.  the rest of the e's
+              ! must be essentially zero, although sometimes other data
+              ! has been stored in them.
+                 if( k<n0 ) then
+                    z( 2*k ) = z( 4*k-1 )
+                 else
+                    z( 2*k ) = 0
+                 end if
+              end do
+              return
+              ! end iwhilb
+              150 continue
+           end do loop_160
+           info = 3
+           return
+           ! end iwhila
+           170 continue
+           ! move q's to the front.
+           do k = 2, n
+              z( k ) = z( 4*k-3 )
+           end do
+           ! sort and compute sum of eigenvalues.
+           call stdlib_slasrt( 'D', n, z, iinfo )
+           e = zero
+           do k = n, 1, -1
+              e = e + z( k )
+           end do
+           ! store trace, sum(eigenvalues) and information on performance.
+           z( 2*n+1 ) = trace
+           z( 2*n+2 ) = e
+           z( 2*n+3 ) = real( iter,KIND=sp)
+           z( 2*n+4 ) = real( ndiv,KIND=sp) / real( n**2,KIND=sp)
+           z( 2*n+5 ) = hundrd*nfail / real( iter,KIND=sp)
+           return
+     end subroutine stdlib_slasq2
+
+     !> DLATRF_AA factorizes a panel of a real symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_slasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), h(ldh,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           real(sp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_ssytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_sgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_scopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_saxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_saxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_isamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_sswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_sswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_sswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_sswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_scopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=zero ) then
+                       alpha = one / a( k, j+1 )
+                       call stdlib_scopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_sscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_slaset( 'FULL', 1, m-j-1, zero, zero,a( k, j+2 ), lda)
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_ssytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_sgemv( 'NO TRANSPOSE', mj, j-k1,-one, h( j, k1 ), ldh,a( j, 1 ), lda,&
+                           one, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_scopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_saxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_saxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_isamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_sswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_sswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_sswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_sswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_scopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=zero ) then
+                       alpha = one / a( j+1, k )
+                       call stdlib_scopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_sscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_slaset( 'FULL', m-j-1, 1, zero, zero,a( j+2, k ), lda )
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_slasyf_aa
+
+     !> SPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using SPTTRF, and then calling SBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band symmetric positive definite matrix
+     !> can also be found if SSYTRD, SSPTRD, or SSBTRD has been used to
+     !> reduce this matrix to tridiagonal form. (The reduction to tridiagonal
+     !> form, however, may preclude the possibility of obtaining high
+     !> relative accuracy in the small eigenvalues of the original matrix, if
+     !> these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_spteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(sp), intent(inout) :: d(*), e(*), z(ldz,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Arrays 
+           real(sp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = one
+              return
+           end if
+           if( icompz==2 )call stdlib_slaset( 'FULL', n, n, zero, one, z, ldz )
+           ! call stdlib_spttrf to factor the matrix.
+           call stdlib_spttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_sbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_sbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_spteqr
+
+     !> SSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> SSTEGR is a compatibility wrapper around the improved SSTEMR routine.
+     !> See SSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : SSTEGR and SSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_sstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*)
+           real(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_sstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_sstegr
+
+     !> SSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.SSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+
+     pure subroutine stdlib_sstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*)
+           real(sp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(sp), parameter :: minrgp = 3.0e-3_sp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(sp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_sstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_slarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_slarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_slarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_slarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_slae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_slaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+           ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_slarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_slanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_sscal( n, scale, d, 1 )
+                 call stdlib_sscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_slarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_slarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_scopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_slarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_slarre computes the eigenvalues to less than full precision.
+                 ! stdlib_slarrv will refine the eigenvalue approximations, and we can
+                 ! need less accurate initial bisection in stdlib_slarre.
+                 ! note: these settings do only affect the subset case and stdlib_slarre
+                 rtol1 = max( sqrt(eps)*5.0e-2_sp, four * eps )
+                 rtol2 = max( sqrt(eps)*5.0e-3_sp, four * eps )
+              endif
+              call stdlib_slarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_slarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_slarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_slarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_slarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_slarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_slarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_sscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_slasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sstemr
+
+     !> SSTEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either a range of values
+     !> or a range of indices for the desired eigenvalues.
+     !> Whenever possible, SSTEVR calls SSTEMR to compute the
+     !> eigenspectrum using Relatively Robust Representations.  SSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows. For the i-th
+     !> unreduced block of T,
+     !> (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T
+     !> is a relatively robust representation,
+     !> (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high
+     !> relative accuracy by the dqds algorithm,
+     !> (c) If there is a cluster of close eigenvalues, "choose" sigma_i
+     !> close to the cluster, and go to step (a),
+     !> (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T,
+     !> compute the corresponding eigenvector by forming a
+     !> rank-revealing twisted factorization.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon,
+     !> Computer Science Division Technical Report No. UCB//CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : SSTEVR calls SSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> SSTEVR calls SSTEBZ and SSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of SSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     pure subroutine stdlib_sstevr( jobz, range, n, d, e, vl, vu, il, iu, abstol,m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: d(*), e(*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, lquery, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, imax, indibl, indifl, indisp, indiwo, iscale, j, jj, liwmin,&
+                      lwmin, nsplit
+           real(sp) :: bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, tnrm, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'SSTEVR', 'N', 1, 2, 3, 4 )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 20*n )
+           liwmin = max(1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -14
+              end if
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -17
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -19
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSTEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( vl<d( 1 ) .and. vu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           tnrm = stdlib_slanst( 'M', n, d, e )
+           if( tnrm>zero .and. tnrm<rmin ) then
+              iscale = 1
+              sigma = rmin / tnrm
+           else if( tnrm>rmax ) then
+              iscale = 1
+              sigma = rmax / tnrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_sscal( n, sigma, d, 1 )
+              call stdlib_sscal( n-1, sigma, e( 1 ), 1 )
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: these indices are used only
+           ! if stdlib_ssterf or stdlib_sstemr fail.
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_sstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_sstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_sstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indisp + n
+           ! if all eigenvalues are desired, then
+           ! call stdlib_ssterf or stdlib_sstemr.  if this fails for some eigenvalue, then
+           ! try stdlib_sstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ieeeok==1 ) then
+              call stdlib_scopy( n-1, e( 1 ), 1, work( 1 ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_scopy( n, d, 1, w, 1 )
+                 call stdlib_ssterf( n, w, work, info )
+              else
+                 call stdlib_scopy( n, d, 1, work( n+1 ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_sstemr( jobz, 'A', n, work( n+1 ), work, vl, vu, il,iu, m, w, z, ldz,&
+                            n, isuppz, tryrac,work( 2*n+1 ), lwork-2*n, iwork, liwork, info )
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 10
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstol, d, e, m,nsplit, w, &
+                     iwork( indibl ), iwork( indisp ), work,iwork( indiwo ), info )
+           if( wantz ) then
+              call stdlib_sstein( n, d, e, m, w, iwork( indibl ), iwork( indisp ),z, ldz, work, &
+                        iwork( indiwo ), iwork( indifl ),info )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           10 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    w( i ) = w( j )
+                    w( j ) = tmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+            ! causes problems with tests 19
+            ! if (wantz .and. indeig ) z( 1,1) = z(1,1) / 1.002_sp + .002
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_sstevr
+
+     !> SSYEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric matrix A.  Eigenvalues and eigenvectors can be
+     !> selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> SSYEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to SSYTRD.  Then, whenever possible, SSYEVR calls SSTEMR to compute
+     !> the eigenspectrum using Relatively Robust Representations.  SSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see SSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : SSYEVR calls SSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> SSYEVR calls SSTEBZ and SSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of SSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_ssyevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(sp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: w(*), work(*), z(ldz,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indd, inddd, inde, indee, indibl, indifl, &
+           indisp, indiwo, indtau, indwk, indwkn, iscale, j, jj, liwmin, llwork, llwrkn, lwkopt, &
+                     lwmin, nb, nsplit
+           real(sp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'SSYEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( liwork==-1 ) )
+           lwmin = max( 1, 26*n )
+           liwmin = max( 1, 10*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'SSYTRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'SORMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 26
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = a( 1, 1 )
+              else
+                 if( vl<a( 1, 1 ) .and. vu>=a( 1, 1 ) ) then
+                    m = 1
+                    w( 1 ) = a( 1, 1 )
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_slamch( 'SAFE MINIMUM' )
+           eps = stdlib_slamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_slansy( 'M', uplo, n, a, lda, work )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_sscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_sscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_ssterf or stdlib_sstemr fail.
+           ! work(indtau:indtau+n-1) stores the scalar factors of the
+           ! elementary reflectors used in stdlib_ssytrd.
+           indtau = 1
+           ! work(indd:indd+n-1) stores the tridiagonal's diagonal entries.
+           indd = indtau + n
+           ! work(inde:inde+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_ssytrd.
+           inde = indd + n
+           ! work(inddd:inddd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_sstemr (the stdlib_ssterf path copies the diagonal to w).
+           inddd = inde + n
+           ! work(indee:indee+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_ssterf and stdlib_sstemr.
+           indee = inddd + n
+           ! indwk is the starting offset of the left-over workspace, and
+           ! llwork is the remaining workspace size.
+           indwk = indee + n
+           llwork = lwork - indwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_sstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_sstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_sstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_ssytrd to reduce symmetric matrix to tridiagonal form.
+           call stdlib_ssytrd( uplo, n, a, lda, work( indd ), work( inde ),work( indtau ), work( &
+                     indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_ssterf or stdlib_sstemr and stdlib_sormtr.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig.or.test ) .and. ( ieeeok==1 ) ) then
+              if( .not.wantz ) then
+                 call stdlib_scopy( n, work( indd ), 1, w, 1 )
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_ssterf( n, w, work( indee ), info )
+              else
+                 call stdlib_scopy( n-1, work( inde ), 1, work( indee ), 1 )
+                 call stdlib_scopy( n, work( indd ), 1, work( inddd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_sstemr( jobz, 'A', n, work( inddd ), work( indee ),vl, vu, il, iu, m,&
+                            w, z, ldz, n, isuppz,tryrac, work( indwk ), lwork, iwork, liwork,info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = inde
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_sormtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 ! everything worked.  skip stdlib_sstebz/stdlib_sstein.  iwork(:) are
+                 ! undefined.
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_sstebz and, if eigenvectors are desired, stdlib_sstein.
+           ! also call stdlib_sstebz and stdlib_sstein if stdlib_sstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_sstebz( range, order, n, vll, vuu, il, iu, abstll,work( indd ), work( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), work( indwk ),iwork( indiwo ), info )
+                     
+           if( wantz ) then
+              call stdlib_sstein( n, work( indd ), work( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,work( indwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply orthogonal matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_sstein.
+              indwkn = inde
+              llwrkn = lwork - indwkn + 1
+              call stdlib_sormtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+        ! jump here if stdlib_sstemr/stdlib_sstein succeeded.
+        30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_sscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.  note: we do not sort the ifail portion of iwork.
+           ! it may not be initialized (if stdlib_sstemr/stdlib_sstein succeeded), and we do
+           ! not return this detailed information to the user.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    w( i ) = w( j )
+                    w( j ) = tmp1
+                    call stdlib_sswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_ssyevr
+
+     !> SSYSV computes the solution to a real system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_ssysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*), b(ldb,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_ssytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_ssytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYSV_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_ssytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_ssytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssysv_aa
+
+     !> SSYTRF_AA: computes the factorization of a real symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_ssytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(sp), intent(inout) :: a(lda,*)
+           real(sp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           real(sp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'SSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'SSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_scopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_slasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_slasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_sswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = one
+                    call stdlib_scopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_sscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_sgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_sgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j1-k2, j3 ), 1,one, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_sgemm
+                       call stdlib_sgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-one, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,one, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_scopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_scopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_slasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_slasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_sswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = one
+                    call stdlib_scopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_sscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_sgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_sgemv( 'NO TRANSPOSE', mj, jb+1,-one, work( j3-j1+1+k1*n ), &
+                                    n,a( j3, j1-k2 ), lda,one, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_sgemm
+                       call stdlib_sgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-one, work(&
+                                  j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,one, a( j3, j2 ), lda )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_scopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ssytrf_aa
+
+
+
+end module stdlib_linalg_lapack_s
diff --git a/src/stdlib_linalg_lapack_w.fypp b/src/stdlib_linalg_lapack_w.fypp
new file mode 100644
index 000000000..a685fd87f
--- /dev/null
+++ b/src/stdlib_linalg_lapack_w.fypp
@@ -0,0 +1,81218 @@
+#:include "common.fypp" 
+#:if WITH_QP 
+module stdlib_linalg_lapack_w
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     use stdlib_linalg_lapack_c
+     use stdlib_linalg_lapack_d
+     use stdlib_linalg_lapack_z
+     use stdlib_linalg_lapack_q
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+     public :: stdlib_wlag2w
+     public :: stdlib_wbbcsd
+     public :: stdlib_wbdsqr
+     public :: stdlib_wcgesv
+     public :: stdlib_wcposv
+     public :: stdlib_wdrscl
+     public :: stdlib_wgbbrd
+     public :: stdlib_wgbcon
+     public :: stdlib_wgbequ
+     public :: stdlib_wgbequb
+     public :: stdlib_wgbrfs
+     public :: stdlib_wgbsv
+     public :: stdlib_wgbsvx
+     public :: stdlib_wgbtf2
+     public :: stdlib_wgbtrf
+     public :: stdlib_wgbtrs
+     public :: stdlib_wgebak
+     public :: stdlib_wgebal
+     public :: stdlib_wgebd2
+     public :: stdlib_wgebrd
+     public :: stdlib_wgecon
+     public :: stdlib_wgeequ
+     public :: stdlib_wgeequb
+     public :: stdlib_wgees
+     public :: stdlib_wgeesx
+     public :: stdlib_wgeev
+     public :: stdlib_wgeevx
+     public :: stdlib_wgehd2
+     public :: stdlib_wgehrd
+     public :: stdlib_wgejsv
+     public :: stdlib_wgelq
+     public :: stdlib_wgelq2
+     public :: stdlib_wgelqf
+     public :: stdlib_wgelqt
+     public :: stdlib_wgelqt3
+     public :: stdlib_wgels
+     public :: stdlib_wgelsd
+     public :: stdlib_wgelss
+     public :: stdlib_wgelsy
+     public :: stdlib_wgemlq
+     public :: stdlib_wgemlqt
+     public :: stdlib_wgemqr
+     public :: stdlib_wgemqrt
+     public :: stdlib_wgeql2
+     public :: stdlib_wgeqlf
+     public :: stdlib_wgeqp3
+     public :: stdlib_wgeqr
+     public :: stdlib_wgeqr2
+     public :: stdlib_wgeqr2p
+     public :: stdlib_wgeqrf
+     public :: stdlib_wgeqrfp
+     public :: stdlib_wgeqrt
+     public :: stdlib_wgeqrt2
+     public :: stdlib_wgeqrt3
+     public :: stdlib_wgerfs
+     public :: stdlib_wgerq2
+     public :: stdlib_wgerqf
+     public :: stdlib_wgesc2
+     public :: stdlib_wgesdd
+     public :: stdlib_wgesv
+     public :: stdlib_wgesvd
+     public :: stdlib_wgesvdq
+     public :: stdlib_wgesvj
+     public :: stdlib_wgesvx
+     public :: stdlib_wgetc2
+     public :: stdlib_wgetf2
+     public :: stdlib_wgetrf
+     public :: stdlib_wgetrf2
+     public :: stdlib_wgetri
+     public :: stdlib_wgetrs
+     public :: stdlib_wgetsls
+     public :: stdlib_wgetsqrhrt
+     public :: stdlib_wggbak
+     public :: stdlib_wggbal
+     public :: stdlib_wgges
+     public :: stdlib_wgges3
+     public :: stdlib_wggesx
+     public :: stdlib_wggev
+     public :: stdlib_wggev3
+     public :: stdlib_wggevx
+     public :: stdlib_wggglm
+     public :: stdlib_wgghd3
+     public :: stdlib_wgghrd
+     public :: stdlib_wgglse
+     public :: stdlib_wggqrf
+     public :: stdlib_wggrqf
+     public :: stdlib_wgsvj0
+     public :: stdlib_wgsvj1
+     public :: stdlib_wgtcon
+     public :: stdlib_wgtrfs
+     public :: stdlib_wgtsv
+     public :: stdlib_wgtsvx
+     public :: stdlib_wgttrf
+     public :: stdlib_wgttrs
+     public :: stdlib_wgtts2
+     public :: stdlib_whb2st_kernels
+     public :: stdlib_whbev
+     public :: stdlib_whbevd
+     public :: stdlib_whbevx
+     public :: stdlib_whbgst
+     public :: stdlib_whbgv
+     public :: stdlib_whbgvd
+     public :: stdlib_whbgvx
+     public :: stdlib_whbtrd
+     public :: stdlib_whecon
+     public :: stdlib_whecon_rook
+     public :: stdlib_wheequb
+     public :: stdlib_wheev
+     public :: stdlib_wheevd
+     public :: stdlib_wheevr
+     public :: stdlib_wheevx
+     public :: stdlib_whegs2
+     public :: stdlib_whegst
+     public :: stdlib_whegv
+     public :: stdlib_whegvd
+     public :: stdlib_whegvx
+     public :: stdlib_wherfs
+     public :: stdlib_whesv
+     public :: stdlib_whesv_aa
+     public :: stdlib_whesv_rk
+     public :: stdlib_whesv_rook
+     public :: stdlib_whesvx
+     public :: stdlib_wheswapr
+     public :: stdlib_whetd2
+     public :: stdlib_whetf2
+     public :: stdlib_whetf2_rk
+     public :: stdlib_whetf2_rook
+     public :: stdlib_whetrd
+     public :: stdlib_whetrd_hb2st
+     public :: stdlib_whetrd_he2hb
+     public :: stdlib_whetrf
+     public :: stdlib_whetrf_aa
+     public :: stdlib_whetrf_rk
+     public :: stdlib_whetrf_rook
+     public :: stdlib_whetri
+     public :: stdlib_whetri_rook
+     public :: stdlib_whetrs
+     public :: stdlib_whetrs2
+     public :: stdlib_whetrs_3
+     public :: stdlib_whetrs_aa
+     public :: stdlib_whetrs_rook
+     public :: stdlib_whfrk
+     public :: stdlib_whgeqz
+     public :: stdlib_whpcon
+     public :: stdlib_whpev
+     public :: stdlib_whpevd
+     public :: stdlib_whpevx
+     public :: stdlib_whpgst
+     public :: stdlib_whpgv
+     public :: stdlib_whpgvd
+     public :: stdlib_whpgvx
+     public :: stdlib_whprfs
+     public :: stdlib_whpsv
+     public :: stdlib_whpsvx
+     public :: stdlib_whptrd
+     public :: stdlib_whptrf
+     public :: stdlib_whptri
+     public :: stdlib_whptrs
+     public :: stdlib_whsein
+     public :: stdlib_whseqr
+     public :: stdlib_wla_gbamv
+     public :: stdlib_wla_gbrcond_c
+     public :: stdlib_wla_gbrpvgrw
+     public :: stdlib_wla_geamv
+     public :: stdlib_wla_gercond_c
+     public :: stdlib_wla_gerpvgrw
+     public :: stdlib_wla_heamv
+     public :: stdlib_wla_hercond_c
+     public :: stdlib_wla_herpvgrw
+     public :: stdlib_wla_lin_berr
+     public :: stdlib_wla_porcond_c
+     public :: stdlib_wla_porpvgrw
+     public :: stdlib_wla_syamv
+     public :: stdlib_wla_syrcond_c
+     public :: stdlib_wla_syrpvgrw
+     public :: stdlib_wla_wwaddw
+     public :: stdlib_wlabrd
+     public :: stdlib_wlacgv
+     public :: stdlib_wlacn2
+     public :: stdlib_wlacon
+     public :: stdlib_wlacp2
+     public :: stdlib_wlacpy
+     public :: stdlib_wlacrm
+     public :: stdlib_wlacrt
+     public :: stdlib_wladiv
+     public :: stdlib_wlaed0
+     public :: stdlib_wlaed7
+     public :: stdlib_wlaed8
+     public :: stdlib_wlaein
+     public :: stdlib_wlaesy
+     public :: stdlib_wlaev2
+     public :: stdlib_wlag2c
+     public :: stdlib_wlags2
+     public :: stdlib_wlagtm
+     public :: stdlib_wlahef
+     public :: stdlib_wlahef_aa
+     public :: stdlib_wlahef_rk
+     public :: stdlib_wlahef_rook
+     public :: stdlib_wlahqr
+     public :: stdlib_wlahr2
+     public :: stdlib_wlaic1
+     public :: stdlib_wlals0
+     public :: stdlib_wlalsa
+     public :: stdlib_wlalsd
+     public :: stdlib_wlamswlq
+     public :: stdlib_wlamtsqr
+     public :: stdlib_wlangb
+     public :: stdlib_wlange
+     public :: stdlib_wlangt
+     public :: stdlib_wlanhb
+     public :: stdlib_wlanhe
+     public :: stdlib_wlanhf
+     public :: stdlib_wlanhp
+     public :: stdlib_wlanhs
+     public :: stdlib_wlanht
+     public :: stdlib_wlansb
+     public :: stdlib_wlansp
+     public :: stdlib_wlansy
+     public :: stdlib_wlantb
+     public :: stdlib_wlantp
+     public :: stdlib_wlantr
+     public :: stdlib_wlapll
+     public :: stdlib_wlapmr
+     public :: stdlib_wlapmt
+     public :: stdlib_wlaqgb
+     public :: stdlib_wlaqge
+     public :: stdlib_wlaqhb
+     public :: stdlib_wlaqhe
+     public :: stdlib_wlaqhp
+     public :: stdlib_wlaqp2
+     public :: stdlib_wlaqps
+     public :: stdlib_wlaqr0
+     public :: stdlib_wlaqr1
+     public :: stdlib_wlaqr2
+     public :: stdlib_wlaqr3
+     public :: stdlib_wlaqr4
+     public :: stdlib_wlaqr5
+     public :: stdlib_wlaqsb
+     public :: stdlib_wlaqsp
+     public :: stdlib_wlaqsy
+     public :: stdlib_wlaqz0
+     public :: stdlib_wlaqz1
+     public :: stdlib_wlaqz2
+     public :: stdlib_wlaqz3
+     public :: stdlib_wlar1v
+     public :: stdlib_wlar2v
+     public :: stdlib_wlarcm
+     public :: stdlib_wlarf
+     public :: stdlib_wlarfb
+     public :: stdlib_wlarfb_gett
+     public :: stdlib_wlarfg
+     public :: stdlib_wlarfgp
+     public :: stdlib_wlarft
+     public :: stdlib_wlarfx
+     public :: stdlib_wlarfy
+     public :: stdlib_wlargv
+     public :: stdlib_wlarnv
+     public :: stdlib_wlarrv
+     public :: stdlib_wlartg
+     public :: stdlib_wlartv
+     public :: stdlib_wlarz
+     public :: stdlib_wlarzb
+     public :: stdlib_wlarzt
+     public :: stdlib_wlascl
+     public :: stdlib_wlaset
+     public :: stdlib_wlasr
+     public :: stdlib_wlassq
+     public :: stdlib_wlaswlq
+     public :: stdlib_wlaswp
+     public :: stdlib_wlasyf
+     public :: stdlib_wlasyf_aa
+     public :: stdlib_wlasyf_rk
+     public :: stdlib_wlasyf_rook
+     public :: stdlib_wlat2c
+     public :: stdlib_wlatbs
+     public :: stdlib_wlatdf
+     public :: stdlib_wlatps
+     public :: stdlib_wlatrd
+     public :: stdlib_wlatrs
+     public :: stdlib_wlatrz
+     public :: stdlib_wlatsqr
+     public :: stdlib_wlaunhr_col_getrfnp
+     public :: stdlib_wlaunhr_col_getrfnp2
+     public :: stdlib_wlauu2
+     public :: stdlib_wlauum
+     public :: stdlib_wpbcon
+     public :: stdlib_wpbequ
+     public :: stdlib_wpbrfs
+     public :: stdlib_wpbstf
+     public :: stdlib_wpbsv
+     public :: stdlib_wpbsvx
+     public :: stdlib_wpbtf2
+     public :: stdlib_wpbtrf
+     public :: stdlib_wpbtrs
+     public :: stdlib_wpftrf
+     public :: stdlib_wpftri
+     public :: stdlib_wpftrs
+     public :: stdlib_wpocon
+     public :: stdlib_wpoequ
+     public :: stdlib_wpoequb
+     public :: stdlib_wporfs
+     public :: stdlib_wposv
+     public :: stdlib_wposvx
+     public :: stdlib_wpotf2
+     public :: stdlib_wpotrf
+     public :: stdlib_wpotrf2
+     public :: stdlib_wpotri
+     public :: stdlib_wpotrs
+     public :: stdlib_wppcon
+     public :: stdlib_wppequ
+     public :: stdlib_wpprfs
+     public :: stdlib_wppsv
+     public :: stdlib_wppsvx
+     public :: stdlib_wpptrf
+     public :: stdlib_wpptri
+     public :: stdlib_wpptrs
+     public :: stdlib_wpstf2
+     public :: stdlib_wpstrf
+     public :: stdlib_wptcon
+     public :: stdlib_wpteqr
+     public :: stdlib_wptrfs
+     public :: stdlib_wptsv
+     public :: stdlib_wptsvx
+     public :: stdlib_wpttrf
+     public :: stdlib_wpttrs
+     public :: stdlib_wptts2
+     public :: stdlib_wrot
+     public :: stdlib_wspcon
+     public :: stdlib_wspmv
+     public :: stdlib_wspr
+     public :: stdlib_wsprfs
+     public :: stdlib_wspsv
+     public :: stdlib_wspsvx
+     public :: stdlib_wsptrf
+     public :: stdlib_wsptri
+     public :: stdlib_wsptrs
+     public :: stdlib_wstedc
+     public :: stdlib_wstegr
+     public :: stdlib_wstein
+     public :: stdlib_wstemr
+     public :: stdlib_wsteqr
+     public :: stdlib_wsycon
+     public :: stdlib_wsycon_rook
+     public :: stdlib_wsyconv
+     public :: stdlib_wsyconvf
+     public :: stdlib_wsyconvf_rook
+     public :: stdlib_wsyequb
+     public :: stdlib_wsymv
+     public :: stdlib_wsyr
+     public :: stdlib_wsyrfs
+     public :: stdlib_wsysv
+     public :: stdlib_wsysv_aa
+     public :: stdlib_wsysv_rk
+     public :: stdlib_wsysv_rook
+     public :: stdlib_wsysvx
+     public :: stdlib_wsyswapr
+     public :: stdlib_wsytf2
+     public :: stdlib_wsytf2_rk
+     public :: stdlib_wsytf2_rook
+     public :: stdlib_wsytrf
+     public :: stdlib_wsytrf_aa
+     public :: stdlib_wsytrf_rk
+     public :: stdlib_wsytrf_rook
+     public :: stdlib_wsytri
+     public :: stdlib_wsytri_rook
+     public :: stdlib_wsytrs
+     public :: stdlib_wsytrs2
+     public :: stdlib_wsytrs_3
+     public :: stdlib_wsytrs_aa
+     public :: stdlib_wsytrs_rook
+     public :: stdlib_wtbcon
+     public :: stdlib_wtbrfs
+     public :: stdlib_wtbtrs
+     public :: stdlib_wtfsm
+     public :: stdlib_wtftri
+     public :: stdlib_wtfttp
+     public :: stdlib_wtfttr
+     public :: stdlib_wtgevc
+     public :: stdlib_wtgex2
+     public :: stdlib_wtgexc
+     public :: stdlib_wtgsen
+     public :: stdlib_wtgsja
+     public :: stdlib_wtgsna
+     public :: stdlib_wtgsy2
+     public :: stdlib_wtgsyl
+     public :: stdlib_wtpcon
+     public :: stdlib_wtplqt
+     public :: stdlib_wtplqt2
+     public :: stdlib_wtpmlqt
+     public :: stdlib_wtpmqrt
+     public :: stdlib_wtpqrt
+     public :: stdlib_wtpqrt2
+     public :: stdlib_wtprfb
+     public :: stdlib_wtprfs
+     public :: stdlib_wtptri
+     public :: stdlib_wtptrs
+     public :: stdlib_wtpttf
+     public :: stdlib_wtpttr
+     public :: stdlib_wtrcon
+     public :: stdlib_wtrevc
+     public :: stdlib_wtrevc3
+     public :: stdlib_wtrexc
+     public :: stdlib_wtrrfs
+     public :: stdlib_wtrsen
+     public :: stdlib_wtrsna
+     public :: stdlib_wtrsyl
+     public :: stdlib_wtrti2
+     public :: stdlib_wtrtri
+     public :: stdlib_wtrtrs
+     public :: stdlib_wtrttf
+     public :: stdlib_wtrttp
+     public :: stdlib_wtzrzf
+     public :: stdlib_wunbdb
+     public :: stdlib_wunbdb1
+     public :: stdlib_wunbdb2
+     public :: stdlib_wunbdb3
+     public :: stdlib_wunbdb4
+     public :: stdlib_wunbdb5
+     public :: stdlib_wunbdb6
+     public :: stdlib_wuncsd
+     public :: stdlib_wuncsd2by1
+     public :: stdlib_wung2l
+     public :: stdlib_wung2r
+     public :: stdlib_wungbr
+     public :: stdlib_wunghr
+     public :: stdlib_wungl2
+     public :: stdlib_wunglq
+     public :: stdlib_wungql
+     public :: stdlib_wungqr
+     public :: stdlib_wungr2
+     public :: stdlib_wungrq
+     public :: stdlib_wungtr
+     public :: stdlib_wungtsqr
+     public :: stdlib_wungtsqr_row
+     public :: stdlib_wunhr_col
+     public :: stdlib_wunm22
+     public :: stdlib_wunm2l
+     public :: stdlib_wunm2r
+     public :: stdlib_wunmbr
+     public :: stdlib_wunmhr
+     public :: stdlib_wunml2
+     public :: stdlib_wunmlq
+     public :: stdlib_wunmql
+     public :: stdlib_wunmqr
+     public :: stdlib_wunmr2
+     public :: stdlib_wunmr3
+     public :: stdlib_wunmrq
+     public :: stdlib_wunmrz
+     public :: stdlib_wunmtr
+     public :: stdlib_wupgtr
+     public :: stdlib_wupmtr
+
+     ! 128-bit real constants 
+     real(qp),    parameter, private ::     negone = -1.00_qp
+     real(qp),    parameter, private ::       zero = 0.00_qp
+     real(qp),    parameter, private ::       half = 0.50_qp
+     real(qp),    parameter, private ::        one = 1.00_qp
+     real(qp),    parameter, private ::        two = 2.00_qp
+     real(qp),    parameter, private ::      three = 3.00_qp
+     real(qp),    parameter, private ::       four = 4.00_qp
+     real(qp),    parameter, private ::      eight = 8.00_qp
+     real(qp),    parameter, private ::        ten = 10.00_qp
+
+     ! 128-bit complex constants 
+     complex(qp), parameter, private :: czero   = ( 0.0_qp,0.0_qp)
+     complex(qp), parameter, private :: chalf   = ( 0.5_qp,0.0_qp)
+     complex(qp), parameter, private :: cone    = ( 1.0_qp,0.0_qp)
+     complex(qp), parameter, private :: cnegone = (-1.0_qp,0.0_qp)
+
+     ! 128-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(qp),    parameter, private :: rradix = real(radix(zero),qp) 
+     real(qp),    parameter, private :: ulp    = epsilon(zero) 
+     real(qp),    parameter, private :: eps    = ulp*half 
+     real(qp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(qp),    parameter, private :: safmax = one/safmin 
+     real(qp),    parameter, private :: smlnum = safmin/ulp 
+     real(qp),    parameter, private :: bignum = safmax*ulp 
+     real(qp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(qp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 128-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(qp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(qp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(qp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(qp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+     !> ZLAG2W: converts a COMPLEX matrix, SA, to a COMPLEX*16 matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_wlag2w( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: sa(ldsa,*)
+           complex(qp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_wlag2w
+
+     !> ZBBCSD: computes the CS decomposition of a unitary matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See ZUNCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The unitary matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_wbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
+               lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
+           ! Array Arguments 
+           real(qp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), rwork(*)
+           real(qp), intent(inout) :: phi(*), theta(*)
+           complex(qp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+                     
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(qp), parameter :: hundred = 100.0_qp
+           real(qp), parameter :: meighth = -0.125_qp
+           real(qp), parameter :: piover2 = 1.57079632679489661923132169163975144210_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
+           real(qp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lrwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lrworkmin = 1
+              rwork(1) = lrworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lrworkopt = iv2tsn + q - 1
+              lrworkmin = lrworkopt
+              rwork(1) = lrworkopt
+              if( lrwork < lrworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'EPSILON' )
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_qlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_qlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_qp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_qlartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              else
+                 call stdlib_qlartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              end if
+              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
+                        
+              b11d(imin) = temp
+              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
+              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
+                        
+              b21d(imin) = temp
+              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_qlartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
+                           1), r )
+              else if( mu <= nu ) then
+                 call stdlib_qlartgs( b11e( imin ), b11d( imin + 1 ), mu,rwork(iu1cs+imin-1), &
+                           rwork(iu1sn+imin-1) )
+              else
+                 call stdlib_qlartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_qlartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
+                           1), r )
+              else if( nu < mu ) then
+                 call stdlib_qlartgs( b21e( imin ), b21d( imin + 1 ), nu,rwork(iu2cs+imin-1), &
+                           rwork(iu2sn+imin-1) )
+              else
+                 call stdlib_qlartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
+                           imin-1) )
+              end if
+              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
+              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
+              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
+              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_qlartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_qlartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_qlartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( mu <= nu ) then
+                    call stdlib_qlartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 else
+                    call stdlib_qlartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 end if
+                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
+                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_qlartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( nu < mu ) then
+                    call stdlib_qlartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 else
+                    call stdlib_qlartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 end if
+                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
+                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
+                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
+                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_qlartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_qlartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_qlartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_qlartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
+                               )
+                 else
+                    call stdlib_qlartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_qlartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_qlartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
+                              r )
+                 else if( nu < mu ) then
+                    call stdlib_qlartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
+                               )
+                 else
+                    call stdlib_qlartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
+                              
+                 end if
+                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
+                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
+                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
+                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
+                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
+                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_qlartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_qlartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_qlartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_qlartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_qlartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
+                        
+              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
+                        
+              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_wlasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_wlasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_wlasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_wlasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_wlasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_wlasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_wlasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_wlasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_wscal( q, cnegone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_wscal( q, cnegone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_wscal( p, cnegone, u1(1,imax), 1 )
+                    else
+                       call stdlib_wscal( p, cnegone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_wscal( m-p, cnegone, u2(1,imax), 1 )
+                    else
+                       call stdlib_wscal( m-p, cnegone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_wscal( m-q, cnegone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_wscal( m-q, cnegone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_wswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_wswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_wswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_wswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_wswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_wswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_wswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_wswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_wbbcsd
+
+     !> ZBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**H
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**H*VT instead of
+     !> P**H, for given complex input matrices U and VT.  When U and VT are
+     !> the unitary matrices that reduce a general matrix A to bidiagonal
+     !> form: A = U*B*VT, as computed by ZGEBRD, then
+     !> A = (U*Q) * S * (P**H*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**H*C
+     !> for a given complex input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_wbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: c(ldc,*), u(ldu,*), vt(ldvt,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: hndrth = 0.01_qp
+           real(qp), parameter :: hndrd = 100.0_qp
+           real(qp), parameter :: meigth = -0.125_qp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, j, ll, lll, m, maxit, nm1, nm12, nm13, oldll, &
+                     oldm
+           real(qp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_qlasq1( n, d, e, rwork, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_qlamch( 'EPSILON' )
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 rwork( i ) = cs
+                 rwork( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_wlasr( 'R', 'V', 'F', nru, n, rwork( 1 ), rwork( n ),u, ldu )
+                        
+              if( ncc>0 )call stdlib_wlasr( 'L', 'V', 'F', n, ncc, rwork( 1 ), rwork( n ),c, ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=qp) )
+              thresh = max( tol*sminoa, maxitr*n*n*unfl )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*n*n*unfl )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxit = maxitr*n*n
+           iter = 0
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>maxit )go to 200
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_qlasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_wdrot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt,cosr, &
+                        sinr )
+              if( nru>0 )call stdlib_wdrot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+                        
+              if( ncc>0 )call stdlib_wdrot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_qlas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_qlas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_qlartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_qlartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll+1 ) = cs
+                    rwork( i-ll+1+nm1 ) = sn
+                    rwork( i-ll+1+nm12 ) = oldcs
+                    rwork( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_wlasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_wlasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_wlasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_qlartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_qlartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll ) = cs
+                    rwork( i-ll+nm1 ) = -sn
+                    rwork( i-ll+nm12 ) = oldcs
+                    rwork( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_wlasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_wlasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_wlasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_qlartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_qlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    rwork( i-ll+1 ) = cosr
+                    rwork( i-ll+1+nm1 ) = sinr
+                    rwork( i-ll+1+nm12 ) = cosl
+                    rwork( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_wlasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_wlasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_wlasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_qlartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_qlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    rwork( i-ll ) = cosr
+                    rwork( i-ll+nm1 ) = -sinr
+                    rwork( i-ll+nm12 ) = cosl
+                    rwork( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_wlasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_wlasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_wlasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_wdscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_wswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_wswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_wswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_wbdsqr
+
+     !> ZCGESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> ZCGESV first attempts to factorize the matrix in COMPLEX and use this
+     !> factorization within an iterative refinement procedure to produce a
+     !> solution with COMPLEX*16 normwise backward error quality (see below).
+     !> If the approach fails the method switches to a COMPLEX*16
+     !> factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio COMPLEX performance over COMPLEX*16 performance is too
+     !> small. A reasonable strategy should take the number of right-hand
+     !> sides and the size of the matrix into account. This might be done
+     !> with a call to ILAENV in the future. Up to now, we always try
+     !> iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_wcgesv( n, nrhs, a, lda, ipiv, b, ldb, x, ldx, work,swork, rwork, iter, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: swork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: b(ldb,*)
+           complex(qp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(qp), parameter :: bwdmax = 1.0e+00_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(qp) :: anrm, cte, eps, rnrm, xnrm
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZCGESV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip double precision iterative refinement if a priori slower
+           ! than quad precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_wlange( 'I', n, n, a, lda, rwork )
+           eps = stdlib_qlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=qp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from quad precision to double precision and store the
+           ! result in sx.
+           call stdlib_wlag2c( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from quad precision to double precision and store the
+           ! result in sa.
+           call stdlib_wlag2c( n, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the lu factorization of sa.
+           call stdlib_zgetrf( n, n, swork( ptsa ), n, ipiv, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_zgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), n, &
+                     info )
+           ! convert sx back to quad precision
+           call stdlib_zlag2w( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, cnegone, a,lda, x, ldx, &
+                     cone, work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = cabs1( x( stdlib_iwamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = cabs1( work( stdlib_iwamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from quad precision to double precision
+              ! and store the result in sx.
+              call stdlib_wlag2c( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_zgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), &
+                        n, info )
+              ! convert sx back to quad precision and update the current
+              ! iterate.
+              call stdlib_zlag2w( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_waxpy( n, cone, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, cnegone,a, lda, x, &
+                        ldx, cone, work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = cabs1( x( stdlib_iwamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = cabs1( work( stdlib_iwamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the stopping
+           ! criterion, set up the iter flag accordingly and follow up on double
+           ! precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to quad precision.
+           call stdlib_wgetrf( n, n, a, lda, ipiv, info )
+           if( info/=0 )return
+           call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, x, ldx,info )
+           return
+     end subroutine stdlib_wcgesv
+
+     !> ZCPOSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> ZCPOSV first attempts to factorize the matrix in COMPLEX and use this
+     !> factorization within an iterative refinement procedure to produce a
+     !> solution with COMPLEX*16 normwise backward error quality (see below).
+     !> If the approach fails the method switches to a COMPLEX*16
+     !> factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio COMPLEX performance over COMPLEX*16 performance is too
+     !> small. A reasonable strategy should take the number of right-hand
+     !> sides and the size of the matrix into account. This might be done
+     !> with a call to ILAENV in the future. Up to now, we always try
+     !> iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_wcposv( uplo, n, nrhs, a, lda, b, ldb, x, ldx, work,swork, rwork, iter, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: swork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: b(ldb,*)
+           complex(qp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(qp), parameter :: bwdmax = 1.0e+00_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(qp) :: anrm, cte, eps, rnrm, xnrm
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZCPOSV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip double precision iterative refinement if a priori slower
+           ! than quad precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_wlanhe( 'I', uplo, n, a, lda, rwork )
+           eps = stdlib_qlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=qp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from quad precision to double precision and store the
+           ! result in sx.
+           call stdlib_wlag2c( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from quad precision to double precision and store the
+           ! result in sa.
+           call stdlib_wlat2c( uplo, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the cholesky factorization of sa.
+           call stdlib_zpotrf( uplo, n, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_zpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+           ! convert sx back to complex*16
+           call stdlib_zlag2w( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_whemm( 'LEFT', uplo, n, nrhs, cnegone, a, lda, x, ldx, cone,work, n )
+                     
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = cabs1( x( stdlib_iwamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = cabs1( work( stdlib_iwamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from quad precision to double precision
+              ! and store the result in sx.
+              call stdlib_wlag2c( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_zpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+              ! convert sx back to quad precision and update the current
+              ! iterate.
+              call stdlib_zlag2w( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_waxpy( n, cone, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_whemm( 'L', uplo, n, nrhs, cnegone, a, lda, x, ldx, cone,work, n )
+                        
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = cabs1( x( stdlib_iwamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = cabs1( work( stdlib_iwamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow
+           ! up on quad precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to quad precision.
+           call stdlib_wpotrf( uplo, n, a, lda, info )
+           if( info/=0 )return
+           call stdlib_wlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wpotrs( uplo, n, nrhs, a, lda, x, ldx, info )
+           return
+     end subroutine stdlib_wcposv
+
+     !> ZDRSCL: multiplies an n-element complex vector x by the real scalar
+     !> 1/a.  This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_wdrscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(qp), intent(in) :: sa
+           ! Array Arguments 
+           complex(qp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(qp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_wdscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_wdrscl
+
+     !> ZGBBRD: reduces a complex general m-by-n band matrix A to real upper
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**H, or computes
+     !> Q**H*C for a given matrix C.
+
+     pure subroutine stdlib_wgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*), rwork(*)
+           complex(qp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           complex(qp), intent(out) :: pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mu,&
+                      mu0, nr, nrt
+           real(qp) :: abst, rc
+           complex(qp) :: ra, rb, rs, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**h to the unit matrix, if needed
+           if( wantq )call stdlib_wlaset( 'FULL', m, m, czero, cone, q, ldq )
+           if( wantpt )call stdlib_wlaset( 'FULL', n, n, czero, cone, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the complex sines of the plane rotations are stored in work,
+              ! and the real cosines in rwork.
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_wlargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              rwork( j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_wlartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,rwork( j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_wlartg( ab( ku+ml-1, i ), ab( ku+ml, i ),rwork( i+ml-1 ), &
+                                    work( i+ml-1 ), ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_wrot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,rwork( i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_wrot( m, q( 1, j-1 ), 1, q( 1, j ), 1,rwork( j ), conjg( &
+                                    work( j ) ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_wrot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,rwork( j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = rwork( j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_wlargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               rwork( j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_wlartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,rwork( j1+kun ), work( j1+kun ), kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_wlartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),rwork( &
+                                    i+mu-1 ), work( i+mu-1 ), ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_wrot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,rwork( i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**h
+                       do j = j1, j2, kb1
+                          call stdlib_wrot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, rwork(&
+                                     j+kun ),conjg( work( j+kun ) ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = rwork( j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to complex lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, overwriting superdiagonal
+              ! elements on subdiagonal elements
+              do i = 1, min( m-1, n )
+                 call stdlib_wlartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 ab( 1, i ) = ra
+                 if( i<n ) then
+                    ab( 2, i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_wrot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc,conjg( rs ) )
+                           
+                 if( wantc )call stdlib_wrot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+           else
+              ! a has been reduced to complex upper bidiagonal form or is
+              ! diagonal
+              if( ku>0 .and. m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_wlartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    ab( ku+1, i ) = ra
+                    if( i>1 ) then
+                       rb = -conjg( rs )*ab( ku, i )
+                       ab( ku, i ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_wrot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, &
+                              conjg( rs ) )
+                 end do
+              end if
+           end if
+           ! make diagonal and superdiagonal elements real, storing them in d
+           ! and e
+           t = ab( ku+1, 1 )
+           loop_120: do i = 1, minmn
+              abst = abs( t )
+              d( i ) = abst
+              if( abst/=zero ) then
+                 t = t / abst
+              else
+                 t = cone
+              end if
+              if( wantq )call stdlib_wscal( m, t, q( 1, i ), 1 )
+              if( wantc )call stdlib_wscal( ncc, conjg( t ), c( i, 1 ), ldc )
+              if( i<minmn ) then
+                 if( ku==0 .and. kl==0 ) then
+                    e( i ) = zero
+                    t = ab( 1, i+1 )
+                 else
+                    if( ku==0 ) then
+                       t = ab( 2, i )*conjg( t )
+                    else
+                       t = ab( ku, i+1 )*conjg( t )
+                    end if
+                    abst = abs( t )
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( wantpt )call stdlib_wscal( n, t, pt( i+1, 1 ), ldpt )
+                    t = ab( ku+1, i+1 )*conjg( t )
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_wgbbrd
+
+     !> ZGBCON: estimates the reciprocal of the condition number of a complex
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by ZGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_wgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, rwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(qp) :: ainvnm, scale, smlnum
+           complex(qp) :: t, zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_waxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_wlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, rwork, info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_wlatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kl+ku, &
+                           ab, ldab, work, scale, rwork,info )
+                 ! multiply by inv(l**h).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_wdotc( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_iwamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_wdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_wgbcon
+
+     !> ZGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_wgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*), r(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: bignum, rcmax, rcmin, smlnum
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_wgbequ
+
+     !> ZGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from ZGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_wgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*), r(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_qlamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_wgbequb
+
+     !> ZGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_wgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_wgbmv( trans, n, n, kl, ku, -cone, ab, ldab, x( 1, j ), 1,cone, work, 1 &
+                        )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = cabs1( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       rwork( i ) = rwork( i ) + cabs1( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + cabs1( ab( kk+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv, work, n,info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wgbtrs( transn, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wgbrfs
+
+     !> ZGBSV: computes the solution to a complex system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_wgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_wgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_wgbsv
+
+     !> ZGBSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(inout) :: c(*), r(*)
+           complex(qp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+        ! moved setting of info = n+1 so info does not subsequently get
+        ! overwritten.  sven, 17 mar 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(qp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_wgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_wlaqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_wcopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_wgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_wlantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_wlangb( norm, n, kl, ku, ab, ldab, rwork )
+           rpvgrw = stdlib_wlantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_wlangb( 'M', n, kl, ku, ab, ldab, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, rwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_wgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_wgbsvx
+
+     !> ZGBTF2: computes an LU factorization of a complex m-by-n band matrix
+     !> A using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to czero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = czero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to czero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = czero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_iwamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=czero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_wswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_wscal( km, cone / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_wgeru( km, ju-j, -cone, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is czero, set info to the index of the pivot
+                 ! unless a czero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_wgbtf2
+
+     !> ZGBTRF: computes an LU factorization of a complex m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           complex(qp) :: temp
+           ! Local Arrays 
+           complex(qp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'ZGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_wgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! czero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = czero
+                 end do
+              end do
+              ! czero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = czero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to czero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = czero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to czero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = czero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_iwamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=czero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_wswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_wswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_wswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_wscal( km, cone / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_wgeru( km, jm-jj, -cone, ab( kv+2, jj ), 1,ab( kv, &
+                                 jj+1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is czero, set info to the index of the pivot
+                       ! unless a czero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_wcopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_wlaswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_wlaswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, cone, &
+                                 ab( kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+1, j+jb )&
+                                     , ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -cone, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+kl+1-jb, j+jb ),&
+                                     ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, cone, &
+                                 ab( kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,work13, ldwork, cone, ab( 1+jb, j+kv ),ldab-1 )
+                                     
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -cone, &
+                                    work31, ldwork, work13,ldwork, cone, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_wswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_wswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_wcopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_wgbtrf
+
+     !> ZGBTRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by ZGBTRF.
+
+     pure subroutine stdlib_wgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-cone modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_wswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_wgeru( lm, nrhs, -cone, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, &
+                              1 ), ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else if( stdlib_lsame( trans, 'T' ) ) then
+              ! solve a**t * x = b.
+              do i = 1, nrhs
+                 ! solve u**t * x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_wgemv( 'TRANSPOSE', lm, nrhs, -cone, b( j+1, 1 ),ldb, ab( kd+1, j &
+                              ), 1, cone, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_wswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              do i = 1, nrhs
+                 ! solve u**h * x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kl+ku, ab, ldab,&
+                            b( 1, i ), 1 )
+              end do
+              ! solve l**h * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_wlacgv( nrhs, b( j, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', lm, nrhs, -cone,b( j+1, 1 ), ldb, &
+                              ab( kd+1, j ), 1, cone,b( j, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_wswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wgbtrs
+
+     !> ZGEBAK: forms the right or left eigenvectors of a complex general
+     !> matrix by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by ZGEBAL.
+
+     pure subroutine stdlib_wgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: scale(*)
+           complex(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(qp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_wdscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_wdscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_wgebak
+
+     !> ZGEBAL: balances a general complex matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_wgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(out) :: scale(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sclfac = 2.0e+0_qp
+           real(qp), parameter :: factor = 0.95e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(qp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_wswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_wswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( real( a( j, i ),KIND=qp)/=zero .or. aimag( a( j, i ) )/=zero )cycle &
+                           loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( real( a( i, j ),KIND=qp)/=zero .or. aimag( a( i, j ) )/=zero )cycle &
+                           loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_qznrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_qznrm2( l-k+1, a( i, k ), lda )
+              ica = stdlib_iwamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_iwamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+                 if( stdlib_qisnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'ZGEBAL', -info )
+                 return
+              end if
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_wdscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_wdscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_wgebal
+
+     !> ZGEBD2: reduces a complex general m by n matrix A to upper or lower
+     !> real bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_wgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'ZGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_wlarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=qp)
+                 a( i, i ) = cone
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_wlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tauq( i ) ), &
+                           a( i, i+1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    alpha = a( i, i+1 )
+                    call stdlib_wlarfg( n-i, alpha, a( i, min( i+2, n ) ), lda,taup( i ) )
+                    e( i ) = real( alpha,KIND=qp)
+                    a( i, i+1 ) = cone
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_wlarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = czero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+                 alpha = a( i, i )
+                 call stdlib_wlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=qp)
+                 a( i, i ) = cone
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_wlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_wlarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=qp)
+                    a( i+1, i ) = cone
+                    ! apply h(i)**h to a(i+1:m,i+1:n) from the left
+                    call stdlib_wlarf( 'LEFT', m-i, n-i, a( i+1, i ), 1,conjg( tauq( i ) ), a( i+&
+                              1, i+1 ), lda,work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = czero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wgebd2
+
+     !> ZGEBRD: reduces a general complex M-by-N matrix A to upper or lower
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_wgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'ZGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=qp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'ZGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'ZGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+ib-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_wlabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+ib:m,i+ib:n), using
+              ! an update of the form  a := a - v*y**h - x*u**h
+              call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-i-nb+1,n-i-nb+1, nb, -&
+              cone, a( i+nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, cone,a( i+nb, i+nb ), lda )
+                        
+              call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -cone, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,cone, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_wgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_wgebrd
+
+     !> ZGECON: estimates the reciprocal of the condition number of a general
+     !> complex matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by ZGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_wgecon( norm, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, scale, sl, smlnum, su
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_wlatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           rwork, info )
+                 ! multiply by inv(u).
+                 call stdlib_wlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, rwork( n+1 ), info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_wlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, su, rwork( n+1 ),info )
+                 ! multiply by inv(l**h).
+                 call stdlib_wlatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT', normin,n, a, lda, &
+                           work, sl, rwork, info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_iwamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_wdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_wgecon
+
+     !> ZGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_wgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*), r(*)
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, rcmax, rcmin, smlnum
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_wgeequ
+
+     !> ZGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from ZGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_wgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*), r(*)
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_qlamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log(r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_wgeequb
+
+     !> ZGEES: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_wgees( jobvs, sort, select, n, a, lda, sdim, w, vs,ldvs, work, lwork, &
+               rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_w) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, maxwrk, &
+                     minwrk
+           real(qp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_whseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_whseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=qp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_wlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_wgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_wgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_wlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_whseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_wlascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (cworkspace: none)
+              ! (rworkspace: none)
+              call stdlib_wtrsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,s, sep, work( &
+                        iwrk ), lwork-iwrk+1, icond )
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_wgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_wlascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_wcopy( n, a, lda+1, w, 1 )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgees
+
+     !> ZGEESX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_qp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_wgeesx( jobvs, sort, select, sense, n, a, lda, sdim, w,vs, ldvs, rconde, &
+               rcondv, work, lwork, rwork,bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           real(qp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_w) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantsb, wantse, wantsn, wantst, wantsv, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, lwrk, &
+                     maxwrk, minwrk
+           real(qp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_whseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_wtrsen later
+             ! in the code.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_whseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=qp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, ( n*n )/2 )
+              end if
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_wlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_wgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_wgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_wlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_whseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_wlascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (cworkspace: if sense is not 'n', need 2*sdim*(n-sdim)
+                           ! otherwise, need none )
+              ! (rworkspace: none)
+              call stdlib_wtrsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( icond==-14 ) then
+                 ! not enough complex workspace
+                 info = -15
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_wgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_wlascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_wcopy( n, a, lda+1, w, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_qlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgeesx
+
+     !> ZGEEV: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_wgeev( jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, irwork, itau, iwrk, k, lwork_trevc, &
+                     maxwrk, minwrk, nout
+           real(qp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(qp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_whseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 if( wantvl ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_wtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_whseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_wtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_whseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    call stdlib_whseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 maxwrk = max( maxwrk, hswork, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_wlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_wgebal( 'B', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_wgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_wlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_wlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_wlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( 'E', 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_whseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need 2*n)
+              irwork = ibal + n
+              call stdlib_wtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork( irwork ), n, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_wgebak( 'B', 'L', n, ilo, ihi, rwork( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_qznrm2( n, vl( 1, i ), 1 )
+                 call stdlib_wdscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vl( k, i ),KIND=qp)**2 +aimag( vl( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_iqamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_wscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=qp), zero,KIND=qp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_wgebak( 'B', 'R', n, ilo, ihi, rwork( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_qznrm2( n, vr( 1, i ), 1 )
+                 call stdlib_wdscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vr( k, i ),KIND=qp)**2 +aimag( vr( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_iqamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_wscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=qp), zero,KIND=qp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_wlascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info>0 ) then
+                 call stdlib_wlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgeev
+
+     !> ZGEEVX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_qp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_wgeevx( balanc, jobvl, jobvr, sense, n, a, lda, w, vl,ldvl, vr, ldvr, ilo, &
+               ihi, scale, abnrm, rconde,rcondv, work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(qp), intent(out) :: abnrm
+           ! Array Arguments 
+           real(qp), intent(out) :: rconde(*), rcondv(*), rwork(*), scale(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(qp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(qp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ) &
+                     .or.stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) ) then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_whseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_wtrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_whseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    call stdlib_wtrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_whseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    if( wntsnn ) then
+                       call stdlib_whseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    else
+                       call stdlib_whseqr( 'S', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                 else
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, 2*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_wlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_wlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_wgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_wlange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_qlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_wgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_wlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_wlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_wlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_whseqr( job, 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_whseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need n)
+              call stdlib_wtrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork, n, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (cworkspace: need n*n+2*n unless sense = 'e')
+           ! (rworkspace: need 2*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_wtrsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, rwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_wgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_qznrm2( n, vl( 1, i ), 1 )
+                 call stdlib_wdscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vl( k, i ),KIND=qp)**2 +aimag( vl( k, i ) )**2
+                 end do
+                 k = stdlib_iqamax( n, rwork, 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_wscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=qp), zero,KIND=qp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_wgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_qznrm2( n, vr( 1, i ), 1 )
+                 call stdlib_wdscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vr( k, i ),KIND=qp)**2 +aimag( vr( k, i ) )**2
+                 end do
+                 k = stdlib_iqamax( n, rwork, 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_wscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=qp), zero,KIND=qp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_wlascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_qlascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_wlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgeevx
+
+     !> ZGEHD2: reduces a complex general matrix A to upper Hessenberg form H
+     !> by a unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_wgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              alpha = a( i+1, i )
+              call stdlib_wlarfg( ihi-i, alpha, a( min( i+2, n ), i ), 1, tau( i ) )
+              a( i+1, i ) = cone
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_wlarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i)**h to a(i+1:ihi,i+1:n) from the left
+              call stdlib_wlarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1,conjg( tau( i ) ), a( i+1, i+&
+                        1 ), lda, work )
+              a( i+1, i ) = alpha
+           end do
+           return
+     end subroutine stdlib_wgehd2
+
+     !> ZGEHRD: reduces a complex general matrix A to upper Hessenberg form H by
+     !> an unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_wgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           complex(qp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           endif
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to czero
+           do i = 1, ilo - 1
+              tau( i ) = czero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = czero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin + tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**h
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_wlahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**h. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = cone
+                 call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',ihi, ihi-i-ib+1,ib, -&
+                           cone, work, ldwork, a( i+ib, i ), lda, cone,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', i, ib-1,cone, &
+                           a( i+1, i ), lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_waxpy( i, -cone, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_wlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, &
+                 n-i-ib+1, ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, &
+                           ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_wgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wgehrd
+
+     !> ZGEJSV: computes the singular value decomposition (SVD) of a complex M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^*,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+     !> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+
+     pure subroutine stdlib_wgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,cwork, lwork, rwork, lrwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, lrwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: u(ldu,*), v(ldv,*), cwork(lwork)
+           real(qp), intent(out) :: sva(n), rwork(lrwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: ctemp
+           real(qp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lquery, lsvec, l2aber, &
+                     l2kill, l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           integer(ilp) :: optwrk, minwrk, minrwrk, miniwrk
+           integer(ilp) :: lwcon, lwlqf, lwqp3, lwqrf, lwunmlq, lwunmqr, lwunmqrm, lwsvdj, &
+                     lwsvdjv, lrwqp3, lrwcon, lrwsvdj, iwoff
+           integer(ilp) :: lwrk_wgelqf, lwrk_wgeqp3, lwrk_wgeqp3n, lwrk_wgeqrf, lwrk_wgesvj, &
+                     lwrk_wgesvjv, lwrk_wgesvju, lwrk_wunmlq, lwrk_wunmqr, lwrk_wunmqrm
+           ! Local Arrays
+           complex(qp) :: cdummy(1)
+           real(qp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,log,max,min,real,nint,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' ) .and. ( m == n )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec .or. stdlib_lsame( jobu, 'N' ) .or.( stdlib_lsame( jobu, 'W' ) &
+                     .and. rsvec .and. l2tran ) ) ) then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.( stdlib_lsame( jobv, 'W' ) &
+                     .and. lsvec .and. l2tran ) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( stdlib_lsame(jobt,'T') .or. stdlib_lsame(jobt,'N') ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info == 0 ) then
+               ! Compute The Minimal And The Optimal Workspace Lengths
+               ! [[the expressions for computing the minimal and the optimal
+               ! values of lcwork, lrwork are written with a lot of redundancy and
+               ! can be simplified. however, this verbose form is useful for
+               ! maintenance and modifications of the code.]]
+              ! .. minimal workspace length for stdlib_wgeqp3 of an m x n matrix,
+               ! stdlib_wgeqrf of an n x n matrix, stdlib_wgelqf of an n x n matrix,
+               ! stdlib_wunmlq for computing n x n matrix, stdlib_wunmqr for computing n x n
+               ! matrix, stdlib_wunmqr for computing m x n matrix, respectively.
+               lwqp3 = n+1
+               lwqrf = max( 1, n )
+               lwlqf = max( 1, n )
+               lwunmlq  = max( 1, n )
+               lwunmqr  = max( 1, n )
+               lwunmqrm = max( 1, m )
+              ! Minimal Workspace Length For Stdlib_Zpocon Of An N X N Matrix
+               lwcon = 2 * n
+              ! .. minimal workspace length for stdlib_wgesvj of an n x n matrix,
+               ! without and with explicit accumulation of jacobi rotations
+               lwsvdj  = max( 2 * n, 1 )
+               lwsvdjv = max( 2 * n, 1 )
+               ! .. minimal real workspace length for stdlib_wgeqp3, stdlib_wpocon, stdlib_wgesvj
+               lrwqp3  = 2 * n
+               lrwcon  = n
+               lrwsvdj = n
+               if ( lquery ) then
+                   call stdlib_wgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                             
+                   lwrk_wgeqp3 = real( cdummy(1),KIND=qp)
+                   call stdlib_wgeqrf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_wgeqrf = real( cdummy(1),KIND=qp)
+                   call stdlib_wgelqf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_wgelqf = real( cdummy(1),KIND=qp)
+               end if
+               minwrk  = 2
+               optwrk  = 2
+               miniwrk = n
+               if ( .not. (lsvec .or. rsvec ) ) then
+                   ! Minimal And Optimal Sizes Of The Complex Workspace If
+                   ! only the singular values are requested
+                   if ( errest ) then
+                       minwrk = max( n+lwqp3, n**2+lwcon, n+lwqrf, lwsvdj )
+                   else
+                       minwrk = max( n+lwqp3, n+lwqrf, lwsvdj )
+                   end if
+                   if ( lquery ) then
+                       call stdlib_wgesvj( 'L', 'N', 'N', n, n, a, lda, sva, n, v,ldv, cdummy, -1,&
+                                  rdummy, -1, ierr )
+                       lwrk_wgesvj = real( cdummy(1),KIND=qp)
+                       if ( errest ) then
+                           optwrk = max( n+lwrk_wgeqp3, n**2+lwcon,n+lwrk_wgeqrf, lwrk_wgesvj )
+                                     
+                       else
+                           optwrk = max( n+lwrk_wgeqp3, n+lwrk_wgeqrf,lwrk_wgesvj )
+                       end if
+                   end if
+                   if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                   else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                   end if
+                   if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( errest ) then
+                      minwrk = max( n+lwqp3, lwcon, lwsvdj, n+lwlqf,2*n+lwqrf, n+lwsvdj, n+&
+                                lwunmlq )
+                  else
+                      minwrk = max( n+lwqp3, lwsvdj, n+lwlqf, 2*n+lwqrf,n+lwsvdj, n+lwunmlq )
+                                
+                  end if
+                  if ( lquery ) then
+                      call stdlib_wgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_wgesvj = real( cdummy(1),KIND=qp)
+                      call stdlib_wunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmlq = real( cdummy(1),KIND=qp)
+                      if ( errest ) then
+                      optwrk = max( n+lwrk_wgeqp3, lwcon, lwrk_wgesvj,n+lwrk_wgelqf, 2*n+&
+                                lwrk_wgeqrf,n+lwrk_wgesvj,  n+lwrk_wunmlq )
+                      else
+                      optwrk = max( n+lwrk_wgeqp3, lwrk_wgesvj,n+lwrk_wgelqf,2*n+lwrk_wgeqrf, n+&
+                                lwrk_wgesvj,n+lwrk_wunmlq )
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                  else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( errest ) then
+                      minwrk = n + max( lwqp3,lwcon,n+lwqrf,lwsvdj,lwunmqrm )
+                  else
+                      minwrk = n + max( lwqp3, n+lwqrf, lwsvdj, lwunmqrm )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_wgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_wgesvj = real( cdummy(1),KIND=qp)
+                      call stdlib_wunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmqrm = real( cdummy(1),KIND=qp)
+                      if ( errest ) then
+                      optwrk = n + max( lwrk_wgeqp3, lwcon, n+lwrk_wgeqrf,lwrk_wgesvj, &
+                                lwrk_wunmqrm )
+                      else
+                      optwrk = n + max( lwrk_wgeqp3, n+lwrk_wgeqrf,lwrk_wgesvj, lwrk_wunmqrm )
+                                
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      if ( errest ) then
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                      end if
+                  else
+                      if ( errest ) then
+                         minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, lrwqp3, lrwsvdj )
+                      end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( .not. jracc ) then
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon,  2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,   n+&
+                                   lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3,        2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,      &
+                                   n+lwunmqrm )
+                      end if
+                      miniwrk = miniwrk + n
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  else
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+&
+                                   lwunmqr,n+lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+lwunmqr,n+&
+                                   lwunmqrm )
+                      end if
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  end if
+                  if ( lquery ) then
+                      call stdlib_wunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmqrm = real( cdummy(1),KIND=qp)
+                      call stdlib_wunmqr( 'L', 'N', n, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmqr = real( cdummy(1),KIND=qp)
+                      if ( .not. jracc ) then
+                          call stdlib_wgeqp3( n,n, a, lda, iwork, cdummy,cdummy, -1,rdummy, ierr )
+                                    
+                          lwrk_wgeqp3n = real( cdummy(1),KIND=qp)
+                          call stdlib_wgesvj( 'L', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_wgesvj = real( cdummy(1),KIND=qp)
+                          call stdlib_wgesvj( 'U', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_wgesvju = real( cdummy(1),KIND=qp)
+                          call stdlib_wgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_wgesvjv = real( cdummy(1),KIND=qp)
+                          call stdlib_wunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -&
+                                    1, ierr )
+                          lwrk_wunmlq = real( cdummy(1),KIND=qp)
+                          if ( errest ) then
+                            optwrk = max( n+lwrk_wgeqp3, n+lwcon,2*n+n**2+lwcon, 2*n+lwrk_wgeqrf,&
+                            2*n+lwrk_wgeqp3n,2*n+n**2+n+lwrk_wgelqf,2*n+n**2+n+n**2+lwcon,2*n+&
+                            n**2+n+lwrk_wgesvj,2*n+n**2+n+lwrk_wgesvjv,2*n+n**2+n+lwrk_wunmqr,2*n+&
+                                      n**2+n+lwrk_wunmlq,n+n**2+lwrk_wgesvju,n+lwrk_wunmqrm )
+                          else
+                            optwrk = max( n+lwrk_wgeqp3,2*n+n**2+lwcon, 2*n+lwrk_wgeqrf,2*n+&
+                            lwrk_wgeqp3n,2*n+n**2+n+lwrk_wgelqf,2*n+n**2+n+n**2+lwcon,2*n+n**2+n+&
+                            lwrk_wgesvj,2*n+n**2+n+lwrk_wgesvjv,2*n+n**2+n+lwrk_wunmqr,2*n+n**2+n+&
+                                      lwrk_wunmlq,n+n**2+lwrk_wgesvju,n+lwrk_wunmqrm )
+                          end if
+                      else
+                          call stdlib_wgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_wgesvjv = real( cdummy(1),KIND=qp)
+                          call stdlib_wunmqr( 'L', 'N', n, n, n, cdummy, n, cdummy,v, ldv, cdummy,&
+                                     -1, ierr )
+                          lwrk_wunmqr = real( cdummy(1),KIND=qp)
+                          call stdlib_wunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -&
+                                    1, ierr )
+                          lwrk_wunmqrm = real( cdummy(1),KIND=qp)
+                          if ( errest ) then
+                             optwrk = max( n+lwrk_wgeqp3, n+lwcon,2*n+lwrk_wgeqrf, 2*n+n**2,2*n+&
+                                       n**2+lwrk_wgesvjv,2*n+n**2+n+lwrk_wunmqr,n+lwrk_wunmqrm )
+                          else
+                             optwrk = max( n+lwrk_wgeqp3, 2*n+lwrk_wgeqrf,2*n+n**2, 2*n+n**2+&
+                                       lwrk_wgesvjv,2*n+n**2+n+lwrk_wunmqr,n+lwrk_wunmqrm )
+                          end if
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                  else
+                      minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                  end if
+               end if
+               minwrk = max( 2, minwrk )
+               optwrk = max( minwrk, optwrk )
+               if ( lwork  < minwrk  .and. (.not.lquery) ) info = - 17
+               if ( lrwork < minrwrk .and. (.not.lquery) ) info = - 19
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'ZGEJSV', - info )
+              return
+           else if ( lquery ) then
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = minrwrk
+               iwork(1) = max( 4, miniwrk )
+               return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:4) = 0
+              rwork(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_qlamch() does not fail on the target architecture.
+           epsln = stdlib_qlamch('EPSILON')
+           sfmin = stdlib_qlamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_qlamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=qp)*real(n,KIND=qp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_wlassq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'ZGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_qscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_wlaset( 'G', m, n1, czero, cone, u, ldu )
+              if ( rsvec ) call stdlib_wlaset( 'G', n, n,  czero, cone, v, ldv )
+              rwork(1) = one
+              rwork(2) = one
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              iwork(4) = -1
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill nonzero columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_wlascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_wlacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                   call stdlib_wgeqrf( m, n, u,ldu, cwork, cwork(n+1),lwork-n,ierr )
+                   call stdlib_wungqr( m,n1,1, u,ldu,cwork,cwork(n+1),lwork-n,ierr )
+                   call stdlib_wcopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = cone
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              rwork(1) = one / scalem
+              rwork(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              iwork(4) = -1
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^* and a^* * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_wlassq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_wlassq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    rwork(m+p)  = xsc * scalem
+                    rwork(p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, rwork(p) )
+                    if (rwork(p) /= zero)aatmin = min(aatmin,rwork(p))
+                 end do
+              else
+                 do p = 1, m
+                    rwork(m+p) = scalem*abs( a(p,stdlib_iwamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, rwork(m+p) )
+                    aatmin = min( aatmin, rwork(m+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^*  would be better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_qlassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=qp))
+              ! now, sva().^2/trace(a^* * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^* * a.  do the same with the
+              ! diagonal of a * a^*, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^* and a^* * a have the
+              ! same trace.
+              entrat = zero
+              do p = 1, m
+                 big1 = ( ( rwork(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=qp))
+              ! analyze the entropies and decide a or a^*. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^* is better than a, take the adjoint of a. this is allowed
+              ! only for square matrices, m=n.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                        ctemp = conjg(a(q,p))
+                       a(q,p) = conjg(a(p,q))
+                       a(p,q) = ctemp
+                    end do
+                 end do
+                 a(n,n) = conjg(a(n,n))
+                 do p = 1, n
+                    rwork(m+p) = sva(p)
+                    sva(p)     = rwork(p)
+                    ! previously computed row 2-norms are now column 2-norms
+                    ! of the transposed matrix
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_wgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_wgesvj will compute them. so, in that case,
+           ! one should use stdlib_wgesvj instead of stdlib_wgejsv.
+           ! >> change in the april 2016 update: allow bigger range, i.e. the
+           ! largest column is allowed up to big/n and stdlib_wgesvj will do the rest.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=qp) )
+            ! temp1  = big/real(n,KIND=qp)
+           call stdlib_qlascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_wlascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_wgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_wlaset( 'A', m, 1, czero, czero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              if ( ( lsvec .and. rsvec ) .and. .not.( jracc ) ) then
+                   iwoff = 2*n
+              else
+                   iwoff = n
+              end if
+              do p = 1, m - 1
+                 q = stdlib_iqamax( m-p+1, rwork(m+p), 1 ) + p - 1
+                 iwork(iwoff+p) = q
+                 if ( p /= q ) then
+                    temp1      = rwork(m+p)
+                    rwork(m+p) = rwork(m+q)
+                    rwork(m+q) = temp1
+                 end if
+              end do
+              call stdlib_wlaswp( n, a, lda, 1, m-1, iwork(iwoff+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_wgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in xgeqpx from toms # 782). good results will be obtained using
+           ! xgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_wgeqp3 improves overall performance of stdlib_wgejsv.
+           ! a * p1 = q1 * [ r1^* 0]^*:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_wgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lwork-n,rwork, ierr )
+                     
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_wgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=qp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=qp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_wlacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_wdscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    if ( lsvec )then
+                        call stdlib_wpocon( 'U', n, v, ldv, one, temp1,cwork(n+1), rwork, ierr )
+                                  
+                    else
+                        call stdlib_wpocon( 'U', n, v, ldv, one, temp1,cwork, rwork, ierr )
+                                  
+                    end if
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_wlacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_wdscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_wpocon( 'U', n, u, ldu, one, temp1,cwork(n+1), rwork, ierr )
+                              
+                 else
+                    call stdlib_wlacpy( 'U', n, n, a, lda, cwork, n )
+      ! []            call stdlib_wlacpy( 'u', n, n, a, lda, cwork(n+1), n )
+                    ! change: here index shifted by n to the left, cwork(1:n)
+                    ! not needed for sigma only computation
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+      ! []               call stdlib_wdscal( p, one/temp1, cwork(n+(p-1)*n+1), 1 )
+                       call stdlib_wdscal( p, one/temp1, cwork((p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+      ! []               call stdlib_wpocon( 'u', n, cwork(n+1), n, one, temp1,
+      ! []     $              cwork(n+n*n+1), rwork, ierr )
+                    call stdlib_wpocon( 'U', n, cwork, n, one, temp1,cwork(n*n+1), rwork, ierr )
+                              
+                 end if
+                 if ( temp1 /= zero ) then
+                    sconda = one / sqrt(temp1)
+                 else
+                    sconda = - one
+                 end if
+                 ! sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_wcopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 call stdlib_wlacgv( n-p+1, a(p,p), 1 )
+              end do
+              if ( nr == n ) a(n,n) = conjg(a(n,n))
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=qp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=qp)
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                     a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'U', nr-1,nr-1, czero,czero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_wgeqrf( n,nr, a,lda, cwork, cwork(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_wcopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                    call stdlib_wlacgv( nr-p+1, a(p,p), 1 )
+                 end do
+           end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=qp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=qp)
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                   a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'U', nr-1, nr-1, czero, czero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_wgesvj( 'L', 'N', 'N', nr, nr, a, lda, sva,n, v, ldv, cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+           else if ( ( rsvec .and. ( .not. lsvec ) .and. ( .not. jracc ) ).or.( jracc .and. ( &
+                     .not. lsvec ) .and. ( nr /= n ) ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_wcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_wlacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_wlaset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_wgesvj( 'L','U','N', n, nr, v, ldv, sva, nr, a, lda,cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_wlaset( 'L', nr-1,nr-1, czero, czero, a(2,1), lda )
+                 call stdlib_wgelqf( nr,n, a, lda, cwork, cwork(n+1), lwork-n, ierr)
+                 call stdlib_wlacpy( 'L', nr, nr, a, lda, v, ldv )
+                 call stdlib_wlaset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_wgeqrf( nr, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_wcopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                    call stdlib_wlacgv( nr-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_wlaset('U', nr-1, nr-1, czero, czero, v(1,2), ldv)
+                 call stdlib_wgesvj( 'L', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, cwork(n+1), &
+                           lwork-n, rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_wlaset( 'A',n-nr, nr, czero,czero, v(nr+1,1),  ldv )
+                    call stdlib_wlaset( 'A',nr, n-nr, czero,czero, v(1,nr+1),  ldv )
+                    call stdlib_wlaset( 'A',n-nr,n-nr,czero,cone, v(nr+1,nr+1),ldv )
+                 end if
+              call stdlib_wunmlq( 'L', 'C', n, n, nr, a, lda, cwork,v, ldv, cwork(n+1), lwork-n, &
+                        ierr )
+              end if
+               ! Permute The Rows Of V
+               ! do 8991 p = 1, n
+                  ! call stdlib_wcopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+                  8991 continue
+               ! call stdlib_wlacpy( 'all', n, n, a, lda, v, ldv )
+              call stdlib_wlapmr( .false., n, n, v, ldv, iwork )
+               if ( transp ) then
+                 call stdlib_wlacpy( 'A', n, n, v, ldv, u, ldu )
+               end if
+           else if ( jracc .and. (.not. lsvec) .and. ( nr== n ) ) then
+              call stdlib_wlaset( 'L', n-1,n-1, czero, czero, a(2,1), lda )
+              call stdlib_wgesvj( 'U','N','V', n, n, a, lda, sva, n, v, ldv,cwork, lwork, rwork, &
+                        lrwork, info )
+               scalem  = rwork(1)
+               numrank = nint(rwork(2),KIND=ilp)
+               call stdlib_wlapmr( .false., n, n, v, ldv, iwork )
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_wcopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+                 call stdlib_wlacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_wlaset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_wgeqrf( n, nr, u, ldu, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              do p = 1, nr - 1
+                 call stdlib_wcopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+                 call stdlib_wlacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_wlaset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_wgesvj( 'L', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, cwork(n+1), lwork-&
+                        n, rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_wlaset( 'A',  m-nr, nr,czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_wlaset( 'A',nr, n1-nr, czero, czero, u(1,nr+1),ldu )
+                    call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+              if ( rowpiv )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_qznrm2( m, u(1,p), 1 )
+                 call stdlib_wdscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_wlacpy( 'A', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_wgejsv.
+                 do p = 1, nr
+                    call stdlib_wcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_wlacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=qp)
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    ctemp
+           ! $                   v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_wlacpy( 'L', nr, nr, v, ldv, cwork(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_qznrm2(nr-p+1,cwork(2*n+(p-1)*nr+p),1)
+                    call stdlib_wdscal(nr-p+1,one/temp1,cwork(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_wpocon('L',nr,cwork(2*n+1),nr,one,temp1,cwork(2*n+nr*nr+1),rwork,&
+                           ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=qp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=qp))
+                 cond_ok = sqrt(sqrt(real(nr,KIND=qp)))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^* = q2 * r2
+                    call stdlib_wgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=qp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_wlacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                              
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_wcopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                       call stdlib_wlacgv(nr-p+1, v(p,p), 1 )
+                    end do
+                    v(nr,nr)=conjg(v(nr,nr))
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_wgeqp3
+                    ! should be replaced with eg. call zgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^* * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_wgeqp3( n, nr, v, ldv, iwork(n+1), cwork(n+1),cwork(2*n+1), lwork-&
+                              2*n, rwork, ierr )
+      ! *               call stdlib_wgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=qp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_wlacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=qp)
+                              ! v(p,q) = - temp1*( v(q,p) / abs(v(q,p)) )
+                             v(p,q) = - ctemp
+                          end do
+                       end do
+                    else
+                       call stdlib_wlaset( 'L',nr-1,nr-1,czero,czero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_wgelqf( nr, nr, v, ldv, cwork(2*n+n*nr+1),cwork(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_wlacpy( 'L',nr,nr,v,ldv,cwork(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_qznrm2( p, cwork(2*n+n*nr+nr+p), nr )
+                       call stdlib_wdscal( p, one/temp1, cwork(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_wpocon( 'L',nr,cwork(2*n+n*nr+nr+1),nr,one,temp1,cwork(2*n+n*nr+&
+                              nr+nr*nr+1),rwork,ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_wlacpy( 'U', nr, nr, v, ldv, cwork(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       ctemp = xsc * v(q,q)
+                       do p = 1, q - 1
+                           ! v(p,q) = - temp1*( v(p,q) / abs(v(p,q)) )
+                          v(p,q) = - ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_wgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u, ldu,cwork(2*n+n*nr+nr+1)&
+                              ,lwork-2*n-n*nr-nr,rwork,lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_wcopy(  nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_wdscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_wgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_wtrsm('L','U','N','N', nr,nr,cone, a,lda, v,ldv)
+                    else
+                       ! .. r1 is well conditioned, but non-square. adjoint of r2
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_wgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_wtrsm('L','U','C','N',nr,nr,cone,cwork(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                       call stdlib_wlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_wlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_wlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_wunmqr('L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(&
+                                 2*n+n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^* * v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_wgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_wcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_wdscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_wtrsm('L','U','N','N',nr,nr,cone,cwork(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_wlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_wlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_wlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_wunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_wgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_wgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_wgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_wlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_wlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_wlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_wunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_wunmlq( 'L', 'C', nr, nr, nr, cwork(2*n+1), n,cwork(2*n+n*nr+1), &
+                              u, ldu, cwork(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=qp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_qznrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_wdscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_wlaset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                    if ( nr < n1 ) then
+                       call stdlib_wlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                       call stdlib_wlaset('A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=qp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_qznrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_wdscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_wlacpy( 'U', n, n, a, lda, cwork(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       ctemp = xsc * cwork( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                           ! cwork(n+(q-1)*n+p)=-temp1 * ( cwork(n+(p-1)*n+q) /
+           ! $                                        abs(cwork(n+(p-1)*n+q)) )
+                          cwork(n+(q-1)*n+p)=-ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'L',n-1,n-1,czero,czero,cwork(n+2),n )
+                 end if
+                 call stdlib_wgesvj( 'U', 'U', 'N', n, n, cwork(n+1), n, sva,n, u, ldu, cwork(n+&
+                           n*n+1), lwork-n-n*n, rwork, lrwork,info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_wcopy( n, cwork(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_wdscal( n, sva(p), cwork(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_wtrsm( 'L', 'U', 'N', 'N', n, n,cone, a, lda, cwork(n+1), n )
+                 do p = 1, n
+                    call stdlib_wcopy( n, cwork(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=qp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_qznrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_wdscal( n, xsc,&
+                               v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_wlaset( 'A',  m-n, n, czero, czero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_wlaset('A',n,  n1-n, czero, czero,  u(1,n+1),ldu)
+                       call stdlib_wlaset( 'A',m-n,n1-n, czero, cone,u(n+1,n+1),ldu)
+                    end if
+                 end if
+                 call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 temp1 = sqrt(real(m,KIND=qp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_qznrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_wdscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values, e.g. when the singular values spread from
+              ! the underflow to the overflow threshold.
+              do p = 1, nr
+                 call stdlib_wcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 call stdlib_wlacgv( n-p+1, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=qp)
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                 ctemp
+           ! $                v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_wlaset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+              end if
+              call stdlib_wgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              call stdlib_wlacpy( 'L', n, nr, v, ldv, cwork(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_wcopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+                 call stdlib_wlacgv( nr-p+1, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       ctemp = cmplx(xsc * min(abs(u(p,p)),abs(u(q,q))),zero,KIND=qp)
+                        ! u(p,q) = - temp1 * ( u(q,p) / abs(u(q,p)) )
+                       u(p,q) = - ctemp
+                    end do
+                 end do
+              else
+                 call stdlib_wlaset('U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              end if
+              call stdlib_wgesvj( 'L', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, cwork(2*n+n*nr+1),&
+                         lwork-2*n-n*nr,rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_wlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                 call stdlib_wlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                 call stdlib_wlaset( 'A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_wunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+n*nr+&
+                        nr+1),lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=qp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_qznrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_wdscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_wlaset( 'A',  m-nr, nr, czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_wlaset('A',nr,  n1-nr, czero, czero,  u(1,nr+1),ldu)
+                    call stdlib_wlaset('A',m-nr,n1-nr, czero, cone,u(nr+1,nr+1),ldu)
+                 end if
+              end if
+              call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+                 if ( rowpiv )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^*
+                 do p = 1, n
+                    call stdlib_wswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_qlascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           rwork(1) = uscal2 * scalem
+           rwork(2) = uscal1
+           if ( errest ) rwork(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              rwork(4) = condr1
+              rwork(5) = condr2
+           end if
+           if ( l2tran ) then
+              rwork(6) = entra
+              rwork(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           if ( transp ) then
+               iwork(4) =  1
+           else
+               iwork(4) = -1
+           end if
+           return
+     end subroutine stdlib_wgejsv
+
+     !> ZGELQ: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_wgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'ZGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'ZGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if ( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+                 lminws = .true.
+                 mb = 1
+                 nb = n
+             end if
+             if( lwork<lwopt ) then
+                 lminws = .true.
+                 mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_wgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_wlaswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_wgelq
+
+     !> ZGELQ2: computes an LQ factorization of a complex m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_wgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+              alpha = a( i, i )
+              call stdlib_wlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 a( i, i ) = cone
+                 call stdlib_wlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+              end if
+              a( i, i ) = alpha
+              call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+           end do
+           return
+     end subroutine stdlib_wgelq2
+
+     !> ZGELQF: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_wgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_wgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_wlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_wlarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_wgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgelqf
+
+     !> ZGELQT: computes a blocked LQ factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_wgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. (mb>min(m,n) .and. min(m,n)>0 ))then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_wgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_wlarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_wgelqt
+
+     !> ZGELQT3: recursively computes a LQ factorization of a complex M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_wgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_wlarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+              t(1,1)=conjg(t(1,1))
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_wgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) =  a(j1:m,1:n) q1^h [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', 'C', 'U', m2, m1, cone,a, lda, t( i1, 1 ), ldt )
+                        
+              call stdlib_wgemm( 'N', 'C', m2, m1, n-m1, cone, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        cone, t( i1, 1 ), ldt)
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', m2, m1, cone,t, ldt, t( i1, 1 ), ldt )
+                        
+              call stdlib_wgemm( 'N', 'N', m2, n-m1, m1, -cone, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        cone, a( i1, i1 ), lda )
+              call stdlib_wtrmm( 'R', 'U', 'N', 'U', m2, m1 , cone,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )= czero
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_wgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', 'C', 'U', m1, m2, cone,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_wgemm( 'N', 'C', m1, m2, n-m, cone, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, i1 ), ldt )
+              call stdlib_wtrmm( 'L', 'U', 'N', 'N', m1, m2, -cone, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', m1, m2, cone,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_wgelqt3
+
+     !> ZGELS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, or its conjugate-transpose, using a QR
+     !> or LQ factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**H * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**H * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_wgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(qp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(qp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn+max( mn, nrhs ) ) .and. .not.lquery )then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMQR', 'LC', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMLQ', 'LC', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn+max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_wlaset( 'FULL', max( m, n ), nrhs, czero, czero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_wlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_wlange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_wgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+                 call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, n, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_wtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**h) * b(1:n,1:nrhs)
+                 call stdlib_wtrtrs( 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT',n, nrhs, a, lda, b,&
+                            ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_wunmqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_wgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_wtrtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**h * b(1:m,1:nrhs)
+                 call stdlib_wunmlq( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, m, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**h * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_wunmlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**h) * b(1:m,1:nrhs)
+                 call stdlib_wtrtrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',m, nrhs, a, lda, &
+                           b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=qp)
+           return
+     end subroutine stdlib_wgels
+
+     !> ZGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_wgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), s(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, lrwork, &
+                     maxmn, maxwrk, minmn, minwrk, mm, mnthr, nlvl, nrwork, nwork, smlsiz
+           real(qp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: int,log,max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              liwork = 1
+              lrwork = 1
+              if( minmn>0 ) then
+                 smlsiz = stdlib_ilaenv( 9, 'ZGELSD', ' ', 0, 0, 0, 0 )
+                 mnthr = stdlib_ilaenv( 6, 'ZGELSD', ' ', m, n, nrhs, -1 )
+                 nlvl = max( int( log( real( minmn,KIND=qp) / real( smlsiz + 1,KIND=qp) ) /log( &
+                           two ),KIND=ilp) + 1, 0 )
+                 liwork = 3*minmn*nlvl + 11*minmn
+                 mm = m
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns.
+                    mm = n
+                    maxwrk = max( maxwrk, n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n,-1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, nrhs*stdlib_ilaenv( 1, 'ZUNMQR', 'LC', m,nrhs, n, -1 ) )
+                              
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined.
+                    lrwork = 10*n + 2*n*smlsiz + 8*n*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    maxwrk = max( maxwrk, 2*n + ( mm + n )*stdlib_ilaenv( 1,'ZGEBRD', ' ', mm, n, &
+                              -1, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + nrhs*stdlib_ilaenv( 1, 'ZUNMBR','QLC', mm, nrhs, &
+                              n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'ZUNMBR', 'PLN', n, &
+                              nrhs, n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + n*nrhs )
+                    minwrk = max( 2*n + mm, 2*n + n*nrhs )
+                 end if
+                 if( n>m ) then
+                    lrwork = 10*m + 2*m*smlsiz + 8*m*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                                 ! than rows.
+                       maxwrk = m + m*stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1,-1 )
+                       maxwrk = max( maxwrk, m*m + 4*m + 2*m*stdlib_ilaenv( 1,'ZGEBRD', ' ', m, m,&
+                                  -1, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + nrhs*stdlib_ilaenv( 1,'ZUNMBR', 'QLC', m,&
+                                  nrhs, m, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + ( m - 1 )*stdlib_ilaenv( 1,'ZUNMLQ', &
+                                 'LC', n, nrhs, m, -1 ) )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m*m + 4*m + m*nrhs )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                       maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                    else
+                       ! path 2 - underdetermined.
+                       maxwrk = 2*m + ( n + m )*stdlib_ilaenv( 1, 'ZGEBRD', ' ', m,n, -1, -1 )
+                                 
+                       maxwrk = max( maxwrk, 2*m + nrhs*stdlib_ilaenv( 1, 'ZUNMBR','QLC', m, nrhs,&
+                                  m, -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*stdlib_ilaenv( 1, 'ZUNMBR','PLN', n, nrhs, m,&
+                                  -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*nrhs )
+                    end if
+                    minwrk = max( 2*m + n, 2*m + m*nrhs )
+                 end if
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              rwork( 1 ) = lrwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_qlamch( 'P' )
+           sfmin = stdlib_qlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_wlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_wlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_qlaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_wlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_wlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_wlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure b(m+1:n,:) = 0
+           if( m<n )call stdlib_wlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need n, prefer n*nb)
+                 call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need nrhs, prefer nrhs*nb)
+                 call stdlib_wunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_wlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                 end if
+              end if
+              itauq = 1
+              itaup = itauq + n
+              nwork = itaup + n
+              ie = 1
+              nrwork = ie + n
+              ! bidiagonalize r in a.
+              ! (rworkspace: need n)
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              call stdlib_wgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              call stdlib_wunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_wlalsd( 'U', smlsiz, n, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_wunmbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              call stdlib_wgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_wlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_wlaset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize l in work(il).
+              ! (rworkspace: need m)
+              ! (cworkspace: need m*m+4*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_wgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (cworkspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_wlalsd( 'U', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_wunmbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_wlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (cworkspace: need nrhs, prefer nrhs*nb)
+              call stdlib_wunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              itauq = 1
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize a.
+              ! (rworkspace: need m)
+              ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+              call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_wlalsd( 'L', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_wunmbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           rwork( 1 ) = lrwork
+           return
+     end subroutine stdlib_wgelsd
+
+     !> ZGELSS: computes the minimum norm solution to a complex linear
+     !> least squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_wgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), s(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bl, chunk, i, iascl, ibscl, ie, il, irwork, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_wgeqrf, lwork_wunmqr, lwork_wgebrd, lwork_wunmbr, lwork_wungbr, &
+                     lwork_wunmlq, lwork_wgelqf
+           real(qp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           complex(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace refers
+             ! to real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'ZGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_wgeqrf
+                    call stdlib_wgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_wgeqrf = real( dum(1),KIND=qp)
+                    ! compute space needed for stdlib_wunmqr
+                    call stdlib_wunmqr( 'L', 'C', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_wunmqr = real( dum(1),KIND=qp)
+                    mm = n
+                    maxwrk = max( maxwrk, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', m,n, -1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, n + nrhs*stdlib_ilaenv( 1, 'ZUNMQR', 'LC',m, nrhs, n, -&
+                              1 ) )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute space needed for stdlib_wgebrd
+                    call stdlib_wgebrd( mm, n, a, lda, s, s, dum(1), dum(1), dum(1),-1, info )
+                              
+                    lwork_wgebrd = real( dum(1),KIND=qp)
+                    ! compute space needed for stdlib_wunmbr
+                    call stdlib_wunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_wunmbr = real( dum(1),KIND=qp)
+                    ! compute space needed for stdlib_wungbr
+                    call stdlib_wungbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_wungbr = real( dum(1),KIND=qp)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 2*n + lwork_wgebrd )
+                    maxwrk = max( maxwrk, 2*n + lwork_wunmbr )
+                    maxwrk = max( maxwrk, 2*n + lwork_wungbr )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = 2*n + max( nrhs, m )
+                 end if
+                 if( n>m ) then
+                    minwrk = 2*m + max( nrhs, n )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_wgelqf
+                       call stdlib_wgelqf( m, n, a, lda, dum(1), dum(1),-1, info )
+                       lwork_wgelqf = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wgebrd
+                       call stdlib_wgebrd( m, m, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_wgebrd = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wunmbr
+                       call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_wunmbr = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wungbr
+                       call stdlib_wungbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_wungbr = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wunmlq
+                       call stdlib_wunmlq( 'L', 'C', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_wunmlq = real( dum(1),KIND=qp)
+                       ! compute total workspace needed
+                       maxwrk = m + lwork_wgelqf
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_wgebrd )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_wunmbr )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_wungbr )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_wunmlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_wgebrd
+                       call stdlib_wgebrd( m, n, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_wgebrd = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wunmbr
+                       call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_wunmbr = real( dum(1),KIND=qp)
+                       ! compute space needed for stdlib_wungbr
+                       call stdlib_wungbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_wungbr = real( dum(1),KIND=qp)
+                       maxwrk = 2*m + lwork_wgebrd
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr )
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_qlamch( 'P' )
+           sfmin = stdlib_qlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_wlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_qlaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (cworkspace: need 2*n, prefer n+n*nb)
+                 ! (rworkspace: none)
+                 call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (cworkspace: need n+nrhs, prefer n+nrhs*nb)
+                 ! (rworkspace: none)
+                 call stdlib_wunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_wlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+              end if
+              ie = 1
+              itauq = 1
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_wgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_wunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_wbdsqr( 'U', n, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_wdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_wlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_wgemm( 'C', 'N', n, nrhs, n, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_wlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_wgemm( 'C', 'N', n, bl, n, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_wlacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_wgemv( 'C', n, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_wcopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=3*m+m*m+max( m, nrhs, n-2*m ) )then
+              ! underdetermined case, m much less than n
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=3*m+m*lda+max( m, nrhs, n-2*m ) )ldwork = lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_wgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_wlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_wlaset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              ie = 1
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (cworkspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+              ! (rworkspace: need m)
+              call stdlib_wgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (cworkspace: need m*m+3*m+nrhs, prefer m*m+3*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (cworkspace: need m*m+4*m-1, prefer m*m+3*m+(m-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_wungbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration, computing right singular
+              ! vectors of l in work(il) and multiplying b by transpose of
+              ! left singular vectors
+              ! (cworkspace: need m*m)
+              ! (rworkspace: need bdspac)
+              call stdlib_wbdsqr( 'U', m, m, 0, nrhs, s, rwork( ie ), work( il ),ldwork, a, lda, &
+                        b, ldb, rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_wdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_wlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = il + m*ldwork
+              ! multiply b by right singular vectors of l in work(il)
+              ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_wgemm( 'C', 'N', m, nrhs, m, cone, work( il ), ldwork,b, ldb, czero, &
+                           work( iwork ), ldb )
+                 call stdlib_wlacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_wgemm( 'C', 'N', m, bl, m, cone, work( il ), ldwork,b( 1, i ), &
+                              ldb, czero, work( iwork ), m )
+                    call stdlib_wlacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_wgemv( 'C', m, m, cone, work( il ), ldwork, b( 1, 1 ),1, czero, work(&
+                            iwork ), 1 )
+                 call stdlib_wcopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_wlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (cworkspace: need m+nrhs, prefer m+nhrs*nb)
+              ! (rworkspace: none)
+              call stdlib_wunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = 1
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (cworkspace: need 3*m, prefer 2*m+(m+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_wunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_wungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_wbdsqr( 'L', m, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_wdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_wlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_wgemm( 'C', 'N', n, nrhs, m, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_wlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_wgemm( 'C', 'N', n, bl, m, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_wlacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_wgemv( 'C', m, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_wcopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgelss
+
+     !> ZGELSY: computes the minimum-norm solution to a complex linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by unitary transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**H [ inv(T11)*Q1**H*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+
+     subroutine stdlib_wgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkopt, mn, nb, nb1, nb2, nb3, &
+                     nb4
+           real(qp) :: anrm, bignum, bnrm, smax, smaxpr, smin, sminpr, smlnum, wsize
+           complex(qp) :: c1, c2, s1, s2
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,max,min
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', m, n, nrhs, -1 )
+           nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, nrhs, -1 )
+           nb = max( nb1, nb2, nb3, nb4 )
+           lwkopt = max( 1, mn+2*n+nb*( n+1 ), 2*mn+nb*nrhs )
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           else if( lwork<( mn+max( 2*mn, n+1, mn+nrhs ) ) .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_wlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_wlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_wgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, rwork, info )
+                     
+           wsize = mn + real( work( mn+1 ),KIND=qp)
+           ! complex workspace: mn+nb*(n+1). real workspace 2*n.
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = cone
+           work( ismax ) = cone
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_wlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_wlaic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_wlaic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! complex workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_wtzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! complex workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+           call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, mn, a, lda,work( 1 ), b, &
+                     ldb, work( 2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+real( work( 2*mn+1 ),KIND=qp) )
+           ! complex workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_wtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, cone, a, &
+                     lda, b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = czero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**h * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_wunmrz( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, rank,n-rank, a, lda, &
+                        work( mn+1 ), b, ldb,work( 2*mn+1 ), lwork-2*mn, info )
+           end if
+           ! complex workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_wcopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! complex workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_wlascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_wlascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_wlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_wlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wgelsy
+
+     !> ZGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide
+     !> LQ factorization (ZGELQ)
+
+     pure subroutine stdlib_wgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), t(*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_wgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_wlamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_wgemlq
+
+     !> ZGEMLQT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':   Q**H C            C Q**H
+     !> where Q is a complex unitary matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by ZGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_wgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           complex(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_wlarfb( 'L', 'C', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_wlarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_wlarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_wlarfb( 'R', 'C', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_wgemlqt
+
+     !> ZGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (ZGEQR)
+
+     pure subroutine stdlib_wgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), t(*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_wgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_wlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_wgemqr
+
+     !> ZGEMQRT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':    Q**H C            C Q**H
+     !> where Q is a complex orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by ZGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_wgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           complex(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_wlarfb( 'L', 'C', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_wlarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_wlarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_wlarfb( 'R', 'C', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_wgemqrt
+
+     !> ZGEQL2: computes a QL factorization of a complex m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_wgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_wlarfg( m-k+i, alpha, a( 1, n-k+i ), 1, tau( i ) )
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i-1) from the left
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_wlarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1,conjg( tau( i ) ), a, &
+                        lda, work )
+              a( m-k+i, n-k+i ) = alpha
+           end do
+           return
+     end subroutine stdlib_wgeql2
+
+     !> ZGEQLF: computes a QL factorization of a complex M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_wgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_wgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_wlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_wlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'BACKWARD','COLUMNWISE', m-&
+                    k+i+ib-1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_wgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgeqlf
+
+     !> ZGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_wgeqp3( m, n, a, lda, jpvt, tau, work, lwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = n + 1
+                 nb = stdlib_ilaenv( inb, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = ( n + 1 )*nb
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=qp)
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_wswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_wgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_wgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_wunm2r( 'left', 'conjugate transpose', m, n-na,
+      ! cc  $                   na, a, lda, tau, a( 1, na+1 ), lda, work,
+      ! cc  $                   info )
+                 call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, n-na, na, a,lda, tau, a( 1,&
+                            na+1 ), lda, work, lwork,info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'ZGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'ZGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = lwork / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'ZGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 rwork( j ) = stdlib_qznrm2( sm, a( nfxd+1, j ), 1 )
+                 rwork( n+j ) = rwork( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_wlaqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , rwork( j ),rwork( n+j ), work( 1 ), work( jb+1 ),n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_wlaqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         rwork( j ), rwork( n+j ), work( 1 ) )
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wgeqp3
+
+     !> ZGEQR: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_wgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min ( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'ZGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'ZGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_wgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_wlatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_wgeqr
+
+     !> ZGEQR2: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_wgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_wlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_wlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_wgeqr2
+
+     !> ZGEQR2P: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_wgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_wlarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_wlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_wgeqr2p
+
+     !> ZGEQRF: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_wgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_wgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_wlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_wlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_wgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgeqrf
+
+     !> ZGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_wgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_wgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_wlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_wlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_wgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgeqrfp
+
+     !> ZGEQRT: computes a blocked QR factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_wgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_wgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_wgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**h to a(i:m,i+ib:n) from the left
+                 call stdlib_wlarfb( 'L', 'C', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_wgeqrt
+
+     !> ZGEQRT2: computes a QR factorization of a complex M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_wgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_wlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = cone
+                 ! w(1:n-i) := a(i:m,i+1:n)^h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_wgemv( 'C',m-i+1, n-i, cone, a( i, i+1 ), lda,a( i, i ), 1, czero, t(&
+                            1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)^h
+                 alpha = -conjg(t( i, 1 ))
+                 call stdlib_wgerc( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), &
+                           lda )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = cone
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**h * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_wgemv( 'C', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, czero, t( 1,&
+                         i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_wtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = czero
+           end do
+     end subroutine stdlib_wgeqrt2
+
+     !> ZGEQRT3: recursively computes a QR factorization of a complex M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_wgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_wlarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_wgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1^h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'L', 'C', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              call stdlib_wgemm( 'C', 'N', n1, n2, m-n1, cone, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt)
+              call stdlib_wtrmm( 'L', 'U', 'C', 'N', n1, n2, cone,t, ldt, t( 1, j1 ), ldt )
+                        
+              call stdlib_wgemm( 'N', 'N', m-n1, n2, n1, -cone, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        cone, a( j1, j1 ), lda )
+              call stdlib_wtrmm( 'L', 'L', 'N', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_wgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1^h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = conjg(a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'L', 'N', 'U', n1, n2, cone,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_wgemm( 'C', 'N', n1, n2, m-n, cone, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt )
+              call stdlib_wtrmm( 'L', 'U', 'N', 'N', n1, n2, -cone, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', n1, n2, cone,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_wgeqrt3
+
+     !> ZGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_wgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_wgemv( trans, n, n, -cone, a, lda, x( 1, j ), 1, cone, work,1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wgetrs( trans, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wgetrs( transt, n, 1, af, ldaf, ipiv, work, n,info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wgetrs( transn, n, 1, af, ldaf, ipiv, work, n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wgerfs
+
+     !> ZGERQ2: computes an RQ factorization of a complex m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_wgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_wlacgv( n-k+i, a( m-k+i, 1 ), lda )
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_wlarfg( n-k+i, alpha, a( m-k+i, 1 ), lda, tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_wlarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = alpha
+              call stdlib_wlacgv( n-k+i-1, a( m-k+i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_wgerq2
+
+     !> ZGERQF: computes an RQ factorization of a complex M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_wgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_wgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_wlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_wlarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_wgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgerqf
+
+     !> ZGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by ZGETC2.
+
+     pure subroutine stdlib_wgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: bignum, eps, smlnum
+           complex(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx
+           ! Executable Statements 
+           ! set constant to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_wlaswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_iwamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = cmplx( one / two, zero,KIND=qp) / abs( rhs( i ) )
+              call stdlib_wscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*real( temp,KIND=qp)
+           end if
+           do i = n, 1, -1
+              temp = cmplx( one, zero,KIND=qp) / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_wlaswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_wgesc2
+
+     !> ZGESDD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors, by using divide-and-conquer method. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**H, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_wgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), s(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: blk, chunk, i, ie, ierr, il, ir, iru, irvt, iscl, itau, itaup, itauq, &
+           iu, ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr1, mnthr2, nrwork,&
+                      nwork, wrkbl
+           integer(ilp) :: lwork_wgebrd_mn, lwork_wgebrd_mm, lwork_wgebrd_nn, lwork_wgelqf_mn, &
+           lwork_wgeqrf_mn, lwork_wungbr_p_mn, lwork_wungbr_p_nn, lwork_wungbr_q_mn, &
+           lwork_wungbr_q_mm, lwork_wunglq_mn, lwork_wunglq_nn, lwork_wungqr_mm, lwork_wungqr_mn, &
+           lwork_wunmbr_prc_mm, lwork_wunmbr_qln_mm, lwork_wunmbr_prc_mn, lwork_wunmbr_qln_mn, &
+                     lwork_wunmbr_prc_nn, lwork_wunmbr_qln_nn
+           real(qp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dum(1)
+           complex(qp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           mnthr1 = int( minmn*17.0_qp / 9.0_qp,KIND=ilp)
+           mnthr2 = int( minmn*5.0_qp / 3.0_qp,KIND=ilp)
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           minwrk = 1
+           maxwrk = 1
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_qbdsdc,KIND=qp) is
+                 ! bdspac = 3*n*n + 4*n for singular values and vectors;
+                 ! bdspac = 4*n         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_wgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_wgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wgebrd( n, n, cdum(1), n, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_wgebrd_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wgeqrf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_wgeqrf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'P', n, n, n, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'Q', m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_q_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungqr( m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungqr_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wungqr( m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungqr_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'Q', 'L', 'N', m, m, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'Q', 'L', 'N', m, n, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_qln_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'Q', 'L', 'N', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_qln_nn = int( cdum(1),KIND=ilp)
+                 if( m>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       maxwrk = n + lwork_wgeqrf_mn
+                       maxwrk = max( maxwrk, 2*n + lwork_wgebrd_nn )
+                       minwrk = 3*n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_wgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_wungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_prc_nn )
+                       maxwrk = m*n + n*n + wrkbl
+                       minwrk = 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_wgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_wungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_wgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_wungqr_mm )
+                       wrkbl = max( wrkbl, 2*n + lwork_wgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_wunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + max( 3*n, n + m )
+                    end if
+                 else if( m>=mnthr2 ) then
+                    ! path 5 (m >> n, but not as much as mnthr1)
+                    maxwrk = 2*n + lwork_wgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 5o (m >> n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_q_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 5s (m >> n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_q_mn )
+                    else if( wntqa ) then
+                       ! path 5a (m >> n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_wungbr_q_mm )
+                    end if
+                 else
+                    ! path 6 (m >= n, but not much larger)
+                    maxwrk = 2*n + lwork_wgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 6o (m >= n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_prc_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_qln_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 6s (m >= n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_qln_mn )
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_prc_nn )
+                    else if( wntqa ) then
+                       ! path 6a (m >= n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*n + lwork_wunmbr_prc_nn )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_qbdsdc,KIND=qp) is
+                 ! bdspac = 3*m*m + 4*m for singular values and vectors;
+                 ! bdspac = 4*m         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_wgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_wgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wgebrd( m, m, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_wgebrd_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wgelqf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_wgelqf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'P', m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_p_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'P', n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wunglq( m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_wunglq_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunglq( n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_wunglq_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'P', 'R', 'C', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_prc_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'P', 'R', 'C', m, n, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_prc_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'P', 'R', 'C', n, n, m, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_wunmbr( 'Q', 'L', 'N', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_wunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       maxwrk = m + lwork_wgelqf_mn
+                       maxwrk = max( maxwrk, 2*m + lwork_wgebrd_mm )
+                       minwrk = 3*m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_wgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_wunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_wgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_prc_mm )
+                       maxwrk = m*n + m*m + wrkbl
+                       minwrk = 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_wgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_wunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_wgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_wgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_wunglq_nn )
+                       wrkbl = max( wrkbl, 2*m + lwork_wgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_wunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + max( 3*m, m + n )
+                    end if
+                 else if( n>=mnthr2 ) then
+                    ! path 5t (n >> m, but not as much as mnthr1)
+                    maxwrk = 2*m + lwork_wgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 5to (n >> m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_p_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 5ts (n >> m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_p_mn )
+                    else if( wntqa ) then
+                       ! path 5ta (n >> m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wungbr_p_nn )
+                    end if
+                 else
+                    ! path 6t (n > m, but not much larger)
+                    maxwrk = 2*m + lwork_wgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 6to (n > m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_prc_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 6ts (n > m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_prc_mn )
+                    else if( wntqa ) then
+                       ! path 6ta (n > m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_wunmbr_prc_nn )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+           end if
+           if( info==0 ) then
+              work( 1 ) = stdlib_qroundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not. lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = sqrt( stdlib_qlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, dum )
+           if( stdlib_qisnan( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n [tau] + n    [work]
+                    ! cworkspace: prefer n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out below r
+                    call stdlib_wlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + n
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz='o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    ir = iu + ldwrku*n
+                    if( lwork >= m*n + n*n + 3*n ) then
+                       ! work(ir) is m by n
+                       ldwrkr = m
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work( ir ), zeroing out below it
+                    call stdlib_wlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_wlaset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_wgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of r in work(iru) and computing right singular vectors
+                    ! of r in work(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by the right singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! cworkspace: need   n*n [u] + n*n [r]
+                    ! cworkspace: prefer n*n [u] + m*n [r]
+                    ! rworkspace: need   0
+                    do i = 1, m, ldwrkr
+                       chunk = min( m-i+1, ldwrkr )
+                       call stdlib_wgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( iu ), &
+                                 ldwrku, czero,work( ir ), ldwrkr )
+                       call stdlib_wlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_wlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_wlaset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_wgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! cworkspace: need   n*n [r]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_wgemm( 'N', 'N', m, n, n, cone, a, lda, work( ir ),ldwrkr, czero, &
+                              u, ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! cworkspace: need   n*n [u] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! cworkspace: need   n*n [u] + n [tau] + m    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ! produce r in a, zeroing out below it
+                    call stdlib_wlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix work(iu)
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! cworkspace: need   n*n [u]
+                    ! rworkspace: need   0
+                    call stdlib_wgemm( 'N', 'N', m, n, n, cone, u, ldu, work( iu ),ldwrku, czero, &
+                              a, lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else if( m>=mnthr2 ) then
+                 ! mnthr2 <= m < mnthr1
+                 ! path 5 (m >> n, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_wungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >> n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', n, s, rwork( ie ), dum, 1,dum,1,dum, idum, &
+                              rwork( nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! path 5o (m >> n, jobz='o')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! generate q in a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work(iu) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=qp) by p**h in vt,
+                    ! storing the result in work(iu), copying to vt
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_wlarcm( n, n, rwork( irvt ), n, vt, ldvt,work( iu ), ldwrku, &
+                              rwork( nrwork ) )
+                    call stdlib_wlacpy( 'F', n, n, work( iu ), ldwrku, vt, ldvt )
+                    ! multiply q in a by realmatrix rwork(iru,KIND=qp), storing the
+                    ! result in work(iu), copying to a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                    ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    do i = 1, m, ldwrku
+                       chunk = min( m-i+1, ldwrku )
+                       call stdlib_wlacrm( chunk, n, a( i, 1 ), lda, rwork( iru ),n, work( iu ), &
+                                 ldwrku, rwork( nrwork ) )
+                       call stdlib_wlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5s (m >> n, jobz='s')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, n, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=qp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_wlarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=qp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_wlacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                 else
+                    ! path 5a (m >> n, jobz='a')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=qp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_wlarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=qp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_wlacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr2
+                 ! path 6 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ! use stdlib_wunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6n (m >= n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work( iu ) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! path 6o (m >= n, jobz='o')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! path 6o-fast
+                       ! copy realmatrix rwork(iru,KIND=qp) to complex matrix work(iu)
+                       ! overwrite work(iu) by left singular vectors of a, copying
+                       ! to a
+                       ! cworkspace: need   2*n [tauq, taup] + m*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u] + n*nb [work]
+                       ! rworkspace: need   n [e] + n*n [ru]
+                       call stdlib_wlaset( 'F', m, n, czero, czero, work( iu ),ldwrku )
+                       call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                       call stdlib_wunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_wlacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 6o-slow
+                       ! generate q in a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_wungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=qp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                       ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                       ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                       nrwork = irvt
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_wlacrm( chunk, n, a( i, 1 ), lda,rwork( iru ), n, work( iu )&
+                                    , ldwrku,rwork( nrwork ) )
+                          call stdlib_wlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_wlaset( 'F', m, n, czero, czero, u, ldu )
+                    call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_qbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! set the right corner of u to identity matrix
+                    call stdlib_wlaset( 'F', m, m, czero, czero, u, ldu )
+                    if( m>n ) then
+                       call stdlib_wlaset( 'F', m-n, m-n, czero, cone,u( n+1, n+1 ), ldu )
+                    end if
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_wlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_wlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m [tau] + m    [work]
+                    ! cworkspace: prefer m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_wlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + m
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_qbdsdc( 'U', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ldwkvt = m
+                    ! work(ivt) is m by m
+                    il = ivt + ldwkvt*m
+                    if( lwork >= m*n + m*m + 3*m ) then
+                       ! work(il) m by n
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ! work(il) is m by chunk
+                       ldwrkl = m
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    itau = il + ldwrkl*chunk
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_wlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_wgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_qbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by the right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(il) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! cworkspace: need   m*m [vt] + m*m [l]
+                    ! cworkspace: prefer m*m [vt] + m*n [l]
+                    ! rworkspace: need   0
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_wgemm( 'N', 'N', m, blk, m, cone, work( ivt ), m,a( 1, i ), &
+                                 lda, czero, work( il ),ldwrkl )
+                       call stdlib_wlacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_wlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_wgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_qbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy vt to work(il), multiply right singular vectors of l
+                    ! in work(il) by q in a, storing result in vt
+                    ! cworkspace: need   m*m [l]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( il ), ldwrkl,a, lda, czero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! produce l in a, zeroing out above it
+                    call stdlib_wlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_qbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! cworkspace: need   m*m [vt]
+                    ! rworkspace: need   0
+                    call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( ivt ), ldwkvt,vt, ldvt, &
+                              czero, a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else if( n>=mnthr2 ) then
+                 ! mnthr2 <= n < mnthr1
+                 ! path 5t (n >> m, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_wungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n >> m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_qbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    ivt = nwork
+                    ! path 5to (n >> m, jobz='o')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! generate p**h in a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ldwkvt = m
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       nwork = ivt + ldwkvt*n
+                       chunk = n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(irvt,KIND=qp)
+                    ! storing the result in work(ivt), copying to u
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_wlacrm( m, m, u, ldu, rwork( iru ), m, work( ivt ),ldwkvt, rwork( &
+                              nrwork ) )
+                    call stdlib_wlacpy( 'F', m, m, work( ivt ), ldwkvt, u, ldu )
+                    ! multiply rwork(irvt) by p**h in a, storing the
+                    ! result in work(ivt), copying to a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                    ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_wlarcm( m, blk, rwork( irvt ), m, a( 1, i ), lda,work( ivt ), &
+                                 ldwkvt, rwork( nrwork ) )
+                       call stdlib_wlacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5ts (n >> m, jobz='s')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', m, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=qp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_wlacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=qp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_wlarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 else
+                    ! path 5ta (n >> m, jobz='a')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', n, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=qp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_wlacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=qp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_wlarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr2
+                 ! path 6t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ! use stdlib_wunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6tn (n > m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_qbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 6to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_wlaset( 'F', m, n, czero, czero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*m ) then
+                       ! path 6to-fast
+                       ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix work(ivt)
+                       ! overwrite work(ivt) by right singular vectors of a,
+                       ! copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*n [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt] + m*nb [work]
+                       ! rworkspace: need   m [e] + m*m [rvt]
+                       call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                                 
+                       call stdlib_wunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_wlacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 6to-slow
+                       ! generate p**h in a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_wungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=qp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                       ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                       ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                       nrwork = iru
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_wlarcm( m, blk, rwork( irvt ), m, a( 1, i ),lda, work( ivt )&
+                                    , ldwkvt,rwork( nrwork ) )
+                          call stdlib_wlacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_wlaset( 'F', m, n, czero, czero, vt, ldvt )
+                    call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_qbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=qp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_wlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_wunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! set all of vt to identity matrix
+                    call stdlib_wlaset( 'F', n, n, czero, cone, vt, ldvt )
+                    ! copy realmatrix rwork(irvt,KIND=qp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_wlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_wunmbr( 'P', 'R', 'C', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_qroundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_wgesdd
+
+     !> ZGESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_wgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_wgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_wgesv
+
+     !> ZGESVD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**H, not V.
+
+     subroutine stdlib_wgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), s(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: blk, chunk, i, ie, ierr, ir, irwork, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_wgeqrf, lwork_wungqr_n, lwork_wungqr_m, lwork_wgebrd, &
+                     lwork_wungbr_p, lwork_wungbr_q, lwork_wgelqf, lwork_wunglq_n, lwork_wunglq_m
+           real(qp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           complex(qp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! space needed for stdlib_wbdsqr is bdspac = 5*n
+                 mnthr = stdlib_ilaenv( 6, 'ZGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_wgeqrf
+                 call stdlib_wgeqrf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_wgeqrf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wungqr
+                 call stdlib_wungqr( m, n, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_wungqr_n = int( cdum(1),KIND=ilp)
+                 call stdlib_wungqr( m, m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_wungqr_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wgebrd
+                 call stdlib_wgebrd( n, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_wgebrd = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wungbr
+                 call stdlib_wungbr( 'P', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_wungbr_p = int( cdum(1),KIND=ilp)
+                 call stdlib_wungbr( 'Q', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_wungbr_q = int( cdum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_wgeqrf
+                       maxwrk = max( maxwrk, 2*n+lwork_wgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 2*n+lwork_wungbr_p )
+                       minwrk = 3*n
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_p )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_wgeqrf
+                       wrkbl = max( wrkbl, n+lwork_wungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_wungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_wgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_wgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*n + lwork_wgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_wungbr( 'Q', m, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_wungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_wungbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_wungbr( 'Q', m, m, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_wungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_wungbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                       maxwrk = max( maxwrk, 2*n+lwork_wungbr_p )
+                    end if
+                    minwrk = 2*n + m
+                 end if
+              else if( minmn>0 ) then
+                 ! space needed for stdlib_wbdsqr is bdspac = 5*m
+                 mnthr = stdlib_ilaenv( 6, 'ZGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_wgelqf
+                 call stdlib_wgelqf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_wgelqf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wunglq
+                 call stdlib_wunglq( n, n, m, cdum(1), n, cdum(1), cdum(1), -1,ierr )
+                 lwork_wunglq_n = int( cdum(1),KIND=ilp)
+                 call stdlib_wunglq( m, n, m, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_wunglq_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wgebrd
+                 call stdlib_wgebrd( m, m, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_wgebrd = int( cdum(1),KIND=ilp)
+                  ! compute space needed for stdlib_wungbr p
+                 call stdlib_wungbr( 'P', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_wungbr_p = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_wungbr q
+                 call stdlib_wungbr( 'Q', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_wungbr_q = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_wgelqf
+                       maxwrk = max( maxwrk, 2*m+lwork_wgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 2*m+lwork_wungbr_q )
+                       minwrk = 3*m
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_q )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_wgelqf
+                       wrkbl = max( wrkbl, m+lwork_wunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_wgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_wungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_wgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_wgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*m + lwork_wgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_wungbr p
+                      call stdlib_wungbr( 'P', m, n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_wungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_wungbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_wungbr( 'P', n,  n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_wungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_wungbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 2*m+lwork_wungbr_q )
+                    end if
+                    minwrk = 2*m + n
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = sqrt( stdlib_qlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (cworkspace: need 2*n, prefer n+n*nb)
+                    ! (rworkspace: need 0)
+                    call stdlib_wgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_wlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                    ! (rworkspace: need n)
+                    call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    irwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'U', n, ncvt, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_wlacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_wlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_wgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: need 0)
+                       call stdlib_wungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum, 1,work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_wgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_wlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                       ! (rworkspace: n)
+                       call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (cworkspace: need 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_wlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_wgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_wlacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need n*n+3*n-1, prefer n*n+2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_wgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_wlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_wlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                       ! (rworkspace: n)
+                       call stdlib_wgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, cdum,&
+                                  1, rwork( irwork ),info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, a, lda,work( ir ), ldwrkr, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_wlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_wgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_wlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_wlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, u, ldu,work( ir ), ldwrkr, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_wlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n*n+n+m, prefer 2*n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_wgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_wlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_wlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: need   0)
+                          call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_wlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_wlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_wgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                 ! (rworkspace: need n)
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 2*n+ncu, prefer 2*n+ncu*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wlacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_wungbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_wungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (cworkspace: need 2*m, prefer m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_wlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                    ! (rworkspace: need m)
+                    call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    irwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'U', m, 0, nru, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_wlacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_wlacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_wgebrd( m, m, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                 cdum, 1, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_wgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_wlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'L', m, n, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_wlaset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_wgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_wlacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n))
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_wgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_wlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_wlaset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_wgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_wbdsqr( 'U', m, n, m, 0, s, rwork( ie ), a, lda,u, ldu, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_wgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m*m+m+n, prefer 2*m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_wgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_wlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_wlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_wgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_wlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_wlaset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_wgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_wungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_wbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                 ! (rworkspace: m)
+                 call stdlib_wgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_wungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 2*m+nrvt, prefer 2*m+nrvt*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_wungbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wungbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_wungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_wbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_qlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_qlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wgesvd
+
+     !> ZCGESVDQ computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_wgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,cwork, lcwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lcwork
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: u(ldu,*), v(ldv,*), cwork(*)
+           real(qp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_wgelqf, lwrk_wgesvd, lwrk_wgesvd2, lwrk_wgeqp3, &
+           lwrk_wgeqrf, lwrk_wunmlq, lwrk_wunmqr, lwrk_wunmqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lwunq, lwunq2, lwunlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(qp) :: big, epsln, rtmp, sconda, sfmin
+           complex(qp) :: ctmp
+           ! Local Arrays
+           complex(qp) :: cdummy(1)
+           real(qp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              iminwrk = max( 1, n + m - 1 )
+              rminwrk = max( 2, m, 5*n )
+           else
+              iminwrk = max( 1, n )
+              rminwrk = max( 2, 5*n )
+           end if
+           lquery = (liwork == -1 .or. lcwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lcwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Zgeqp3 Of An M X N Matrix
+              lwqp3 = n+1
+              ! Minimal Workspace Length For Stdlib_Zunmqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lwunq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lwunq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Zpocon Of An N X N Matrix
+              lwcon = 2 * n
+              ! Stdlib_Zgesvd Of An N X N Matrix
+              lwsvd = max( 3 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_wgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                            
+                  lwrk_wgeqp3 = int( cdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_wunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmqr = int( cdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_wunmqr( 'L', 'N', m, m, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_wunmqr = int( cdummy(1),KIND=ilp)
+                  else
+                      lwrk_wunmqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec ) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_wgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                rdummy, ierr )
+                      lwrk_wgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_wgeqp3, n+lwcon, lwrk_wgesvd )
+                      else
+                         optwrk = max( n+lwrk_wgeqp3, lwrk_wgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lwunq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lwunq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_wgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     else
+                        call stdlib_wgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     end if
+                     lwrk_wgesvd = int( cdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_wgeqp3, lwcon, lwrk_wgesvd,lwrk_wunmqr )
+                     else
+                         optwrk = n + max( lwrk_wgeqp3, lwrk_wgesvd,lwrk_wunmqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_wgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      else
+                          call stdlib_wgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      end if
+                      lwrk_wgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_wgeqp3, lwcon, lwrk_wgesvd )
+                      else
+                          optwrk = n + max( lwrk_wgeqp3, lwrk_wgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_wgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_wgesvd
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lwunq2, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_wgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lwunlq, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_wgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_wgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_wgeqp3,lwrk_wgesvd,lwrk_wunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_wgeqrf(n,n/2,u,ldu,cdummy,cdummy,-1,ierr)
+                            lwrk_wgeqrf = int( cdummy(1),KIND=ilp)
+                            call stdlib_wgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, cdummy,&
+                                       -1, rdummy, ierr )
+                            lwrk_wgesvd2 = int( cdummy(1),KIND=ilp)
+                            call stdlib_wunmqr( 'R', 'C', n, n, n/2, u, ldu, cdummy,v, ldv, &
+                                      cdummy, -1, ierr )
+                            lwrk_wunmqr2 = int( cdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_wgeqp3, n/2+lwrk_wgeqrf,n/2+lwrk_wgesvd2, n/2+&
+                                      lwrk_wunmqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_wgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_wgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_wgeqp3,lwrk_wgesvd,lwrk_wunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_wgelqf(n/2,n,u,ldu,cdummy,cdummy,-1,ierr)
+                           lwrk_wgelqf = int( cdummy(1),KIND=ilp)
+                           call stdlib_wgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, cdummy,&
+                                      -1, rdummy, ierr )
+                           lwrk_wgesvd2 = int( cdummy(1),KIND=ilp)
+                           call stdlib_wunmlq( 'R', 'N', n, n, n/2, u, ldu, cdummy,v, ldv, cdummy,&
+                                     -1,ierr )
+                           lwrk_wunmlq = int( cdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_wgeqp3, n/2+lwrk_wgelqf,n/2+lwrk_wgesvd2, n/2+&
+                                     lwrk_wunmlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lcwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_qlamch('O')
+           ascaled = .false.
+           if ( rowprm ) then
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_iwamax(n,a(p,1),lda)) )
+                     ! [[stdlib_wlange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_wlange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = -8
+                         call stdlib_xerbla( 'ZGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_iqamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_qlaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_wlaset('G', m, n, czero, cone, u, ldu)
+                    if ( wntua ) call stdlib_wlaset('G', m, m, czero, cone, u, ldu)
+                    if ( wntva ) call stdlib_wlaset('G', n, n, czero, cone, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_wlaset( 'G', n, 1, czero, czero, cwork, n )
+                        call stdlib_wlaset( 'G', m, n, czero, cone, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=qp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_wlascl('G',0,0,sqrt(real(m,KIND=qp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_wlaswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_wlange( 'M', m, n, a, lda, rwork )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = -8
+                    call stdlib_xerbla( 'ZGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=qp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_wlascl('G',0,0, sqrt(real(m,KIND=qp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_wgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lcwork-n,rwork, ierr )
+                     
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_qlamch('E')
+           sfmin = stdlib_qlamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=qp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_qlamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_wlacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_qznrm2( p, v(1,p), 1 )
+                       call stdlib_wdscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_wpocon( 'U', nr, v, ldv, one, rtmp,cwork, rwork, ierr )
+                                  
+                    else
+                        call stdlib_wpocon( 'U', nr, v, ldv, one, rtmp,cwork(n+1), rwork, ierr )
+                                  
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**h = [a](1:nr,1:n)**h
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**h and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                       a(q,p) = conjg(a(p,q))
+                       if ( q <= nr ) a(p,q) = czero
+                    end do
+                 end do
+                 call stdlib_wgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_wlaset( 'L', nr-1,nr-1, czero,czero, a(2,1), lda )
+                           
+                 call stdlib_wgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_wgesvd to r**h
+                  ! .. copy r**h into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_wlaset( 'U', nr-1,nr-1, czero,czero, u(1,2), ldu )
+                           
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as conjugate transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_wgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Zgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_wlacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_wlaset( 'L', nr-1, nr-1, czero, czero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_wgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_wlaset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_wlaset( 'A',nr,n1-nr,czero,czero,u(1,nr+1), ldu )
+                     call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_wgesvd to r**h
+                  ! .. copy r**h into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_wlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_wgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_wlapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_wlaset('G', n, n-nr, czero, czero, v(1,nr+1), ldv)
+                     call stdlib_wgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     do p = 1, n
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, n
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                     end do
+                     call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Zgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_wlacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_wlaset( 'L', nr-1, nr-1, czero, czero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_wgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_wlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_wlaset('G', n-nr, n, czero,czero, v(nr+1,1), ldv)
+                      call stdlib_wgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                lcwork-n, rwork, info )
+                      call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the adjoint of the matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_wgesvd to r**h [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_wlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite [v], the nr right
+                 ! singular vectors of r**h stored in [u](1:nr,1:nr) as conjugate
+                 ! transposed
+                    call stdlib_wgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       v(p,p) = conjg(v(p,p))
+                       do q = p + 1, nr
+                          ctmp   = conjg(v(q,p))
+                          v(q,p) = conjg(v(p,q))
+                          v(p,q) = ctmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_wlapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_wlaset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_wlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'zgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_wlaset('U',nr-1,nr-1, czero,czero, v(1,2),ldv)
+                                  
+                        call stdlib_wlaset('A',n,n-nr,czero,czero,v(1,nr+1),ldv)
+                        call stdlib_wgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                                  lcwork-n, rwork, info )
+                        do p = 1, n
+                           v(p,p) = conjg(v(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(v(q,p))
+                              v(q,p) = conjg(v(p,q))
+                              v(p,q) = ctmp
+                           end do
+                        end do
+                        call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           u(p,p) = conjg(u(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(u(q,p))
+                              u(q,p) = conjg(u(p,q))
+                              u(p,q) = ctmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_wlaset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_wlaset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_wlaset('A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**h into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_wlaset('U',nr-1,nr-1,czero,czero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_wgeqrf( n, nr, u(1,nr+1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                  lcwork-n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = conjg(u(p,nr+q))
+                            end do
+                        end do
+                       call stdlib_wlaset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                       call stdlib_wgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, cwork(n+nr+&
+                                 1),lcwork-n-nr,rwork, info )
+                       call stdlib_wlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_wlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_wlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_wunmqr('R','C', n, n, nr, u(1,nr+1), ldu,cwork(n+1),v,ldv,&
+                                 cwork(n+nr+1),lcwork-n-nr,ierr)
+                       call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_wlaset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_wlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_wgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_wlacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_wlaset( 'L', nr-1,nr-1, czero,czero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_wgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_wlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_wlaset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_wlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'zgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_wlacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_wlaset('L', nr-1,nr-1, czero,czero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_wlaset('A', n-nr,n, czero,czero, v(nr+1,1),ldv)
+                       call stdlib_wgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                 lcwork-n, rwork, info )
+                       call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the adjoint of the matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_wlaset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_wlaset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_wlaset( 'A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_wlacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_wlaset('L',nr-1,nr-1,czero,czero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_wgelqf( nr, n, u(nr+1,1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                 lcwork-n-nr, ierr )
+                       call stdlib_wlacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_wlaset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                                 
+                       call stdlib_wgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, cwork(n+nr+&
+                                 1), lcwork-n-nr, rwork, info )
+                       call stdlib_wlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_wlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_wlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_wunmlq('R','N',n,n,nr,u(nr+1,1),ldu,cwork(n+1),v, ldv, cwork(n+&
+                                 nr+1),lcwork-n-nr,ierr)
+                       call stdlib_wlapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_wlaset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_wlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_wlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**h or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_wunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_wlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_qlaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_qlascl( 'G',0,0, one,sqrt(real(m,KIND=qp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_wgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_wgesvdq
+
+     !> ZGESVJ: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     pure subroutine stdlib_wgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, cwork, lwork, &
+               rwork, lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, lrwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), v(ldv,*), cwork(lwork)
+           real(qp), intent(inout) :: rwork(lrwork)
+           real(qp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           
+           ! Local Scalars 
+           complex(qp) :: aapq, ompq
+           real(qp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, temp1, &
+                     theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lquery, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,conjg,real,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' ) .or. stdlib_lsame( jobv, 'J' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( rwork( 1 )<=one ) ) then
+              info = -12
+           else if( ( lwork<( m+n ) ) .and. ( .not.lquery ) ) then
+              info = -13
+           else if( ( lrwork<max( n, 6 ) ) .and. ( .not.lquery ) ) then
+              info = -15
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVJ', -info )
+              return
+           else if ( lquery ) then
+              cwork(1) = m + n
+              rwork(1) = max( n, 6 )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^* * a(:,j)| / (||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = rwork( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=qp) )
+              else
+                 ctol = real( m,KIND=qp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_dlamch() works properly on the target machine.)
+           epsln = stdlib_qlamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_qlamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+           big = stdlib_qlamch( 'OVERFLOW' )
+           ! big         = one    / sfmin
+           rootbig = one / rootsfmin
+            ! large = big / sqrt( real( m*n,KIND=qp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=qp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'ZGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_wlaset( 'A', mvl, n, czero, cone, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl = one / sqrt( real( m,KIND=qp)*real( n,KIND=qp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_wlassq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_wlassq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_wlassq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl = one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_wlaset( 'G', m, n, czero, cone, a, lda )
+              rwork( 1 ) = one
+              rwork( 2 ) = zero
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_wlascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              rwork( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 rwork( 2 ) = one
+              else
+                 rwork( 2 ) = zero
+              end if
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=qp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / (aapp*sqrt( real(n,KIND=qp)) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=qp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_qlascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl = temp1*skl
+           if( skl/=one ) then
+              call stdlib_wlascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl = one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           do q = 1, n
+              cwork( q ) = cone
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_wgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_wgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_wgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,cwork( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_wgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,cwork( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_wgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,cwork( n2+1 ), &
+                 sva( n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_wgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,cwork( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_wgsvj0( jobv, m, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_wgsvj1( jobv, m, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_wgsvj0( jobv, n4, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_wgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, cwork( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, cwork( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_wgsvj1( jobv, n2, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+                 call stdlib_wgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,cwork( n2+1 ), sva( n2+1 )&
+                 , mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, ierr )
+                           
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_wswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_wswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = cwork(p)
+                          cwork(p) = cwork(q)
+                          cwork(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+      ! [!]     caveat:
+              ! unfortunately, some blas implementations compute stdlib_qznrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_zdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_qznrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_dcnrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_qznrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_qznrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_wlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_wcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_wdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_wcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_wdotc( m, a(1, p ), 1,cwork(n+1), 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_wcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp, one, m,1, cwork(n+1), &
+                                                lda,ierr )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_waxpy( m, -aapq, cwork(n+1), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_wlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qznrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_wlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_wlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+                                   ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+                                ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_wcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_wdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_wcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_wdotc( m, a( 1, p ), 1,cwork(n+1),  1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_wcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                                   
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -aapq, cwork(n+1),1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_wlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_wcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -conjg(aapq),cwork(n+1), 1, a( 1, &
+                                                   p ), 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_wlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_wlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_wlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=qp) )*tol ) .and. ( real( n,&
+                        KIND=qp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 call stdlib_wswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_wswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n4
+                  ! call stdlib_wdscal( m, one / sva( p ), a( 1, p ), 1 )
+                 call stdlib_wlascl( 'G',0,0, sva(p), one, m, 1, a(1,p), m, ierr )
+              end do
+           end if
+           ! scale the product of jacobi rotations.
+           if( rsvec ) then
+                 do p = 1, n
+                    temp1 = one / stdlib_qznrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_wdscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl ) ) ).or. ( ( skl<one ) .and. ( sva( &
+                     max( n2, 1 ) ) >( sfmin / skl ) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl = one
+           end if
+           rwork( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           rwork( 2 ) = real( n4,KIND=qp)
+           ! n4 is the number of computed nonzero singular values of a.
+           rwork( 3 ) = real( n2,KIND=qp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           rwork( 4 ) = real( i,KIND=qp)
+           ! i is the index of the last sweep before declaring convergence.
+           rwork( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           rwork( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_wgesvj
+
+     !> ZGESVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(inout) :: c(*), r(*)
+           complex(qp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_wgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_wlaqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_wlacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_wgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_wlantr( 'M', 'U', 'N', info, info, af, ldaf,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_wlange( 'M', n, info, a, lda, rwork ) /rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_wlange( norm, n, n, a, lda, rwork )
+           rpvgrw = stdlib_wlantr( 'M', 'U', 'N', n, n, af, ldaf, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_wlange( 'M', n, n, a, lda, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wgecon( norm, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_wgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_wgesvx
+
+     !> ZGETC2: computes an LU factorization, using complete pivoting, of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is a level 1 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(qp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = cmplx( smlnum, zero,KIND=qp)
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_wswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_wswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = cmplx( smin, zero,KIND=qp)
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_wgeru( n-i, n-i, -cmplx( one,KIND=qp), a( i+1, i ), 1,a( i, i+1 ), lda, &
+                        a( i+1, i+1 ), lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = cmplx( smin, zero,KIND=qp)
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_wgetc2
+
+     !> ZGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_iwamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=czero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_wswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_wscal( m-j, cone / a( j, j ), a( j+1, j ), 1 )
+                    else
+                       do i = 1, m-j
+                          a( j+i, j ) = a( j+i, j ) / a( j, j )
+                       end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_wgeru( m-j, n-j, -cone, a( j+1, j ), 1, a( j, j+1 ),lda, a( j+1, j+1 &
+                           ), lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_wgetf2
+
+     !> ZGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_wgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_wgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_wlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_wlaswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wgetrf
+
+     !> ZGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_wgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: sfmin
+           complex(qp) :: temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for cone row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==czero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for cone column case
+              ! compute machine safe minimum
+              sfmin = stdlib_qlamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_iwamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=czero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_wscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_wgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_wlaswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_wtrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_wgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_wgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_wlaswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_wgetrf2
+
+     !> ZGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by ZGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_wgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_wtrtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_wtrtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = czero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_wgemv( 'NO TRANSPOSE', n, n-j, -cone, a( 1, j+1 ),lda, work(&
+                            j+1 ), 1, cone, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = czero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           cone, a( 1, j+jb ), lda,work( j+jb ), ldwork, cone, a( 1, j ), lda )
+                 call stdlib_wtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,cone, work( j &
+                           ), ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_wswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wgetri
+
+     !> ZGETRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by ZGETRF.
+
+     pure subroutine stdlib_wgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_wlaswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,cone, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a**t * x = b  or a**h * x = b.
+              ! solve u**t *x = b or u**h *x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'UPPER', trans, 'NON-UNIT', n, nrhs, cone,a, lda, b, ldb &
+                        )
+              ! solve l**t *x = b, or l**h *x = b overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'LOWER', trans, 'UNIT', n, nrhs, cone, a,lda, b, ldb )
+                        
+              ! apply row interchanges to the solution vectors.
+              call stdlib_wlaswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_wgetrs
+
+     !> ZGETSLS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_wgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(qp) :: anrm, bignum, bnrm, smlnum, dum(1)
+           complex(qp) :: tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'C' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_wgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_wgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_wgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_wgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_wgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_wgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_wgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_wgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=qp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=qp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_wlaset( 'FULL', max( m, n ), nrhs, czero, czero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_qlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', m, n, a, lda, dum )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_wlaset( 'F', maxmn, nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_wlange( 'M', brow, nrhs, b, ldb, dum )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_wlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_wgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_wgemqr( 'L' , 'C', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_wtrtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_wtrtrs( 'U', 'C', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = czero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_wgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_wgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_wtrtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_wgemlq( 'L', 'C', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_wgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_wtrtrs( 'L', 'C', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_wlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_wlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_wlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_wlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=qp)
+           return
+     end subroutine stdlib_wgetsls
+
+     !> ZGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a complex M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in ZGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of ZGEQRT for more details on the format.
+
+     pure subroutine stdlib_wgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,real,cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_wlatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_wungtsqr_row;
+              ! d) diagonal d for stdlib_wunhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=qp) / real( mb1 - n,&
+                           KIND=qp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_wungtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_wlatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_wcopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_wungtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_wunhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_wunhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-cone ) then
+                 do j = i, n
+                    a( i, j ) = -cone * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_wcopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_wgetsqrhrt
+
+     !> ZGGBAK: forms the right or left eigenvectors of a complex generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> ZGGBAL.
+
+     pure subroutine stdlib_wggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(in) :: lscale(*), rscale(*)
+           complex(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max,int
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_wdscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_wdscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_40
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_60
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_80
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_100
+                    call stdlib_wswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_wggbak
+
+     !> ZGGBAL: balances a pair of general complex matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_wggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(qp), intent(out) :: lscale(*), rscale(*), work(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sclfac = 1.0e+1_qp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(qp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log10,max,min,sign
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = 1
+           lscale( 1 ) = 1
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_wswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_wswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_wswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_wswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 if( a( i, j )==czero ) then
+                    ta = zero
+                    go to 210
+                 end if
+                 ta = log10( cabs1( a( i, j ) ) ) / basl
+                 210 continue
+                 if( b( i, j )==czero ) then
+                    tb = zero
+                    go to 220
+                 end if
+                 tb = log10( cabs1( b( i, j ) ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=qp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_qdot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_qdot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_qscal( nr, beta, work( ilo ), 1 )
+           call stdlib_qscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_qaxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_qaxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==czero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==czero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=qp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==czero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==czero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=qp)*work( j ) + sum
+           end do
+           sum = stdlib_qdot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_qdot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_qaxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_qaxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_qlamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_iwamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_iwamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = int(lscale( i ) + sign( half, lscale( i ) ),KIND=ilp)
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_iwamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_iwamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = int(rscale( i ) + sign( half, rscale( i ) ),KIND=ilp)
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_wdscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_wdscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_wdscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_wdscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_wggbal
+
+     !> ZGGES: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> ZGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_wgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr, work,lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_w) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkmin, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, -1 ) )
+              if( ilvsl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_wggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_wgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           call stdlib_whgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: none needed)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_wtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_wggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_wggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_wlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_wlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wgges
+
+     !> ZGGES3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> ZGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_wgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr,work, lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_w) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_wgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1,  n + int( work( 1 ),KIND=ilp) )
+              call stdlib_wunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_wungqr( n, n, n, vsl, ldvsl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              end if
+              call stdlib_wgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              call stdlib_wlaqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alpha, beta, vsl, &
+                        ldvsl, vsr, ldvsr, work, -1,rwork, 0, ierr )
+              lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_wtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+                           ldvsl, vsr, ldvsr, sdim,pvsl, pvsr, dif, work, -1, idum, 1, ierr )
+                 lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_wggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_wgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk, ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_wlaqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_wtgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_wggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_wggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_wlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_wlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wgges3
+
+     !> ZGGESX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the complex Schur form (S,T),
+     !> and, optionally, the left and/or right matrices of Schur vectors (VSL
+     !> and VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**H, (VSL) T (VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if T is
+     !> upper triangular with non-negative diagonal and S is upper
+     !> triangular.
+
+     subroutine stdlib_wggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, alpha,&
+      beta, vsl, ldvsl, vsr,ldvsr, rconde, rcondv, work, lwork, rwork,iwork, liwork, bwork, info )
+                
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rconde(2), rcondv(2), rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_w) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantsb, wantse, &
+                     wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, iright, irows, &
+                     irwrk, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, smlnum
+           ! Local Arrays 
+           real(qp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = 2*n
+                 maxwrk = n*(1 + stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+                 maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, -1 ) ) )
+                           
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) ) &
+                              )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 2
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              else if( liwork<liwmin  .and. .not.lquery) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_wggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_wgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace:    need n)
+           iwrk = itau
+           call stdlib_whgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 40
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              ! (complex workspace: if ijob >= 1, need max(1, 2*sdim*(n-sdim))
+                                  ! otherwise, need 1 )
+              call stdlib_wtgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pl, pr,dif, work( iwrk ), lwork-iwrk+1, iwork, liwork,ierr &
+                        )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-21 ) then
+                  ! not enough complex workspace
+                 info = -21
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_wggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_wggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_wlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_wlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           40 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_wggesx
+
+     !> ZGGEV: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_wggev( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkmin, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(qp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=qp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, 0 ) )
+              if( ilvl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'E' )*stdlib_qlamch( 'B' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_wggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_wgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_wgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_whgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           ! (real workspace: need 2*n)
+           ! (complex workspace: need 2*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_wtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_wggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_wggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wggev
+
+     !> ZGGEV3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_wggev3( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkopt
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(qp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=qp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -15
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_wgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1,  n+int( work( 1 ),KIND=ilp) )
+              call stdlib_wunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_wungqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              if( ilv ) then
+                 call stdlib_wgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_wlaqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              else
+                 call stdlib_wgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_wlaqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'E' )*stdlib_qlamch( 'B' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_wggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_wgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk, ierr )
+           else
+              call stdlib_wgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_wlaqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_wtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_wggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_wggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wggev3
+
+     !> ZGGEVX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B) the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> Optionally, it also computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_wggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alpha, beta, vl, &
+     ldvl, vr, ldvr, ilo, ihi,lscale, rscale, abnrm, bbnrm, rconde, rcondv,work, lwork, rwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(qp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: lscale(*), rconde(*), rcondv(*), rscale(*), rwork(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, wantsb, wantse, wantsn, &
+                     wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk
+           real(qp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(qp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=qp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( noscl .or. stdlib_lsame( balanc,'S' ) .or.stdlib_lsame( balanc, 'B' ) ) ) &
+                     then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -13
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -15
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 minwrk = 2*n
+                 if( wantse ) then
+                    minwrk = 4*n
+                 else if( wantsv .or. wantsb ) then
+                    minwrk = 2*n*( n + 1)
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -25
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_wlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_wlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (real workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_wggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,rwork, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_wlange( '1', n, n, a, lda, rwork( 1 ) )
+           if( ilascl ) then
+              rwork( 1 ) = abnrm
+              call stdlib_qlascl( 'G', 0, 0, anrmto, anrm, 1, 1, rwork( 1 ), 1,ierr )
+              abnrm = rwork( 1 )
+           end if
+           bbnrm = stdlib_wlange( '1', n, n, b, ldb, rwork( 1 ) )
+           if( ilbscl ) then
+              rwork( 1 ) = bbnrm
+              call stdlib_qlascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, rwork( 1 ), 1,ierr )
+              bbnrm = rwork( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_wgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_wunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_wlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_wlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_wungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_wlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_wgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_wgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_whgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 90
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! stdlib_wtgevc: (complex workspace: need 2*n )
+                   ! (real workspace:    need 2*n )
+           ! stdlib_wtgsna: (complex workspace: need 2*n*n if sense='v' or 'b')
+                   ! (integer workspace: need n+2 )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_wtgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work( iwrk ), rwork,ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 90
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_wtgevc) and estimate condition
+                 ! numbers (stdlib_wtgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to
+                 ! re-calculate eigenvectors and estimate the condition numbers
+                 ! one at a time.
+                 do i = 1, n
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    bwork( i ) = .true.
+                    iwrk = n + 1
+                    iwrk1 = iwrk + n
+                    if( wantse .or. wantsb ) then
+                       call stdlib_wtgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, 1, m,work( iwrk1 ), rwork, ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 90
+                       end if
+                    end if
+                    call stdlib_wtgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), 1, m, work( iwrk1 ),lwork-iwrk1+1, iwork, &
+                              ierr )
+                 end do
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_wggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_50: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vl( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_50
+                 temp = one / temp
+                 do jr = 1, n
+                    vl( jr, jc ) = vl( jr, jc )*temp
+                 end do
+              end do loop_50
+           end if
+           if( ilvr ) then
+              call stdlib_wggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_80: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vr( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_80
+                 temp = one / temp
+                 do jr = 1, n
+                    vr( jr, jc ) = vr( jr, jc )*temp
+                 end do
+              end do loop_80
+           end if
+           ! undo scaling if necessary
+           90 continue
+           if( ilascl )call stdlib_wlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_wlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_wggevx
+
+     !> ZGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_wggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           complex(qp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = czero
+              end do
+              do i = 1, p
+                 y(i) = czero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**h*a = ( r11 ) m,    q**h*b*z**h = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_wggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = real( work( m+np+1 ),KIND=qp)
+           ! update left-hand-side vector d = q**h*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, m, a, lda, work,d, max( 1, n )&
+                     , work( m+np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_wtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_wcopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = czero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_wgemv( 'NO TRANSPOSE', m, n-m, -cone, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1,&
+                      cone, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_wtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_wcopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**h *y
+           call stdlib_wunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), &
+                     ldb, work( m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_wggglm
+
+     !> ZGGHD3: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then ZGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of CGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_wgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(qp) :: c
+           complex(qp) :: c1, c2, ctemp, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_wlaset( 'ALL', n, n, czero, cone, q, ldq )
+           if( initz )call stdlib_wlaset( 'ALL', n, n, czero, cone, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_wlaset( 'LOWER', n-1, n-1, czero, czero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = cone
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small unitary factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_wlaset( 'ALL', nblst, nblst, czero, cone, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_wlaset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                              
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_wlartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = cmplx( c,KIND=qp)
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       ctemp = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = ctemp*temp - s*work( jj )
+                          work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = ctemp*temp - s*work( jj )
+                             work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = ctemp*temp - conjg( s )*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + ctemp*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_wlartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = czero
+                          call stdlib_wrot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = cmplx( c,KIND=qp)
+                          b( jj+1, j ) = -conjg( s )
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       ctemp = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + conjg( s2 )*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + conjg( s1 )*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = ctemp*temp1 + conjg( s )*temp
+                          a( k, j+i ) = -s*temp1 + ctemp*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          c = real( a( j+1+i, j ),KIND=qp)
+                          call stdlib_wrot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, c,-&
+                                    conjg( b( j+1+i, j ) ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated unitary
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_wgemv( 'CONJUGATE', nblst, len, cone, work,nblst, a( jrow, j+1 &
+                                 ), 1, czero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_wtrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_wgemv( 'CONJUGATE', len, nblst-len, cone,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, cone,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated unitary
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_wtrmv( 'UPPER', 'CONJUGATE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_wtrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_wgemv( 'CONJUGATE', nnb, len, cone,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,cone, work( pw ), 1 )
+                          call stdlib_wgemv( 'CONJUGATE', len, nnb, cone,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, cone,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated unitary matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', nblst,cola, nblst, cone, work, &
+                           nblst,a( j, jcol+nnb ), lda, czero, work( pw ),nblst )
+                 call stdlib_wlacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_wunm22( 'LEFT', 'CONJUGATE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, cone, &
+                       work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, czero, work( pw ),2*nnb )
+                                 
+                       call stdlib_wlacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated unitary matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, q( &
+                              topq, j ), ldq,work, nblst, czero, work( pw ), nh )
+                    call stdlib_wlacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_wunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, q( topq, j ), ldq,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_wlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small unitary factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_wlaset( 'ALL', nblst, nblst, czero, cone, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_wlaset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          ctemp = a( i, j )
+                          a( i, j ) = czero
+                          s = b( i, j )
+                          b( i, j ) = czero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = ctemp*temp -conjg( s )*work( jj )
+                             work( jj ) = s*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             ctemp = a( i, j )
+                             a( i, j ) = czero
+                             s = b( i, j )
+                             b( i, j ) = czero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = ctemp*temp -conjg( s )*work( jj )
+                                work( jj ) = s*temp + ctemp*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_wlaset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_wlaset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated unitary matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, a( &
+                              1, j ), lda,work, nblst, czero, work( pw ), top )
+                    call stdlib_wlacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_wunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, a( 1, j ), lda,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_wlacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, b( &
+                              1, j ), ldb,work, nblst, czero, work( pw ), top )
+                    call stdlib_wlacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_wunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, b( 1, j ), ldb,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_wlacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated unitary matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, z( &
+                              topq, j ), ldz,work, nblst, czero, work( pw ), nh )
+                    call stdlib_wlacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_wunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, z( topq, j ), ldz,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_wlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_wgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wgghd3
+
+     !> ZGGHRD: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then ZGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_wgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(qp) :: c
+           complex(qp) :: ctemp, s
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_wlaset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_wlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = czero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 ctemp = a( jrow-1, jcol )
+                 call stdlib_wlartg( ctemp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = czero
+                 call stdlib_wrot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_wrot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_wrot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c,conjg( s ) )
+                           
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 ctemp = b( jrow, jrow )
+                 call stdlib_wlartg( ctemp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = czero
+                 call stdlib_wrot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_wrot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_wrot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_wgghrd
+
+     !> ZGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_wgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           complex(qp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**h = (  0  t12 ) p   z**h*a*q**h = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_wggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = real( work( p+mn+1 ),KIND=qp)
+           ! update c = z**h *c = ( c1 ) n-p
+                             ! ( c2 ) m+p-n
+           call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, 1, mn, a, lda,work( p+1 ), c, &
+                     max( 1, m ), work( p+mn+1 ),lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_wtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+              call stdlib_wcopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_wgemv( 'NO TRANSPOSE', n-p, p, -cone, a( 1, n-p+1 ), lda,d, 1, cone, c, &
+                        1 )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_wtrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_wcopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_wgemv( 'NO TRANSPOSE', nr, n-m, -cone, a( n-p+1, m+1 ),lda, d(&
+                         nr+1 ), 1, cone, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_wtrmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_waxpy( nr, -cone, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**h*x
+           call stdlib_wunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, p, b, ldb,work( 1 ), x, n, &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_wgglse
+
+     !> ZGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix,
+     !> and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**H * (inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+     !> conjugate transpose of matrix Z.
+
+     pure subroutine stdlib_wggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_wgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=qp)
+           ! update b := q**h*b.
+           call stdlib_wunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, p, min( n, m ), a,lda, taua, b, &
+                     ldb, work, lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_wgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_wggqrf
+
+     !> ZGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**H
+     !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+     !> conjugate transpose of the matrix Z.
+
+     pure subroutine stdlib_wggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_wgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=qp)
+           ! update b := b*q**h
+           call stdlib_wunmrq( 'RIGHT', 'CONJUGATE TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 &
+                     ), 1 ), lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_wgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_wggrqf
+
+     !> ZGSVJ0: is called from ZGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as ZGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_wgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(qp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(qp), intent(out) :: work(lwork)
+           real(qp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: aapq, ompq
+           real(qp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,conjg,real,min,sign,sqrt
+           ! from lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! .. row-cyclic jacobi svd algorithm with column pivoting ..
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           swband = 0
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_wgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_wgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_wswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_wswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = d(p)
+                          d(p) = d(q)
+                          d(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute sncrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_wdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_qznrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_qznrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_qznrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_qznrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_wlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_wcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp, one, m,1, work, lda,&
+                                                ierr )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_waxpy( m, -aapq, work, 1,a( 1, q ), 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qznrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_wlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_wlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_wcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_wlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_wcopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_wlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_wlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_wlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_wlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=qp) )*tol ) .and. ( real( n,&
+                        KIND=qp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_wswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_wswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_wgsvj0
+
+     !> ZGSVJ1: is called from ZGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as ZGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> ZGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_wgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(qp), intent(out) :: work(lwork)
+           real(qp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: aapq, ompq
+           real(qp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real,min,sign,sqrt
+           ! From Lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           ! large = big / sqrt( real( m*n,KIND=qp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_wgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_wgejsv.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                  ! do 2010 jbc = ibr + 1, nbl
+                 loop_2010: do jbc = 1, nblc
+                    jgl = ( jbc-1 )*kbl + n1 + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_wdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_wcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_wlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_wdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_wrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_wrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_wcopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_wlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_wcopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_wlascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_wlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_waxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_wlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_qznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_wlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_qznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_wlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_qznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_wlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=qp) )*tol ) .and. ( real( n,&
+                        KIND=qp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_iqamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_wswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_wswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_wgsvj1
+
+     !> ZGTCON: estimates the reciprocal of the condition number of a complex
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> ZGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==cmplx( zero,KIND=qp) )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_wgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**h)*inv(u**h).
+                 call stdlib_wgttrs( 'CONJUGATE TRANSPOSE', n, 1, dl, d, du, du2,ipiv, work, n, &
+                           info )
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_wgtcon
+
+     !> ZGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_wgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_wlagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work, n )
+                        
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              du( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( dl( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( du( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( dl( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              dl( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( du( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( dl( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( du( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wgttrs( trans, n, 1, dlf, df, duf, du2, ipiv, work, n,info )
+                 call stdlib_waxpy( n, cmplx( one,KIND=qp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wgttrs( transt, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wgttrs( transn, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_wgtrfs
+
+     !> ZGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an N-by-N tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T *X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_wgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k
+           complex(qp) :: mult, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           loop_30: do k = 1, n - 1
+              if( dl( k )==czero ) then
+                 ! subdiagonal is czero, no elimination is required.
+                 if( d( k )==czero ) then
+                    ! diagonal is czero: set info = k and return; a unique
+                    ! solution can not be found.
+                    info = k
+                    return
+                 end if
+              else if( cabs1( d( k ) )>=cabs1( dl( k ) ) ) then
+                 ! no row interchange required
+                 mult = dl( k ) / d( k )
+                 d( k+1 ) = d( k+1 ) - mult*du( k )
+                 do j = 1, nrhs
+                    b( k+1, j ) = b( k+1, j ) - mult*b( k, j )
+                 end do
+                 if( k<( n-1 ) )dl( k ) = czero
+              else
+                 ! interchange rows k and k+1
+                 mult = d( k ) / dl( k )
+                 d( k ) = dl( k )
+                 temp = d( k+1 )
+                 d( k+1 ) = du( k ) - mult*temp
+                 if( k<( n-1 ) ) then
+                    dl( k ) = du( k+1 )
+                    du( k+1 ) = -mult*dl( k )
+                 end if
+                 du( k ) = temp
+                 do j = 1, nrhs
+                    temp = b( k, j )
+                    b( k, j ) = b( k+1, j )
+                    b( k+1, j ) = temp - mult*b( k+1, j )
+                 end do
+              end if
+           end do loop_30
+           if( d( n )==czero ) then
+              info = n
+              return
+           end if
+           ! back solve with the matrix u from the factorization.
+           do j = 1, nrhs
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do k = n - 2, 1, -1
+                 b( k, j ) = ( b( k, j )-du( k )*b( k+1, j )-dl( k )*b( k+2, j ) ) / d( k )
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_wgtsv
+
+     !> ZGTSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_wgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           complex(qp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_wcopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_wcopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_wcopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_wgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_wlangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_wgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wgtsvx
+
+     !> ZGTTRF: computes an LU factorization of a complex tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_wgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: d(*), dl(*), du(*)
+           complex(qp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: fact, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'ZGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( cabs1( d( i ) )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_wgttrf
+
+     !> ZGTTRS: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by ZGTTRF.
+
+     pure subroutine stdlib_wgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else if( trans=='T' .or. trans=='T' ) then
+              itrans = 1
+           else
+              itrans = 2
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'ZGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_wgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_wgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_wgttrs
+
+     !> ZGTTS2: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by ZGTTRF.
+
+     pure subroutine stdlib_wgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           complex(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    if( ipiv( i )==i ) then
+                       b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                    else
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - dl( i )*b( i, j )
+                    end if
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                 ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else if( itrans==1 ) then
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 70 continue
+                 ! solve u**t * x = b.
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - dl( i )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**t * x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                 ! solve l**t * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 130 continue
+                 ! solve u**h * x = b.
+                 b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) ) /conjg( d( 2 ) )
+                           
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )*b( &
+                              i-2, j ) ) /conjg( d( i ) )
+                 end do
+                 ! solve l**h * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 130
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**h * x = b.
+                    b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) )/ conjg( d( 2 ) )
+                              
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )&
+                                 *b( i-2, j ) ) / conjg( d( i ) )
+                    end do
+                 ! solve l**h * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_wgtts2
+
+     !> ZHB2ST_KERNELS: is an internal routine used by the ZHETRD_HB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_whb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           complex(qp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = conjg( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = czero
+                   end do
+                   ctmp = conjg( a( ofdpos, st ) )
+                   call stdlib_wlarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_wlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_wlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_wlarfx( 'LEFT', ln, lm, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos-nb, j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )          =conjg( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = czero
+                       end do
+                       ctmp = conjg( a( dpos-nb, j1 ) )
+                       call stdlib_wlarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_wlarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = czero
+                   end do
+                   call stdlib_wlarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_wlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_wlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_wlarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = czero
+                       end do
+                       call stdlib_wlarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_wlarfx( 'LEFT', lm, ln-1, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos+nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_whb2st_kernels
+
+     !> ZHBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.
+
+     subroutine stdlib_whbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, iscale
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = real( ab( 1, 1 ),KIND=qp)
+              else
+                 w( 1 ) = real( ab( kd+1, 1 ),KIND=qp)
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_wlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_wlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_whbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           call stdlib_whbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_wsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              indrwk = inde + n
+              call stdlib_wsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_whbev
+
+     !> ZHBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.  If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_whbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, rwork, lrwork, &
+               iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indwk2, indwrk, iscale, liwmin, llrwk, llwk2, &
+                     lrwmin, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 .or. lrwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else
+              if( wantz ) then
+                 lwmin = 2*n**2
+                 lrwmin = 1 + 5*n + 2*n**2
+                 liwmin = 3 + 5*n
+              else
+                 lwmin = n
+                 lrwmin = n
+                 liwmin = 1
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ab( 1, 1 ),KIND=qp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_wlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_wlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_whbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indwrk + 1
+           call stdlib_whbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_wstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_wgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_wlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_whbevd
+
+     !> ZHBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian band matrix A.  Eigenvalues and eigenvectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_whbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+           complex(qp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           complex(qp) :: ctmp1
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 ctmp1 = ab( 1, 1 )
+              else
+                 ctmp1 = ab( kd+1, 1 )
+              end if
+              tmp1 = real( ctmp1,KIND=qp)
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = real( ctmp1,KIND=qp)
+                 if( wantz )z( 1, 1 ) = cone
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_wlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_wlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_wlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_whbtrd to reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           call stdlib_whbtrd( jobz, uplo, n, kd, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_wsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_qcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_qsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_wlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_wsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_wstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_wstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_wstein.
+              do j = 1, m
+                 call stdlib_wcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_wgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_whbevx
+
+     !> ZHBGST: reduces a complex Hermitian-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**H*S by ZPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**H*A*X, where
+     !> X = S**(-1)*Q and Q is a unitary matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_whbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+           complex(qp), intent(in) :: bb(ldbb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(qp) :: bii
+           complex(qp) :: ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_wlaset( 'FULL', n, n, czero, cone, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_wpbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**h*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**h*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the real cosines and complex sines of the rotations are stored in
+           ! the arrays rwork and work, those of the 1st set in elements
+           ! 2:m-kb-1, and those of the 2nd set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=qp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=qp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*conjg( ab( k-i+ka1, &
+                       i ) ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1, i ) +real( ab( ka1, i ),&
+                                 KIND=qp)*bb( j-i+kb1, i )*conjg( bb( k-i+kb1, i ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1,&
+                                  i )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_wdscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_wgerc( n-m, kbt, -cone, x( m+1, i ), 1,bb( kb1-kbt, i )&
+                              , 1, x( m+1, i-kbt ),ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_wlartg( ab( k+1, i-k+ka ), ra1,rwork( i-k+ka-m ), work( i-k+ka-&
+                                 m ), ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( 1, i-k+ka &
+                                 )
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_wlargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,rwork(&
+                            j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2-m ),work( j2-m ), ka1 )
+                    call stdlib_wlacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_wrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), &
+                                 conjg( work( j-m ) ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, rwork( j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_wlargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,rwork( j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2 ),work( j2 ), ka1 )
+                    call stdlib_wlacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_wrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), conjg( &
+                                 work( j ) ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=qp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=qp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*conjg( ab( i-k+1,k ) ) - &
+                       conjg( bb( i-k+1, k ) )*ab( i-j+1, j ) + real( ab( 1, i ),KIND=qp)*bb( i-j+&
+                                 1, j )*conjg( bb( i-k+1,k ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -conjg( bb( i-k+1, k ) )*ab( i-j+1, j )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_wdscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_wgeru( n-m, kbt, -cone, x( m+1, i ), 1,bb( kbt+1, i-&
+                              kbt ), ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_wlartg( ab( ka1-k, i ), ra1, rwork( i-k+ka-m ),work( i-k+ka-m )&
+                                 , ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( ka1, i-k )
+                                 
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_wlargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           rwork( j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2-m ), work( j2-m ), ka1 )
+                    call stdlib_wlacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_wrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), work(&
+                                  j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,rwork( j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_wlargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,rwork( j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2 ), work( j2 ), ka1 )
+                    call stdlib_wlacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_wrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=qp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=qp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*conjg( ab( i-k+ka1, &
+                       k ) ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1, j ) +real( ab( ka1, i ),&
+                                 KIND=qp)*bb( i-j+kb1, j )*conjg( bb( i-k+kb1, k ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1,&
+                                  j )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_wdscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_wgeru( nx, kbt, -cone, x( 1, i ), 1,bb( kb, i+1 ), &
+                              ldbb-1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_wlartg( ab( k+1, i ), ra1, rwork( i+k-ka ),work( i+k-ka ), ra )
+                                 
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( 1, i+k )
+                                 
+                       ab( 1, i+k ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_wlargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( j1 ), work( j1 ),ka1 )
+                    call stdlib_wlacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_wrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_wlargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, rwork( m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_wlacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_wrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=qp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=qp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*conjg( ab( k-i+1,i ) ) - &
+                       conjg( bb( k-i+1, i ) )*ab( j-i+1, i ) + real( ab( 1, i ),KIND=qp)*bb( j-i+&
+                                 1, i )*conjg( bb( k-i+1,i ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -conjg( bb( k-i+1, i ) )*ab( j-i+1, i )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_wdscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_wgerc( nx, kbt, -cone, x( 1, i ), 1, bb( 2, i ),1, x( &
+                              1, i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_wlartg( ab( ka1-k, i+k-ka ), ra1,rwork( i+k-ka ), work( i+k-ka &
+                                 ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( ka1, i+k-&
+                                 ka )
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_wlargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               j1 ),work( j1 ), ka1 )
+                    call stdlib_wlacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_wrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), conjg( work(&
+                                  j ) ) )
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_wlargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, rwork( m-kb+&
+                              j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_wlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_wlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_wlacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_wrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), conjg( &
+                                 work( m-kb+j ) ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_wlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_whbgst
+
+     !> ZHBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_whbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_wpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_whbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork( &
+                     indwrk ), iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_whbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_wsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indwrk ), info )
+                        
+           end if
+           return
+     end subroutine stdlib_whbgv
+
+     !> ZHBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_whbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, rwork, lrwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llrwk, llwk2, lrwmin, &
+                     lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1+n
+              lrwmin = 1+n
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n**2
+              lrwmin = 1 + 5*n + 2*n**2
+              liwmin = 3 + 5*n
+           else
+              lwmin = n
+              lrwmin = n
+              liwmin = 1
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_wpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 2
+           llrwk = lrwork - indwrk + 2
+           call stdlib_whbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork, &
+                     iinfo )
+           ! reduce hermitian band matrix to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_whbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, call stdlib_wstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_wgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_wlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_whbgvd
+
+     !> ZHBGVX: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_whbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(qp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwk, indrwk, indwrk, &
+                     itmp1, j, jj, nsplit
+           real(qp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_wpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_whbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, rwork, &
+                     iinfo )
+           ! solve the standard eigenvalue problem.
+           ! reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_whbtrd( vect, uplo, n, ka, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_wsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_qcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_qsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_wlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_wsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired,
+           ! call stdlib_wstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_qstebz( range, order, n, vl, vu, il, iu, abstol,rwork( indd ), rwork( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ), &
+                     info )
+           if( wantz ) then
+              call stdlib_wstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_wstein.
+              do j = 1, m
+                 call stdlib_wcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_wgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_whbgvx
+
+     !> ZHBTRD: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_whbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(qp) :: abst
+           complex(qp) :: t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_wlaset( 'FULL', n, n, czero, cone, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the real cosines and complex sines of the plane rotations are
+           ! stored in the arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( kd1, 1 ) = real( ab( kd1, 1 ),KIND=qp)
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_wlargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_wlartv or stdlib_wrot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_wlartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_wrot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_wlartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_wrot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_wlar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          call stdlib_wlacgv( nr, work( j1 ), kd1 )
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_wlartv or stdlib_wrot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_wlartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_wrot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_wrot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_wrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          conjg( work( j ) ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_wrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), conjg( &
+                                          work( j ) ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( kd, i+1 )
+                    abst = abs( t )
+                    ab( kd, i+1 ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( kd, i+2 ) = ab( kd, i+2 )*t
+                    if( wantq ) then
+                       call stdlib_wscal( n, conjg( t ), q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( kd1, i ),KIND=qp)
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( 1, 1 ) = real( ab( 1, 1 ),KIND=qp)
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_wlargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_wlartv or stdlib_wrot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_wlartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_wrot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_wlartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_wrot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_wlar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_wlartv or stdlib_wrot is used
+                       if( nr>0 ) then
+                          call stdlib_wlacgv( nr, work( j1 ), kd1 )
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_wlartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_wrot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_wrot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_wrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_wrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( 2, i )
+                    abst = abs( t )
+                    ab( 2, i ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( 2, i+1 ) = ab( 2, i+1 )*t
+                    if( wantq ) then
+                       call stdlib_wscal( n, t, q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( 1, i ),KIND=qp)
+              end do
+           end if
+           return
+     end subroutine stdlib_whbtrd
+
+     !> ZHECON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_whecon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_whetrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_whecon
+
+     !> ZHECON_ROOK: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_whecon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHECON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_whetrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_whecon_rook
+
+     !> ZHEEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_wheequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(qp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'ZHEEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_qp / s( j )
+           end do
+           tol = one / sqrt( 2.0_qp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + real( s( i )*work( i ),KIND=qp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_wlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( real( work( i ),KIND=qp) - t*si )
+                 c0 = -(t*si)*si + 2 * real( work( i ),KIND=qp) * si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + real( work( i ),KIND=qp) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_qlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_qlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_wheequb
+
+     !> ZHEEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.
+
+     subroutine stdlib_wheev( jobz, uplo, n, a, lda, w, work, lwork, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=qp)
+              work( 1 ) = 1
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_wlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_whetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_whetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_wungtr to generate the unitary matrix, then call stdlib_wsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wungtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              indwrk = inde + n
+              call stdlib_wsteqr( jobz, n, w, rwork( inde ), a, lda,rwork( indwrk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wheev
+
+     !> ZHEEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_wheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork,lrwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwk2, indwrk, iscale, liopt, &
+                     liwmin, llrwk, llwork, llwrk2, lopt, lropt, lrwmin, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 lrwmin = 1
+                 liwmin = 1
+                 lopt = lwmin
+                 lropt = lrwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    lwmin = 2*n + n*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n + 1
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+                 lopt = max( lwmin, n +stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 ) )
+                 lropt = lrwmin
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=qp)
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_wlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_whetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           indrwk = inde + n
+           indwk2 = indwrk + n*n
+           llwork = lwork - indwrk + 1
+           llwrk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_whetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_wstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_wunmtr to multiply it to the
+           ! householder transformations represented as householder vectors in
+           ! a.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wstedc( 'I', n, w, rwork( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, rwork( indrwk ), llrwk,iwork, liwork, info )
+              call stdlib_wunmtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_wlacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_wheevd
+
+     !> ZHEEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> ZHEEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to ZHETRD.  Then, whenever possible, ZHEEVR calls ZSTEMR to compute
+     !> eigenspectrum using Relatively Robust Representations.  ZSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see ZSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : ZHEEVR calls ZSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> ZHEEVR calls DSTEBZ and ZSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of ZSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_wheevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,rwork, lrwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lrwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indibl, indifl, indisp, indiwo, indrd, indrdd, &
+           indre, indree, indrwk, indtau, indwk, indwkn, iscale, itmp1, j, jj, liwmin, llwork, &
+                     llrwork, llwrkn, lrwmin, lwkopt, lwmin, nb, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'ZHEEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( lrwork==-1 ) .or.( liwork==-1 ) )
+           lrwmin = max( 1, 24*n )
+           liwmin = max( 1, 10*n )
+           lwmin = max( 1, 2*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'ZUNMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -20
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -22
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 2
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=qp)
+              else
+                 if( vl<real( a( 1, 1 ),KIND=qp) .and. vu>=real( a( 1, 1 ),KIND=qp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=qp)
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_wlansy( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_wdscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_wdscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_qsterf or stdlib_wstemr fail.
+           ! work(indtau:indtau+n-1) stores the complex scalar factors of the
+           ! elementary reflectors used in stdlib_whetrd.
+           indtau = 1
+           ! indwk is the starting offset of the remaining complex workspace,
+           ! and llwork is the remaining complex workspace size.
+           indwk = indtau + n
+           llwork = lwork - indwk + 1
+           ! rwork(indrd:indrd+n-1) stores the real tridiagonal's diagonal
+           ! entries.
+           indrd = 1
+           ! rwork(indre:indre+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_whetrd.
+           indre = indrd + n
+           ! rwork(indrdd:indrdd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_wstemr (the stdlib_qsterf path copies the diagonal to w).
+           indrdd = indre + n
+           ! rwork(indree:indree+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_qsterf and stdlib_wstemr.
+           indree = indrdd + n
+           ! indrwk is the starting offset of the left-over real workspace, and
+           ! llrwork is the remaining workspace size.
+           indrwk = indree + n
+           llrwork = lrwork - indrwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_qstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_qstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_qstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_whetrd to reduce hermitian matrix to tridiagonal form.
+           call stdlib_whetrd( uplo, n, a, lda, rwork( indrd ), rwork( indre ),work( indtau ), &
+                     work( indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_qsterf or stdlib_wstemr and stdlib_wunmtr.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig.or.test ) .and. ( ieeeok==1 ) ) then
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n, rwork( indrd ), 1, w, 1 )
+                 call stdlib_qcopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_qsterf( n, w, rwork( indree ), info )
+              else
+                 call stdlib_qcopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_qcopy( n, rwork( indrd ), 1, rwork( indrdd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_wstemr( jobz, 'A', n, rwork( indrdd ),rwork( indree ), vl, vu, il, &
+                 iu, m, w,z, ldz, n, isuppz, tryrac,rwork( indrwk ), llrwork,iwork, liwork, info )
+                           
+                 ! apply unitary matrix used in reduction to tridiagonal
+                 ! form to eigenvectors returned by stdlib_wstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = indwk
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_wunmtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_wstein.
+           ! also call stdlib_qstebz and stdlib_wstein if stdlib_wstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indrd ), rwork( &
+           indre ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwo )&
+                     , info )
+           if( wantz ) then
+              call stdlib_wstein( n, rwork( indrd ), rwork( indre ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_wstein.
+              indwkn = indwk
+              llwrkn = lwork - indwkn + 1
+              call stdlib_wunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_wheevr
+
+     !> ZHEEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_wheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, llwork, lwkmin, lwkopt, nb, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 2*n
+                 nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'ZUNMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( 1, ( nb + 1 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=qp)
+              else if( valeig ) then
+                 if( vl<real( a( 1, 1 ),KIND=qp) .and. vu>=real( a( 1, 1 ),KIND=qp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=qp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_wlanhe( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_wdscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_wdscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_whetrd to reduce hermitian matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_whetrd( uplo, n, a, lda, rwork( indd ), rwork( inde ),work( indtau ), work(&
+                      indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_qsterf or stdlib_wungtr and stdlib_wsteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_qcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_qsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_wlacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_wungtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_wsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_wstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_wstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_wstein.
+              call stdlib_wunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwrk ), llwork, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wheevx
+
+     !> ZHEGS2: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H *A*L.
+     !> B must have been previously factorized as U**H *U or L*L**H by ZPOTRF.
+
+     pure subroutine stdlib_whegs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(qp) :: akk, bkk
+           complex(qp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=qp)
+                    bkk = real( b( k, k ),KIND=qp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_wdscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_wlacgv( n-k, a( k, k+1 ), lda )
+                       call stdlib_wlacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_waxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_wher2( uplo, n-k, -cone, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_waxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_wlacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_wtrsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-k, b( k+1, k+&
+                                 1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_wlacgv( n-k, a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=qp)
+                    bkk = real( b( k, k ),KIND=qp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_wdscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_waxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_wher2( uplo, n-k, -cone, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1,&
+                                  k+1 ), lda )
+                       call stdlib_waxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_wtrsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=qp)
+                    bkk = real( b( k, k ),KIND=qp)
+                    call stdlib_wtrmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_waxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_wher2( uplo, k-1, cone, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                              
+                    call stdlib_waxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_wdscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=qp)
+                    bkk = real( b( k, k ),KIND=qp)
+                    call stdlib_wlacgv( k-1, a( k, 1 ), lda )
+                    call stdlib_wtrmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', k-1,b, ldb, a( k, &
+                              1 ), lda )
+                    ct = half*akk
+                    call stdlib_wlacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_waxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_wher2( uplo, k-1, cone, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_waxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_wlacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_wdscal( k-1, bkk, a( k, 1 ), lda )
+                    call stdlib_wlacgv( k-1, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_whegs2
+
+     !> ZHEGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by ZPOTRF.
+
+     pure subroutine stdlib_whegst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZHEGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_whegs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**h)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_whegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_wtrsm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, &
+                                    n-k-kb+1, cone,b( k, k ), ldb, a( k, k+kb ), lda )
+                          call stdlib_whemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_wher2k( uplo, 'CONJUGATE TRANSPOSE', n-k-kb+1,kb, -cone, a( &
+                                    k, k+kb ), lda,b( k, k+kb ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_whemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_wtrsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, cone,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**h)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_whegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_wtrsm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', n-k-&
+                                    kb+1, kb, cone,b( k, k ), ldb, a( k+kb, k ), lda )
+                          call stdlib_whemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_wher2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-cone, a( k+kb, &
+                                    k ), lda,b( k+kb, k ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_whemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_wtrsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, cone,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**h
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_wtrmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, cone, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_whemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_wher2k( uplo, 'NO TRANSPOSE', k-1, kb, cone,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_whemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_wtrmm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', k-1, &
+                                 kb, cone, b( k, k ), ldb,a( 1, k ), lda )
+                       call stdlib_whegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**h*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_wtrmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, cone,&
+                                  b, ldb, a( k, 1 ), lda )
+                       call stdlib_whemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_wher2k( uplo, 'CONJUGATE TRANSPOSE', k-1, kb,cone, a( k, 1 ), &
+                                 lda, b( k, 1 ), ldb,one, a, lda )
+                       call stdlib_whemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_wtrmm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, k-1,&
+                                  cone, b( k, k ), ldb,a( k, 1 ), lda )
+                       call stdlib_whegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_whegst
+
+     !> ZHEGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian and B is also
+     !> positive definite.
+
+     subroutine stdlib_whegv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n - 1 ) .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_wpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_wheev( jobz, uplo, n, a, lda, w, work, lwork, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_wtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_wtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whegv
+
+     !> ZHEGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_whegvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lropt, lrwmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n + n*n
+              lrwmin = 1 + 5*n + 2*n*n
+              liwmin = 3 + 5*n
+           else
+              lwmin = n + 1
+              lrwmin = n
+              liwmin = 1
+           end if
+           lopt = lwmin
+           lropt = lrwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_wpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_wheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork, lrwork,iwork, liwork,&
+                      info )
+           lopt = max( real( lopt,KIND=qp), real( work( 1 ),KIND=qp) )
+           lropt = max( real( lropt,KIND=qp), real( rwork( 1 ),KIND=qp) )
+           liopt = max( real( liopt,KIND=qp), real( iwork( 1 ),KIND=qp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_wtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_wtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_whegvd
+
+     !> ZHEGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_whegvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_wpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_wheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, rwork, iwork, ifail,info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_wtrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_wtrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whegvx
+
+     !> ZHERFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_wherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_whemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=qp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=qp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_whetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_whetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_whetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wherfs
+
+     !> ZHESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_whesv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_whetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_whetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_whetrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whesv
+
+     !> ZHESV_AA: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**H * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is Hermitian and tridiagonal. The factored form
+     !> of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_whesv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_hetrf, lwkopt_hetrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_whetrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_hetrf = int( work(1),KIND=ilp)
+              call stdlib_whetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_hetrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_hetrf, lwkopt_hetrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**h*t*u or a = l*t*l**h.
+           call stdlib_whetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_whetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whesv_aa
+
+     !> ZHESV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**H)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**H)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZHETRF_RK is called to compute the factorization of a complex
+     !> Hermitian matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine ZHETRS_3.
+
+     pure subroutine stdlib_whesv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_whetrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=qp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**h)*(p**t) or
+           ! a = p*u*d*(u**h)*(p**t).
+           call stdlib_whetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_whetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whesv_rk
+
+     !> ZHESV_ROOK: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The bounded Bunch-Kaufman ("rook") diagonal pivoting method is used
+     !> to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZHETRF_ROOK is called to compute the factorization of a complex
+     !> Hermition matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling ZHETRS_ROOK (uses BLAS 2).
+
+     pure subroutine stdlib_whesv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZHETRF_ROOK', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_whetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs ( use level blas 2)
+              call stdlib_whetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whesv_rook
+
+     !> ZHESVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_whesvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: af(ldaf,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_wlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_whetrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanhe( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_whecon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_whetrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_wherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whesvx
+
+     !> ZHESWAPR: applies an elementary permutation on the rows and the columns of
+     !> a hermitian matrix.
+
+     pure subroutine stdlib_wheswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(qp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_wswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=conjg(a(i1+i,i2))
+                 a(i1+i,i2)=conjg(tmp)
+              end do
+               a(i1,i2)=conjg(a(i1,i2))
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from 1 to i1-1
+              call stdlib_wswap ( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=conjg(a(i2,i1+i))
+                  a(i2,i1+i)=conjg(tmp)
+               end do
+               a(i2,i1)=conjg(a(i2,i1))
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_wheswapr
+
+     !> ZHETD2: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_whetd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(qp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U')
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              a( n, n ) = real( a( n, n ),KIND=qp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = a( i, i+1 )
+                 call stdlib_wlarfg( i, alpha, a( 1, i+1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=qp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = cone
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_whemv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, czero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_wdotc( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_waxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_wher2( uplo, i, -cone, a( 1, i+1 ), 1, tau, 1, a,lda )
+                 else
+                    a( i, i ) = real( a( i, i ),KIND=qp)
+                 end if
+                 a( i, i+1 ) = e( i )
+                 d( i+1 ) = real( a( i+1, i+1 ),KIND=qp)
+                 tau( i ) = taui
+              end do
+              d( 1 ) = real( a( 1, 1 ),KIND=qp)
+           else
+              ! reduce the lower triangle of a
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=qp)
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = a( i+1, i )
+                 call stdlib_wlarfg( n-i, alpha, a( min( i+2, n ), i ), 1, taui )
+                 e( i ) = real( alpha,KIND=qp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = cone
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_whemv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, czero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_wdotc( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_waxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_wher2( uplo, n-i, -cone, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 )&
+                              , lda )
+                 else
+                    a( i+1, i+1 ) = real( a( i+1, i+1 ),KIND=qp)
+                 end if
+                 a( i+1, i ) = e( i )
+                 d( i ) = real( a( i, i ),KIND=qp)
+                 tau( i ) = taui
+              end do
+              d( n ) = real( a( n, n ),KIND=qp)
+           end if
+           return
+     end subroutine stdlib_whetd2
+
+     !> ZHETF2: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_whetf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(qp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(qp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=qp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=qp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( a( k, k ),KIND=qp)
+                    call stdlib_wher( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_wdscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_qlapy2( real( a( k-1, k ),KIND=qp),aimag( a( k-1, k ) ) )
+                                 
+                       d22 = real( a( k-1, k-1 ),KIND=qp) / d
+                       d11 = real( a( k, k ),KIND=qp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = a( k-1, k ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = d*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k-1 )*conjg( &
+                                       wkm1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              50 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=qp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=qp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( a( k, k ),KIND=qp)
+                       call stdlib_wher( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_wdscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_qlapy2( real( a( k+1, k ),KIND=qp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=qp) / d
+                       d22 = real( a( k, k ),KIND=qp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = a( k+1, k ) / d
+                       d = tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = d*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k+1 )*conjg( &
+                                       wkp1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 50
+           end if
+           90 continue
+           return
+     end subroutine stdlib_whetf2
+
+     !> ZHETF2_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_whetf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*)
+        ! ======================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(qp) :: absakk, alpha, colmax, d, d11, d22, r1, dtemp, rowmax, tt, sfmin
+           complex(qp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_wswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=qp)
+                    a( k, k ) = real( a( p, p ),KIND=qp)
+                    a( p, p ) = r1
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_wswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=qp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=qp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=qp)
+                          call stdlib_wher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_wdscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=qp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_qlapy2( real( a( k-1, k ),KIND=qp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=qp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=qp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_wswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=qp)
+                    a( k, k ) = real( a( p, p ),KIND=qp)
+                    a( p, p ) = r1
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_wswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=qp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=qp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=qp)
+                          call stdlib_wher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_wdscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=qp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_qlapy2( real( a( k+1, k ),KIND=qp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=qp) / d
+                       d22 = real( a( k, k ),KIND=qp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_whetf2_rk
+
+     !> ZHETF2_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_whetf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! ======================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(qp) :: absakk, alpha, colmax, d, d11, d22, r1, dtemp, rowmax, tt, sfmin
+           complex(qp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_wswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=qp)
+                    a( k, k ) = real( a( p, p ),KIND=qp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=qp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=qp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=qp)
+                          call stdlib_wher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_wdscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=qp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_qlapy2( real( a( k-1, k ),KIND=qp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=qp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=qp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_wswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=qp)
+                    a( k, k ) = real( a( p, p ),KIND=qp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=qp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=qp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=qp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=qp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=qp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=qp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=qp)
+                          call stdlib_wher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_wdscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=qp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_qlapy2( real( a( k+1, k ),KIND=qp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=qp) / d
+                       d22 = real( a( k, k ),KIND=qp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=qp), zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_whetf2_rook
+
+     !> ZHETRD: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_whetrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'ZHETRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'ZHETRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_wlatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_wher2k( uplo, 'NO TRANSPOSE', i-1, nb, -cone,a( 1, i ), lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = real( a( j, j ),KIND=qp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_whetd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_wlatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+nb:n,i+nb:n), using
+                 ! an update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_wher2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -cone,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = real( a( j, j ),KIND=qp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_whetd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whetrd
+
+     !> ZHETRD_HB2ST: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_whetrd_hb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+           complex(qp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sizev, sizetau, ldv, lhmin, lwmin
+           real(qp) :: abstmp
+           complex(qp) :: tmp
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'ZHETRD_HB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'ZHETRD_HB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'ZHETRD_HB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD_HB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sizev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! complex because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=qp)
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=qp)
+               end do
+               ! make off-diagonal elements real and copy them to e
+               if( upper ) then
+                   do i = 1, n - 1
+                       tmp = ab( abofdpos, i+1 )
+                       abstmp = abs( tmp )
+                       ab( abofdpos, i+1 ) = abstmp
+                       e( i ) = abstmp
+                       if( abstmp/=rzero ) then
+                          tmp = tmp / abstmp
+                       else
+                          tmp = cone
+                       end if
+                       if( i<n-1 )ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
+                        ! if( wantz ) then
+                           ! call stdlib_wscal( n, conjg( tmp ), q( 1, i+1 ), 1 )
+                        ! end if
+                   end do
+               else
+                   do i = 1, n - 1
+                      tmp = ab( abofdpos, i )
+                      abstmp = abs( tmp )
+                      ab( abofdpos, i ) = abstmp
+                      e( i ) = abstmp
+                      if( abstmp/=rzero ) then
+                         tmp = tmp / abstmp
+                      else
+                         tmp = cone
+                      end if
+                      if( i<n-1 )ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
+                       ! if( wantq ) then
+                          ! call stdlib_wscal( n, tmp, q( 1, i+1 ), 1 )
+                       ! end if
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the hermitian band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=qp)/real(kd,KIND=qp) )
+           stepercol = ceiling( real(shift,KIND=qp)/real(grsiz,KIND=qp) )
+           thgrnb    = ceiling( real(n-1,KIND=qp)/real(thgrsiz,KIND=qp) )
+           call stdlib_wlacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_wlaset( "A", kd,   n, czero, czero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_whb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_whb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                             indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           do i = 1, n
+               d( i ) = real( work( dpos+(i-1)*lda ),KIND=qp)
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+i*lda ),KIND=qp)
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+(i-1)*lda ),KIND=qp)
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_whetrd_hb2st
+
+     !> ZHETRD_HE2HB: reduces a complex Hermitian matrix A to complex Hermitian
+     !> band-diagonal form AB by a unitary similarity transformation:
+     !> Q**H * A * Q = AB.
+
+     pure subroutine stdlib_whetrd_he2hb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: rone = 1.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'ZHETRD_HE2HB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD_HE2HB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_wcopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_wcopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to czero once such a
+           ! way every time t is generated the upper/lower portion will be always czero
+           call stdlib_wlaset( "A", ldt, kd, czero, czero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_wgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_wcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_wlaset( 'LOWER', pk, pk, czero, cone,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_wlarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,cone,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,czero, work( s2pos ), lds2 )
+                  call stdlib_whemm( 'RIGHT', uplo, pk, pn,cone,  a( i+kd, i+kd ), lda,work( &
+                            s2pos ), lds2,czero, work( wpos ), ldw )
+                  call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,cone,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,czero, work( s1pos ), lds1 )
+                  call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-chalf, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,cone,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_wher2k( uplo, 'CONJUGATE', pn, pk,-cone, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_wcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_wgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_wcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_wlaset( 'UPPER', pk, pk, czero, cone,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_wlarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,cone, a( i+kd, i )&
+                            , lda,work( tpos ), ldt,czero, work( s2pos ), lds2 )
+                  call stdlib_whemm( 'LEFT', uplo, pn, pk,cone, a( i+kd, i+kd ), lda,work( s2pos )&
+                            , lds2,czero, work( wpos ), ldw )
+                  call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,cone, work( s2pos ), &
+                            lds2,work( wpos ), ldw,czero, work( s1pos ), lds1 )
+                  call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-chalf, a( i+kd, &
+                            i ), lda,work( s1pos ), lds1,cone, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_wher2k( uplo, 'NO TRANSPOSE', pn, pk,-cone, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_wcopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_wcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_whetrd_he2hb
+
+     !> ZHETRF: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_whetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlahef( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_whetf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlahef( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_whetf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whetrf
+
+     !> ZHETRF_AA: computes the factorization of a complex hermitian matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**H*T*U  or  A = L*T*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a hermitian tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_whetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'ZHETRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=qp)
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**h*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_wcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_wlahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_wlahef_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_wswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j, j+1 ) )
+                    a( j, j+1 ) = cone
+                    call stdlib_wcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_wscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_wgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',1, mj, jb+1,-cone,&
+                           a( j1-k2, j3 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j3, j3 ), lda )
+                                     
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_wgemm
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-&
+                       cone, a( j1-k2, j2 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j2, j3 ), lda &
+                                 )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_wcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_wcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_wlahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_wlahef_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_wswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j+1, j ) )
+                    a( j+1, j ) = cone
+                    call stdlib_wcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_wscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_wgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',mj, 1, jb+1,-&
+                          cone, work( (j3-j1+1)+k1*n ), n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), &
+                                    lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block column with stdlib_wgemm
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j3+1, nj, jb+1,-&
+                       cone, work( (j3-j1+1)+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda &
+                                 )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_wcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whetrf_aa
+
+     !> ZHETRF_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_whetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlahef_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_whetf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_wswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlahef_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_whetf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_wswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whetrf_rk
+
+     !> ZHETRF_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_whetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlahef_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_whetf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlahef_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlahef_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_whetf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_whetrf_rook
+
+     !> ZHETRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> ZHETRF.
+
+     pure subroutine stdlib_whetri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(qp) :: ak, akp1, d, t
+           complex(qp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=qp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=qp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=qp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_wdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_wcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_wdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=qp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 do j = kp + 1, k - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=qp) / t
+                 akp1 = real( a( k, k ),KIND=qp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=qp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_wdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_wcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_wdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=qp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 do j = k + 1, kp - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_whetri
+
+     !> ZHETRI_ROOK: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> ZHETRF_ROOK.
+
+     pure subroutine stdlib_whetri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(qp) :: ak, akp1, d, t
+           complex(qp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 70
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=qp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=qp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=qp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_wdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_wcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_whemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_wdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=qp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k,1:k)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1) in the leading submatrix a(k+1:n,k+1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k+1 and -ipiv(k+1)
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              70 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              80 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 120
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=qp) / t
+                 akp1 = real( a( k, k ),KIND=qp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_wdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=qp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_wdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_wcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_whemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_wdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=qp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k:n,k:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k-1 and -ipiv(k-1)
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 80
+              120 continue
+           end if
+           return
+     end subroutine stdlib_whetri_rook
+
+     !> ZHETRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF.
+
+     pure subroutine stdlib_whetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(qp) :: s
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( a( k, k ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( a( k, k ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_whetrs
+
+     !> ZHETRS2: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF and converted by ZSYCONV.
+
+     pure subroutine stdlib_whetrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(qp) :: s
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_wsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_wtrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=qp) / real( a( i, i ),KIND=qp)
+                   call stdlib_wdscal( nrhs, s, b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / conjg( akm1k )
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / conjg( akm1k )
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_wtrsm('L','U','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_wtrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=qp) / real( a( i, i ),KIND=qp)
+                   call stdlib_wdscal( nrhs, s, b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / conjg( akm1k )
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / conjg( akm1k )
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+             call stdlib_wtrsm('L','L','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_wsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_whetrs2
+
+     !> ZHETRS_3: solves a system of linear equations A * X = B with a complex
+     !> Hermitian matrix A using the factorization computed
+     !> by ZHETRF_RK or ZHETRF_BK:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_whetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), e(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(qp) :: s
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_wtrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=qp) / real( a( i, i ),KIND=qp)
+                    call stdlib_wdscal( nrhs, s, b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / conjg( akm1k )
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / conjg( akm1k )
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_wtrsm( 'L', 'U', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_wtrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=qp) / real( a( i, i ),KIND=qp)
+                    call stdlib_wdscal( nrhs, s, b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / conjg( akm1k )
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / conjg( akm1k )
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+              call stdlib_wtrsm('L', 'L', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_whetrs_3
+
+     !> ZHETRS_AA: solves a system of linear equations A*X = B with a complex
+     !> hermitian matrix A using the factorization A = U**H*T*U or
+     !> A = L*T*L**H computed by ZHETRF_AA.
+
+     pure subroutine stdlib_whetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**h*t*u.
+              ! 1) forward substitution with u**h
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**h \ b -> b    [ (u**h \p**t * b) ]
+                 call stdlib_wtrsm( 'L', 'U', 'C', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb )
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**h \p**t * b) ]
+              call stdlib_wlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1 )
+              if( n>1 ) then
+                  call stdlib_wlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1)
+                  call stdlib_wlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                  call stdlib_wlacgv( n-1, work( 1 ), 1 )
+              end if
+              call stdlib_wgtsv( n, nrhs, work(1), work(n), work(2*n), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**h \p**t * b) ) ]
+                 call stdlib_wtrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b(2, 1), &
+                           ldb)
+                 ! pivot, p * b  [ p * (u**h \ (t \ (u \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**h.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_wtrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b(2, 1), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_wlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_wlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1)
+                  call stdlib_wlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1)
+                  call stdlib_wlacgv( n-1, work( 2*n ), 1 )
+              end if
+              call stdlib_wgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with l**h
+              if( n>1 ) then
+                 ! compute l**h \ b -> b   [ l**h \ (t \ (l \p**t * b) ) ]
+                 call stdlib_wtrsm( 'L', 'L', 'C', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b  [ p * (l**h \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_whetrs_aa
+
+     !> ZHETRS_ROOK: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF_ROOK.
+
+     pure subroutine stdlib_whetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(qp) :: s
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( a( k, k ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1)
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( a( k, k ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_whetrs_rook
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> ZHFRK: performs one of the Hermitian rank--k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n Hermitian
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_whfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           complex(qp) :: calpha, cbeta
+           ! Intrinsic Functions 
+           intrinsic :: max,cmplx
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_wherk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = czero
+              end do
+              return
+           end if
+           calpha = cmplx( alpha, zero,KIND=qp)
+           cbeta = cmplx( beta, zero,KIND=qp)
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_wherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_wherk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_wgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_wherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_wherk( 'U', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_wgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_wherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_wherk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_wgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_wherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_wherk( 'U', 'C', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_wgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 'c'
+                 if( lower ) then
+                    ! n is odd, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_wherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_wherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_wgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n1+1, 1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_wherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_wherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_wgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n1+1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_wherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_wherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_wgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_wherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_wherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_wgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_wherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_wherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_wgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_wherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_wherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_wgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_wherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_wherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_wgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_wherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_wherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_wgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 'c'
+                 if( lower ) then
+                    ! n is even, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_wherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_wherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_wgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_wherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_wherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_wgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_wherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_wherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_wgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_wherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_wherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_wgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_whfrk
+
+     !> ZHGEQZ: computes the eigenvalues of a complex matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the single-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a complex matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by ZGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices and S and P are upper triangular.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from ZGGHRD that reduced
+     !> the matrix pair (A,B) to generalized Hessenberg form, then the output
+     !> matrices Q1*Q and Z1*Z are the unitary factors from the generalized
+     !> Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T)
+     !> (equivalently, of (A,B)) are computed as a pair of complex values
+     !> (alpha,beta).  If beta is nonzero, lambda = alpha / beta is an
+     !> eigenvalue of the generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> The values of alpha and beta for the i-th eigenvalue can be read
+     !> directly from the generalized Schur form:  alpha = S(i,i),
+     !> beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_whgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alpha, beta, q, ldq,&
+                z, ldz, work, lwork,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(out) :: alpha(*), beta(*), work(*)
+           complex(qp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(qp) :: absb, anorm, ascale, atol, bnorm, bscale, btol, c, safmin, temp, temp2, &
+                     tempr, ulp
+           complex(qp) :: abi22, ad11, ad12, ad21, ad22, ctemp, ctemp2, ctemp3, eshift, s, shift, &
+                     signbc, u12, x, abi12, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=qp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ilschr = .true.
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              ilq = .true.
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              ilz = .true.
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -16
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           ! work( 1 ) = cmplx( 1,KIND=qp)
+           if( n<=0 ) then
+              work( 1 ) = cmplx( 1,KIND=qp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_wlaset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_wlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_qlamch( 'S' )
+           ulp = stdlib_qlamch( 'E' )*stdlib_qlamch( 'B' )
+           anorm = stdlib_wlanhs( 'F', in, h( ilo, ilo ), ldh, rwork )
+           bnorm = stdlib_wlanhs( 'F', in, t( ilo, ilo ), ldt, rwork )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_wscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_wscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_wscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_wscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 190
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever
+              ! row operations modify columns whatever:ilastm
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = czero
+           maxit = 30*( ihi-ilo+1 )
+           loop_170: do jiter = 1, maxit
+              ! check for too many iterations.
+              if( jiter>maxit )go to 180
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              ! special case: j=ilast
+              if( ilast==ilo ) then
+                 go to 60
+              else
+                 if( abs1( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs1( h( ilast, ilast ) ) + &
+                           abs1( h( ilast-1, ilast-1 )) ) ) ) then
+                    h( ilast, ilast-1 ) = czero
+                    go to 60
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = czero
+                 go to 50
+              end if
+              ! general case: j<ilast
+              loop_40: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs1( h( j, j-1 ) )<=max( safmin, ulp*(abs1( h( j, j ) ) + abs1( h( j-1, &
+                              j-1 ) )) ) ) then
+                       h( j, j-1 ) = czero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = czero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       if( abs1( h( j, j-1 ) )*( ascale*abs1( h( j+1,j ) ) )<=abs1( h( j, j ) )*( &
+                                 ascale*atol ) )ilazr2 = .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          ctemp = h( jch, jch )
+                          call stdlib_wlartg( ctemp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = czero
+                          call stdlib_wrot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_wrot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_wrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs1( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 60
+                             else
+                                ifirst = jch + 1
+                                go to 70
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = czero
+                       end do
+                       go to 50
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          ctemp = t( jch, jch+1 )
+                          call stdlib_wlartg( ctemp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = czero
+                          if( jch<ilastm-1 )call stdlib_wrot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_wrot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_wrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          ctemp = h( jch+1, jch )
+                          call stdlib_wlartg( ctemp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = czero
+                          call stdlib_wrot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_wrot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_wrot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 50
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 70
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_40
+              ! (drop-through is "impossible")
+              info = 2*n + 1
+              go to 210
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              50 continue
+              ctemp = h( ilast, ilast )
+              call stdlib_wlartg( ctemp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = czero
+              call stdlib_wrot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_wrot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_wrot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alpha and beta
+              60 continue
+              absb = abs( t( ilast, ilast ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( ilast, ilast ) / absb )
+                 t( ilast, ilast ) = absb
+                 if( ilschr ) then
+                    call stdlib_wscal( ilast-ifrstm, signbc, t( ifrstm, ilast ), 1 )
+                    call stdlib_wscal( ilast+1-ifrstm, signbc, h( ifrstm, ilast ),1 )
+                 else
+                    call stdlib_wscal( 1, signbc, h( ilast, ilast ), 1 )
+                 end if
+                 if( ilz )call stdlib_wscal( n, signbc, z( 1, ilast ), 1 )
+              else
+                 t( ilast, ilast ) = czero
+              end if
+              alpha( ilast ) = h( ilast, ilast )
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 190
+              ! reset counters
+              iiter = 0
+              eshift = czero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 160
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast.  we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              70 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute the shift.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10/=iiter ) then
+                 ! the wilkinson shift (aep p.512_qp), i.e., the eigenvalue of
+                 ! the bottom-right 2x2 block of a inv(b) which is nearest to
+                 ! the bottom-right element.
+                 ! we factor b as u*d, where u has unit diagonals, and
+                 ! compute (a*inv(d))*inv(u).
+                 u12 = ( bscale*t( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 abi22 = ad22 - u12*ad21
+                 abi12 = ad12 - u12*ad11
+                 shift = abi22
+                 ctemp = sqrt( abi12 )*sqrt( ad21 )
+                 temp = abs1( ctemp )
+                 if( ctemp/=zero ) then
+                    x = half*( ad11-shift )
+                    temp2 = abs1( x )
+                    temp = max( temp, abs1( x ) )
+                    y = temp*sqrt( ( x / temp )**2+( ctemp / temp )**2 )
+                    if( temp2>zero ) then
+                       if( real( x / temp2,KIND=qp)*real( y,KIND=qp)+aimag( x / temp2 )*aimag( y )&
+                                 <zero )y = -y
+                    end if
+                    shift = shift - ctemp*stdlib_wladiv( ctemp, ( x+y ) )
+                 end if
+              else
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( iiter / 20 )*20==iiter .and.bscale*abs1(t( ilast, ilast ))>safmin ) &
+                           then
+                    eshift = eshift + ( ascale*h( ilast,ilast ) )/( bscale*t( ilast, ilast ) )
+                              
+                 else
+                    eshift = eshift + ( ascale*h( ilast,ilast-1 ) )/( bscale*t( ilast-1, ilast-1 )&
+                               )
+                 end if
+                 shift = eshift
+              end if
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 ctemp = ascale*h( j, j ) - shift*( bscale*t( j, j ) )
+                 temp = abs1( ctemp )
+                 temp2 = ascale*abs1( h( j+1, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs1( h( j, j-1 ) )*temp2<=temp*atol )go to 90
+              end do
+              istart = ifirst
+              ctemp = ascale*h( ifirst, ifirst ) -shift*( bscale*t( ifirst, ifirst ) )
+              90 continue
+              ! do an implicit-shift qz sweep.
+              ! initial q
+              ctemp2 = ascale*h( istart+1, istart )
+              call stdlib_wlartg( ctemp, ctemp2, c, s, ctemp3 )
+              ! sweep
+              loop_150: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    ctemp = h( j, j-1 )
+                    call stdlib_wlartg( ctemp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = czero
+                 end if
+                 do jc = j, ilastm
+                    ctemp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -conjg( s )*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = ctemp
+                    ctemp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -conjg( s )*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = ctemp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       ctemp = c*q( jr, j ) + conjg( s )*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = ctemp
+                    end do
+                 end if
+                 ctemp = t( j+1, j+1 )
+                 call stdlib_wlartg( ctemp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = czero
+                 do jr = ifrstm, min( j+2, ilast )
+                    ctemp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -conjg( s )*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = ctemp
+                 end do
+                 do jr = ifrstm, j
+                    ctemp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -conjg( s )*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = ctemp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       ctemp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -conjg( s )*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = ctemp
+                    end do
+                 end if
+              end do loop_150
+              160 continue
+           end do loop_170
+           ! drop-through = non-convergence
+           180 continue
+           info = ilast
+           go to 210
+           ! successful completion of all qz steps
+           190 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_wscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_wscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_wscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_wscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           210 continue
+           work( 1 ) = cmplx( n,KIND=qp)
+           return
+     end subroutine stdlib_whgeqz
+
+     !> ZHPCON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian packed matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_whpcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_whptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_whpcon
+
+     !> ZHPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix in packed storage.
+
+     subroutine stdlib_whpev( jobz, uplo, n, ap, w, z, ldz, work, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=qp)
+              rwork( 1 ) = 1
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_wdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_whptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           call stdlib_whptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_wupgtr to generate the orthogonal matrix, then call stdlib_wsteqr.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_wupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              indrwk = inde + n
+              call stdlib_wsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_whpev
+
+     !> ZHPEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian matrix A in packed storage.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_whpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,rwork, lrwork, iwork, &
+               liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale, liwmin, llrwk, &
+                     llwrk, lrwmin, lwmin
+           real(qp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=qp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_wlanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_wdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_whptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indrwk = inde + n
+           indwrk = indtau + n
+           llwrk = lwork - indwrk + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_whptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_qsterf.  for eigenvectors, first call
+           ! stdlib_wupgtr to generate the orthogonal matrix, then call stdlib_wstedc.
+           if( .not.wantz ) then
+              call stdlib_qsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_wstedc( 'I', n, w, rwork( inde ), z, ldz, work( indwrk ),llwrk, rwork( &
+                        indrwk ), llrwk, iwork, liwork,info )
+              call stdlib_wupmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_whpevd
+
+     !> ZHPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A in packed storage.
+     !> Eigenvalues/vectors can be selected by specifying either a range of
+     !> values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_whpevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, rwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, nsplit
+           real(qp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( ap( 1 ),KIND=qp)
+              else
+                 if( vl<real( ap( 1 ),KIND=qp) .and. vu>=real( ap( 1 ),KIND=qp) ) then
+                    m = 1
+                    w( 1 ) = real( ap( 1 ),KIND=qp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_wlanhp( 'M', uplo, n, ap, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_wdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_whptrd to reduce hermitian packed matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           call stdlib_whptrd( uplo, n, ap, rwork( indd ), rwork( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_qsterf or stdlib_wupgtr and stdlib_wsteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_qstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_qcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_qsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_wupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_qcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_wsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_qstebz and, if eigenvectors are desired, stdlib_wstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_qstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_wstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_wstein.
+              indwrk = indtau + n
+              call stdlib_wupmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_qscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_whpevx
+
+     !> ZHPGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by ZPPTRF.
+
+     pure subroutine stdlib_whpgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(qp) :: ajj, akk, bjj, bkk
+           complex(qp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    ap( jj ) = real( ap( jj ),KIND=qp)
+                    bjj = real( bp( jj ),KIND=qp)
+                    call stdlib_wtpsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', j,bp, ap( j1 ), 1 &
+                              )
+                    call stdlib_whpmv( uplo, j-1, -cone, ap, bp( j1 ), 1, cone,ap( j1 ), 1 )
+                              
+                    call stdlib_wdscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_wdotc( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( ap( kk ),KIND=qp)
+                    bkk = real( bp( kk ),KIND=qp)
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_wdscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_waxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_whpr2( uplo, n-k, -cone, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 &
+                                 ) )
+                       call stdlib_waxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_wtpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( ap( kk ),KIND=qp)
+                    bkk = real( bp( kk ),KIND=qp)
+                    call stdlib_wtpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_waxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_whpr2( uplo, k-1, cone, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_waxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_wdscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = real( ap( jj ),KIND=qp)
+                    bjj = real( bp( jj ),KIND=qp)
+                    ap( jj ) = ajj*bjj + stdlib_wdotc( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_wdscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_whpmv( uplo, n-j, cone, ap( j1j1 ), bp( jj+1 ), 1,cone, ap( jj+1 )&
+                              , 1 )
+                    call stdlib_wtpmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j+1, bp( jj ), &
+                              ap( jj ), 1 )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_whpgst
+
+     !> ZHPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_whpgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*), bp(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_wpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_whpev( jobz, uplo, n, ap, w, z, ldz, work, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_wtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_wtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_whpgv
+
+     !> ZHPGVD: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_whpgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*), bp(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lrwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_wpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_whpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, rwork,lrwork, iwork, &
+                     liwork, info )
+           lwmin = max( real( lwmin,KIND=qp), real( work( 1 ),KIND=qp) )
+           lrwmin = max( real( lrwmin,KIND=qp), real( rwork( 1 ),KIND=qp) )
+           liwmin = max( real( liwmin,KIND=qp), real( iwork( 1 ),KIND=qp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_wtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_wtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_whpgvd
+
+     !> ZHPGVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.  Eigenvalues and eigenvectors can be selected by
+     !> specifying either a range of values or a range of indices for the
+     !> desired eigenvalues.
+
+     subroutine stdlib_whpgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(out) :: rwork(*), w(*)
+           complex(qp), intent(inout) :: ap(*), bp(*)
+           complex(qp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_wpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_whpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_whpevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, rwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, m
+                    call stdlib_wtpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_wtpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_whpgvx
+
+     !> ZHPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_whprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_whpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=qp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=qp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_whptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_whptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_whptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_whprfs
+
+     !> ZHPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is Hermitian and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_whpsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_whptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_whptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_whpsv
+
+     !> ZHPSVX: uses the diagonal pivoting factorization A = U*D*U**H or
+     !> A = L*D*L**H to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_whpsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(inout) :: afp(*)
+           complex(qp), intent(in) :: ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_wcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_whptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_whpcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_whptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_whprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_whpsvx
+
+     !> ZHPTRD: reduces a complex Hermitian matrix A stored in packed form to
+     !> real symmetric tridiagonal form T by a unitary similarity
+     !> transformation: Q**H * A * Q = T.
+
+     pure subroutine stdlib_whptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           complex(qp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              ap( i1+n-1 ) = real( ap( i1+n-1 ),KIND=qp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = ap( i1+i-1 )
+                 call stdlib_wlarfg( i, alpha, ap( i1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=qp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_whpmv( uplo, i, taui, ap, ap( i1 ), 1, czero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_wdotc( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_waxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_whpr2( uplo, i, -cone, ap( i1 ), 1, tau, 1, ap )
+                 end if
+                 ap( i1+i-1 ) = e( i )
+                 d( i+1 ) = real( ap( i1+i ),KIND=qp)
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = real( ap( 1 ),KIND=qp)
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              ap( 1 ) = real( ap( 1 ),KIND=qp)
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = ap( ii+1 )
+                 call stdlib_wlarfg( n-i, alpha, ap( ii+2 ), 1, taui )
+                 e( i ) = real( alpha,KIND=qp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_whpmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,czero, tau( i ),&
+                               1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_wdotc( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_waxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_whpr2( uplo, n-i, -cone, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                 end if
+                 ap( ii+1 ) = e( i )
+                 d( i ) = real( ap( ii ),KIND=qp)
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = real( ap( ii ),KIND=qp)
+           end if
+           return
+     end subroutine stdlib_whptrd
+
+     !> ZHPTRF: computes the factorization of a complex Hermitian packed
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_whptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(qp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(qp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc+k-1 ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=qp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc+imax-1 ),KIND=qp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_wswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = conjg( ap( knc+j-1 ) )
+                       ap( knc+j-1 ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( kx+kk-1 ) = conjg( ap( kx+kk-1 ) )
+                    r1 = real( ap( knc+kk-1 ),KIND=qp)
+                    ap( knc+kk-1 ) = real( ap( kpc+kp-1 ),KIND=qp)
+                    ap( kpc+kp-1 ) = r1
+                    if( kstep==2 ) then
+                       ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=qp)
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 else
+                    ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=qp)
+                    if( kstep==2 )ap( kc-1 ) = real( ap( kc-1 ),KIND=qp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( ap( kc+k-1 ),KIND=qp)
+                    call stdlib_whpr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_wdscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_qlapy2( real( ap( k-1+( k-1 )*k / 2 ),KIND=qp),aimag( ap( k-1+( &
+                                 k-1 )*k / 2 ) ) )
+                       d22 = real( ap( k-1+( k-2 )*( k-1 ) / 2 ),KIND=qp) / d
+                       d11 = real( ap( k+( k-1 )*k / 2 ),KIND=qp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = ap( k-1+( k-1 )*k / 2 ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-conjg( d12 )*ap( j+( k-1 )*k &
+                                    / 2 ) )
+                          wk = d*( d22*ap( j+( k-1 )*k / 2 )-d12*ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *conjg( wk ) -ap( i+( k-2 )*( k-1 ) / 2 )*conjg( wkm1 )
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                          ap( j+( j-1 )*j / 2 ) = cmplx( real( ap( j+( j-1 )*j / 2 ),KIND=qp), &
+                                    zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc ) = real( ap( kc ),KIND=qp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc ),KIND=qp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = conjg( ap( knc+j-kk ) )
+                       ap( knc+j-kk ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( knc+kp-kk ) = conjg( ap( knc+kp-kk ) )
+                    r1 = real( ap( knc ),KIND=qp)
+                    ap( knc ) = real( ap( kpc ),KIND=qp)
+                    ap( kpc ) = r1
+                    if( kstep==2 ) then
+                       ap( kc ) = real( ap( kc ),KIND=qp)
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 else
+                    ap( kc ) = real( ap( kc ),KIND=qp)
+                    if( kstep==2 )ap( knc ) = real( ap( knc ),KIND=qp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( ap( kc ),KIND=qp)
+                       call stdlib_whpr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_wdscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_qlapy2( real( ap( k+1+( k-1 )*( 2*n-k ) / 2 ),KIND=qp),aimag( &
+                                 ap( k+1+( k-1 )*( 2*n-k ) / 2 ) ) )
+                       d11 = real( ap( k+1+k*( 2*n-k-1 ) / 2 ),KIND=qp) / d
+                       d22 = real( ap( k+( k-1 )*( 2*n-k ) / 2 ),KIND=qp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 ) / d
+                       d = tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-d21*ap( j+k*( 2*n-k-1 ) / 2 )&
+                                     )
+                          wkp1 = d*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-conjg( d21 )*ap( j+( k-1 )*( &
+                                    2*n-k ) /2 ) )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                             i+( k-1 )*( 2*n-k ) /2 )*conjg( wk ) - ap( i+k*( 2*n-k-1 ) / 2 )&
+                                       *conjg( wkp1 )
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                          ap( j+( j-1 )*( 2*n-j ) / 2 )= cmplx( real( ap( j+( j-1 )*( 2*n-j ) / 2 &
+                                    ),KIND=qp),zero,KIND=qp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_whptrf
+
+     !> ZHPTRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHPTRF.
+
+     pure subroutine stdlib_whptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(qp) :: ak, akp1, d, t
+           complex(qp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / real( ap( kc+k-1 ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_wdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = real( ap( kc+k-1 ),KIND=qp) / t
+                 akp1 = real( ap( kcnext+k ),KIND=qp) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_wdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=qp)
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_wdotc( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_wcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -real( stdlib_wdotc( k-1, work, 1, ap( kcnext &
+                              ),1 ),KIND=qp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_wswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = conjg( ap( kc+j-1 ) )
+                    ap( kc+j-1 ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-1 ) = conjg( ap( kc+kp-1 ) )
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / real( ap( kc ),KIND=qp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_wdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=qp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = real( ap( kcnext ),KIND=qp) / t
+                 akp1 = real( ap( kc ),KIND=qp) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_wdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=qp)
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_wdotc( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_wcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_whpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -real( stdlib_wdotc( n-k, work, 1, ap( kcnext+2 ),&
+                              1 ),KIND=qp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_wswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = conjg( ap( kc+j-k ) )
+                    ap( kc+j-k ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-k ) = conjg( ap( kc+kp-k ) )
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_whptri
+
+     !> ZHPTRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A stored in packed format using the factorization
+     !> A = U*D*U**H or A = L*D*L**H computed by ZHPTRF.
+
+     pure subroutine stdlib_whptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(qp) :: s
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( ap( kc+k-1 ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_wgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_wgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc+k ),&
+                               1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=qp) / real( ap( kc ),KIND=qp)
+                 call stdlib_wdscal( nrhs, s, b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / conjg( akm1k )
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc-( n-k ) ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_wlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_whptrs
+
+     !> ZHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a complex upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_whsein( side, eigsrc, initv, select, n, h, ldh, w, vl,ldvl, vr, ldvr, mm, &
+               m, work, rwork, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: h(ldh,*)
+           complex(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), w(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0e+0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ks, ldwork
+           real(qp) :: eps3, hnorm, smlnum, ulp, unfl
+           complex(qp) :: cdum, wk
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -12
+           else if( mm<m ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ldwork = n
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ks = 1
+           loop_100: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are czero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==czero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==czero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_wlanhs( 'I', kr-kl+1, h( kl, kl ), ldh, rwork )
+                    if( stdlib_qisnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( hnorm>rzero ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wk = w( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. cabs1( w( i )-wk )<eps3 ) then
+                       wk = wk + eps3
+                       go to 60
+                    end if
+                 end do
+                 w( k ) = wk
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_wlaein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wk, vl( kl, ks )&
+                              , work, ldwork, rwork, eps3,smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifaill( ks ) = k
+                    else
+                       ifaill( ks ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ks ) = czero
+                    end do
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_wlaein( .true., noinit, kr, h, ldh, wk, vr( 1, ks ),work, ldwork, &
+                              rwork, eps3, smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifailr( ks ) = k
+                    else
+                       ifailr( ks ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ks ) = czero
+                    end do
+                 end if
+                 ks = ks + 1
+              end if
+           end do loop_100
+           return
+     end subroutine stdlib_whsein
+
+     !> ZHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*T*(QZ)**H.
+
+     pure subroutine stdlib_whseqr( job, compz, n, ilo, ihi, h, ldh, w, z, ldz,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           real(qp), parameter :: rzero = 0.0_qp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_wlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_wlahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           
+           ! Local Arrays 
+           complex(qp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max,min
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = cmplx( real( max( 1, n ),KIND=qp), rzero,KIND=qp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -10
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'ZHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_wlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi, z,ldz, work, &
+                        lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( work( 1 ),KIND=qp), real( max( 1,n ),KIND=qp) ), &
+                        rzero,KIND=qp)
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_wgebal ====
+              if( ilo>1 )call stdlib_wcopy( ilo-1, h, ldh+1, w, 1 )
+              if( ihi<n )call stdlib_wcopy( n-ihi, h( ihi+1, ihi+1 ), ldh+1, w( ihi+1 ), 1 )
+                        
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_wlaset( 'A', n, n, czero, cone, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 w( ilo ) = h( ilo, ilo )
+                 return
+              end if
+              ! ==== stdlib_wlahqr/stdlib_wlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_wlaqr0 for big matrices; stdlib_wlahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_wlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, work, &
+                           lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_wlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, info )
+                           
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_wlahqr failure!  stdlib_wlaqr0 sometimes succeeds
+                    ! .    when stdlib_wlahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_wlaqr0 directly. ====
+                       call stdlib_wlaqr0( wantt, wantz, n, ilo, kbot, h, ldh, w,ilo, ihi, z, ldz,&
+                                  work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_wlaqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_wlaqr0. ====
+                       call stdlib_wlacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = czero
+                       call stdlib_wlaset( 'A', nl, nl-n, czero, czero, hl( 1, n+1 ),nl )
+                       call stdlib_wlaqr0( wantt, wantz, nl, ilo, kbot, hl, nl, w,ilo, ihi, z, &
+                                 ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_wlacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_wlaset( 'L', n-2, n-2, czero, &
+                        czero, h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( max( 1, n ),KIND=qp),real( work( 1 ),KIND=qp) ), &
+                        rzero,KIND=qp)
+           end if
+     end subroutine stdlib_whseqr
+
+     !> ZLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_wla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           complex(qp), intent(in) :: ab(ldab,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_wero = symb_wero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_wero = symb_wero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wla_gbamv
+
+     !> ZLA_GBRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(qp) function stdlib_wla_gbrcond_c( trans, n, kl, ku, ab,ldab, afb, ldafb, ipiv,c, &
+               capply, info, work,rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, kl, ku, ldab, ldafb
+           integer(ilp) :: kd, ke
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(in) :: c(*)
+           real(qp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, anorm, tmp
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_wla_gbrcond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_GBRCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_wla_gbrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_wgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_wgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv, &
+                              work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_wgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv,  &
+                              work, n, info )
+                 else
+                    call stdlib_wgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_wla_gbrcond_c = one / ainvnm
+           return
+     end function stdlib_wla_gbrcond_c
+
+     !> ZLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(qp) function stdlib_wla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           complex(qp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(qp) :: amax, umax, rpvgrw
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( cabs1( ab( kd+i-j, j ) ), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( cabs1( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_wla_gbrpvgrw = rpvgrw
+     end function stdlib_wla_gbrpvgrw
+
+     !> ZLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_wla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           integer(ilp), intent(in) :: trans
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_wero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wla_geamv
+
+     !> ZLA_GERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(qp) function stdlib_wla_gercond_c( trans, n, a, lda, af,ldaf, ipiv, c, capply,info, &
+               work, rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(in) :: c(*)
+           real(qp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, anorm, tmp
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_wla_gercond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_GERCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_wla_gercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if (notrans) then
+                    call stdlib_wgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_wgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_wgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_wgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_wla_gercond_c = one / ainvnm
+           return
+     end function stdlib_wla_gercond_c
+
+     !> ZLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(qp) function stdlib_wla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: amax, umax, rpvgrw
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,min,abs,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( cabs1( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( cabs1( af( i, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_wla_gerpvgrw = rpvgrw
+     end function stdlib_wla_gerpvgrw
+
+     !> ZLA_SYAMV  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_wla_heamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZHEMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_wero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wla_heamv
+
+     !> ZLA_HERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(qp) function stdlib_wla_hercond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, work,&
+                rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(in) :: c(*)
+           real(qp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(qp) :: ainvnm, anorm, tmp
+           logical(lk) :: up, upper
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_wla_hercond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_HERCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_wla_hercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_whetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_whetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_whetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_whetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_wla_hercond_c = one / ainvnm
+           return
+     end function stdlib_wla_hercond_c
+
+     !> ZLA_HERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(qp) function stdlib_wla_herpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(qp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper, lsame
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if (upper) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i,j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i,j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_wsytrs.
+           ! calls to stdlib_dswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ) , work( k+1 ) )
+                    end do
+                    work(k) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_wla_herpvgrw = rpvgrw
+     end function stdlib_wla_herpvgrw
+
+     !> ZLA_LIN_BERR: computes componentwise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the componentwise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_wla_lin_berr( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: ayb(n,nrhs)
+           real(qp), intent(out) :: berr(nrhs)
+           complex(qp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: tmp,safe1
+           integer(ilp) :: i, j
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           complex(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the cla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= zero) then
+                    tmp = (safe1 + cabs1(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_qp (and if computed by cla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_wla_lin_berr
+
+     !> ZLA_PORCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector
+
+     real(qp) function stdlib_wla_porcond_c( uplo, n, a, lda, af,ldaf, c, capply, info,work, &
+               rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(in) :: c(*)
+           real(qp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(qp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_wla_porcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_PORCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_wla_porcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_wpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_wpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_wpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_wpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_wla_porcond_c = one / ainvnm
+           return
+     end function stdlib_wla_porcond_c
+
+     !> ZLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(qp) function stdlib_wla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_qpotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_wla_porpvgrw = rpvgrw
+     end function stdlib_wla_porpvgrw
+
+     !> ZLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_wla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           integer(ilp), intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           real(qp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_wero
+           real(qp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_wero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_wero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_wero = .true.
+                    else
+                       symb_wero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_wero = symb_wero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_wero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wla_syamv
+
+     !> ZLA_SYRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(qp) function stdlib_wla_syrcond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, work,&
+                rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(in) :: c(*)
+           real(qp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(qp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_wla_syrcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_SYRCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_wla_syrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_wsytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_wsytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_wsytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_wsytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_wla_syrcond_c = one / ainvnm
+           return
+     end function stdlib_wla_syrcond_c
+
+     !> ZLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(qp) function stdlib_wla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(qp), intent(out) :: work(*)
+           integer(ilp), intent(in) :: ipiv(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(qp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_wsytrs.
+           ! calls to stdlib_dswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_wla_syrpvgrw = rpvgrw
+     end function stdlib_wla_syrpvgrw
+
+     !> ZLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_wla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: x(*), y(*)
+           complex(qp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(qp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_wla_wwaddw
+
+     !> ZLABRD: reduces the first NB rows and columns of a complex general
+     !> m by n matrix A to upper or lower real bidiagonal form by a unitary
+     !> transformation Q**H * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by ZGEBRD
+
+     pure subroutine stdlib_wlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(qp), intent(out) :: d(*), e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_wlacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_wgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, cone, a( i, i ), 1 )
+                 call stdlib_wlacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_wgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, x( i, 1 ),ldx, a( 1, i ), &
+                           1, cone, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_wlarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=qp)
+                 if( i<n ) then
+                    a( i, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i+1, n-i, cone,a( i, i+1 ), lda, &
+                              a( i, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,a( i, 1 ), lda, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,x( i, 1 ), ldx, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, y(&
+                               1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_wscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_wlacgv( i, a( i, 1 ), lda )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-i, i, -cone, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, cone, a( i, i+1 ), lda )
+                    call stdlib_wlacgv( i, a( i, 1 ), lda )
+                    call stdlib_wlacgv( i-1, x( i, 1 ), ldx )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, x(&
+                               i, 1 ), ldx, cone,a( i, i+1 ), lda )
+                    call stdlib_wlacgv( i-1, x( i, 1 ), ldx )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    alpha = a( i, i+1 )
+                    call stdlib_wlarfg( n-i, alpha, a( i, min( i+2, n ) ), lda,taup( i ) )
+                    e( i ) = real( alpha,KIND=qp)
+                    a( i, i+1 ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, n-i, cone, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-i, i, cone,y( i+1, 1 ), ldy, a( i,&
+                               i+1 ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i, -cone, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, cone, x( i+1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, czero, x( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_wscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+                 call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, cone, a( i, i ), lda )
+                 call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_wlacgv( i-1, x( i, 1 ), ldx )
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i-1, n-i+1, -cone,a( 1, i ), lda, x( i,&
+                            1 ), ldx, cone, a( i, i ),lda )
+                 call stdlib_wlacgv( i-1, x( i, 1 ), ldx )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 alpha = a( i, i )
+                 call stdlib_wlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=qp)
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, n-i+1, cone, a( i+1, i ),lda, a( i, i &
+                              ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-i+1, i-1, cone,y( i, 1 ), ldy, a( &
+                              i, i ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', i-1, n-i+1, cone, a( 1, i ),lda, a( i, i ),&
+                               lda, czero, x( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_wscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+                    ! update a(i+1:m,i)
+                    call stdlib_wlacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, y( i, 1 )&
+                              , ldy, cone, a( i+1, i ), 1 )
+                    call stdlib_wlacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_wgemv( 'NO TRANSPOSE', m-i, i, -cone, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, cone, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_wlarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=qp)
+                    a( i+1, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i, n-i, cone,a( i+1, i+1 ), lda, &
+                              a( i+1, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-i, i, cone,x( i+1, 1 ), ldx, a( i+&
+                              1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i, n-i, -cone,a( 1, i+1 ), lda, y( &
+                              1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_wscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 else
+                    call stdlib_wlacgv( n-i+1, a( i, i ), lda )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_wlabrd
+
+     !> ZLACGV: conjugates a complex vector of length N.
+
+     pure subroutine stdlib_wlacgv( n, x, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: x(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ioff
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( incx==1 ) then
+              do i = 1, n
+                 x( i ) = conjg( x( i ) )
+              end do
+           else
+              ioff = 1
+              if( incx<0 )ioff = 1 - ( n-1 )*incx
+              do i = 1, n
+                 x( ioff ) = conjg( x( ioff ) )
+                 ioff = ioff + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_wlacgv
+
+     !> ZLACN2: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_wlacn2( n, v, x, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(qp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: isave(3)
+           complex(qp), intent(out) :: v(*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(qp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag
+           ! Executable Statements 
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=qp),KIND=qp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_qzsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=qp) / absxi,aimag( x( i ) ) / absxi,KIND=qp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_iwmax1( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( isave( 2 ) ) = cone
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_wcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_qzsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=qp) / absxi,aimag( x( i ) ) / absxi,KIND=qp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_iwmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) &
+                     then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one+real( i-1,KIND=qp) / real( n-1,KIND=qp) ),KIND=qp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_qzsum1( n, x, 1 ) / real( 3*n,KIND=qp) )
+           if( temp>est ) then
+              call stdlib_wcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_wlacn2
+
+     !> ZLACON: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_wlacon( n, v, x, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(qp), intent(inout) :: est
+           ! Array Arguments 
+           complex(qp), intent(out) :: v(n)
+           complex(qp), intent(inout) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(qp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=qp),KIND=qp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_qzsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=qp) / absxi,aimag( x( i ) ) / absxi,KIND=qp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           j = stdlib_iwmax1( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( j ) = cone
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_wcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_qzsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=qp) / absxi,aimag( x( i ) ) / absxi,KIND=qp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = j
+           j = stdlib_iwmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( j ) ) ) .and.( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one+real( i-1,KIND=qp) / real( n-1,KIND=qp) ),KIND=qp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_qzsum1( n, x, 1 ) / real( 3*n,KIND=qp) )
+           if( temp>est ) then
+              call stdlib_wcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_wlacon
+
+     !> ZLACP2: copies all or part of a real two-dimensional matrix A to a
+     !> complex matrix B.
+
+     pure subroutine stdlib_wlacp2( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_wlacp2
+
+     !> ZLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_wlacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_wlacpy
+
+     !> ZLACRM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by N and complex; B is N by N and real;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_wlacrm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: b(ldb,*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( a( i, j ),KIND=qp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_qgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( a( i, j ) )
+              end do
+           end do
+           call stdlib_qgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=qp),rwork( l+( j-1 )*m+i-1 ),KIND=qp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_wlacrm
+
+     !> ZLACRT: performs the operation
+     !> (  c  s )( x )  ==> ( x )
+     !> ( -s  c )( y )      ( y )
+     !> where c and s are complex and the vectors x and y are complex.
+
+     pure subroutine stdlib_wlacrt( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(qp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(qp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(qp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              ctemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - s*cx( ix )
+              cx( ix ) = ctemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              ctemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - s*cx( i )
+              cx( i ) = ctemp
+           end do
+           return
+     end subroutine stdlib_wlacrt
+
+     !> ZLADIV: := X / Y, where X and Y are complex.  The computation of X / Y
+     !> will not overflow on an intermediary step unless the results
+     !> overflows.
+
+     pure complex(qp)     function stdlib_wladiv( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: x, y
+        ! =====================================================================
+           ! Local Scalars 
+           real(qp) :: zi, zr
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           call stdlib_qladiv( real( x,KIND=qp), aimag( x ), real( y,KIND=qp), aimag( y ), zr,zi )
+                     
+           stdlib_wladiv = cmplx( zr, zi,KIND=qp)
+           return
+     end function stdlib_wladiv
+
+     !> Using the divide and conquer method, ZLAED0: computes all eigenvalues
+     !> of a symmetric tridiagonal matrix which is one diagonal block of
+     !> those from reducing a dense or band Hermitian matrix and
+     !> corresponding eigenvectors of the dense or band matrix.
+
+     pure subroutine stdlib_wlaed0( qsiz, n, d, e, q, ldq, qstore, ldqs, rwork,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldqs, n, qsiz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: q(ldq,*)
+           complex(qp), intent(out) :: qstore(ldqs,*)
+        ! =====================================================================
+        ! warning:      n could be as big as qsiz!
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, ll, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, &
+                     subpbs, tlvls
+           real(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq < 0 .or. icompq > 2 ) then
+              ! info = -1
+           ! else if( ( icompq == 1 ) .and. ( qsiz < max( 0, n ) ) )
+          ! $        then
+           if( qsiz<max( 0, n ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( ldqs<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'ZLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           ! set up workspaces for eigenvalues only/accumulate new vectors
+           ! routine
+           temp = log( real( n,KIND=qp) ) / log( two )
+           lgn = int( temp,KIND=ilp)
+           if( 2**lgn<n )lgn = lgn + 1
+           if( 2**lgn<n )lgn = lgn + 1
+           iprmpt = indxq + n + 1
+           iperm = iprmpt + n*lgn
+           iqptr = iperm + n*lgn
+           igivpt = iqptr + n + 2
+           igivcl = igivpt + n*lgn
+           igivnm = 1
+           iq = igivnm + 2*n*lgn
+           iwrem = iq + n**2 + 1
+           ! initialize pointers
+           do i = 0, subpbs
+              iwork( iprmpt+i ) = 1
+              iwork( igivpt+i ) = 1
+           end do
+           iwork( iqptr ) = 1
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              ll = iq - 1 + iwork( iqptr+curr )
+              call stdlib_qsteqr( 'I', matsiz, d( submat ), e( submat ),rwork( ll ), matsiz, &
+                        rwork, info )
+              call stdlib_wlacrm( qsiz, matsiz, q( 1, submat ), ldq, rwork( ll ),matsiz, qstore( &
+                        1, submat ), ldqs,rwork( iwrem ) )
+              iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+              curr = curr + 1
+              if( info>0 ) then
+                 info = submat*( n+1 ) + submat + matsiz - 1
+                 return
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.  stdlib_wlaed7 handles the case
+           ! when the eigenvectors of a full or band hermitian matrix (which
+           ! was reduced to tridiagonal form) are desired.
+           ! i am free to use q as a valuable working space until loop 150.
+                 call stdlib_wlaed7( matsiz, msd2, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                 qstore( 1, submat ), ldqs,e( submat+msd2-1 ), iwork( indxq+submat ),rwork( iq ), &
+                 iwork( iqptr ), iwork( iprmpt ),iwork( iperm ), iwork( igivpt ),iwork( igivcl ), &
+                           rwork( igivnm ),q( 1, submat ), rwork( iwrem ),iwork( subpbs+1 ), info )
+                 if( info>0 ) then
+                    info = submat*( n+1 ) + submat + matsiz - 1
+                    return
+                 end if
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           do i = 1, n
+              j = iwork( indxq+i )
+              rwork( i ) = d( j )
+              call stdlib_wcopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+           end do
+           call stdlib_qcopy( n, rwork, 1, d, 1 )
+           return
+     end subroutine stdlib_wlaed0
+
+     !> ZLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense or banded
+     !> Hermitian matrix that has been reduced to tridiagonal form.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**H ) Q**H(in) = Q(out) * D(out) * Q**H(out)
+     !> where Z = Q**Hu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by SLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_wlaed7( n, cutpnt, qsiz, tlvls, curlvl, curpbm, d, q,ldq, rho, indxq, &
+               qstore, qptr, prmptr, perm,givptr, givcol, givnum, work, rwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(qp), intent(inout) :: d(*), givnum(2,*), qstore(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: q(ldq,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq, iw, iz, k, n1, n2, &
+                     ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq<0 .or. icompq>1 ) then
+              ! info = -1
+           ! else if( n<0 ) then
+           if( n<0 ) then
+              info = -1
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_qlaed2 and stdlib_dlaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_qlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, rwork( iz ),rwork( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_wlaed8( k, n, qsiz, q, ldq, d, rho, cutpnt, rwork( iz ),rwork( idlmda ), &
+           work, qsiz, rwork( iw ),iwork( indxp ), iwork( indx ), indxq,perm( prmptr( curr ) ), &
+           givptr( curr+1 ),givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), info )
+                     
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_qlaed9( k, 1, k, n, d, rwork( iq ), k, rho,rwork( idlmda ), rwork( iw ),&
+                        qstore( qptr( curr ) ), k, info )
+              call stdlib_wlacrm( qsiz, k, work, qsiz, qstore( qptr( curr ) ), k, q,ldq, rwork( &
+                        iq ) )
+              qptr( curr+1 ) = qptr( curr ) + k**2
+              if( info/=0 ) then
+                 return
+              end if
+           ! prepare the indxq sorting premutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_qlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           return
+     end subroutine stdlib_wlaed7
+
+     !> ZLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_wlaed8( k, n, qsiz, q, ldq, d, rho, cutpnt, z, dlamda,q2, ldq2, w, &
+               indxp, indx, indxq, perm, givptr,givcol, givnum, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(qp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(qp), intent(inout) :: d(*), z(*)
+           real(qp), intent(out) :: dlamda(*), givnum(2,*), w(*)
+           complex(qp), intent(inout) :: q(ldq,*)
+           complex(qp), intent(out) :: q2(ldq2,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: mone = -1.0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(qp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -5
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -8
+           else if( ldq2<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_qscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_qscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_qlamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_iqamax( n, z, 1 )
+           jmax = stdlib_iqamax( n, d, 1 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! -- except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              do j = 1, n
+                 perm( j ) = indxq( indx( j ) )
+                 call stdlib_wcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+              call stdlib_wlacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ), ldq )
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jlam = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_qlapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 call stdlib_wdrot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j ) &
+                           ) ), 1, c, s )
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 80 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 80
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 70
+           90 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           100 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           do j = 1, n
+              jp = indxp( j )
+              dlamda( j ) = d( jp )
+              perm( j ) = indxq( indx( jp ) )
+              call stdlib_wcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_qcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_wlacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2, q( 1, k+1 ),ldq )
+           end if
+           return
+     end subroutine stdlib_wlaed8
+
+     !> ZLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue W of a complex upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_wlaein( rightv, noinit, n, h, ldh, w, v, b, ldb, rwork,eps3, smlnum, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(qp), intent(in) :: eps3, smlnum
+           complex(qp), intent(in) :: w
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(out) :: b(ldb,*)
+           complex(qp), intent(in) :: h(ldh,*)
+           complex(qp), intent(inout) :: v(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: tenth = 1.0e-1_qp
+           
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, ierr, its, j
+           real(qp) :: growto, nrmsml, rootn, rtemp, scale, vnorm
+           complex(qp) :: cdum, ei, ej, temp, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=qp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - w*i (except that the subdiagonal elements are not
+           ! stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - w
+           end do
+           if( noinit ) then
+              ! initialize v.
+              do i = 1, n
+                 v( i ) = eps3
+              end do
+           else
+              ! scale supplied initial vector.
+              vnorm = stdlib_qznrm2( n, v, 1 )
+              call stdlib_wdscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), v, 1 )
+           end if
+           if( rightv ) then
+              ! lu decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do i = 1, n - 1
+                 ei = h( i+1, i )
+                 if( cabs1( b( i, i ) )<cabs1( ei ) ) then
+                    ! interchange rows and eliminate.
+                    x = stdlib_wladiv( b( i, i ), ei )
+                    b( i, i ) = ei
+                    do j = i + 1, n
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( i, i )==czero )b( i, i ) = eps3
+                    x = stdlib_wladiv( ei, b( i, i ) )
+                    if( x/=czero ) then
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( n, n )==czero )b( n, n ) = eps3
+              trans = 'N'
+           else
+              ! ul decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do j = n, 2, -1
+                 ej = h( j, j-1 )
+                 if( cabs1( b( j, j ) )<cabs1( ej ) ) then
+                    ! interchange columns and eliminate.
+                    x = stdlib_wladiv( b( j, j ), ej )
+                    b( j, j ) = ej
+                    do i = 1, j - 1
+                       temp = b( i, j-1 )
+                       b( i, j-1 ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( j, j )==czero )b( j, j ) = eps3
+                    x = stdlib_wladiv( ej, b( j, j ) )
+                    if( x/=czero ) then
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( 1, 1 )==czero )b( 1, 1 ) = eps3
+              trans = 'C'
+           end if
+           normin = 'N'
+           do its = 1, n
+              ! solve u*x = scale*v for a right eigenvector
+                ! or u**h *x = scale*v for a left eigenvector,
+              ! overwriting x on v.
+              call stdlib_wlatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb, v,scale, rwork, &
+                        ierr )
+              normin = 'Y'
+              ! test for sufficient growth in the norm of v.
+              vnorm = stdlib_qzasum( n, v, 1 )
+              if( vnorm>=growto*scale )go to 120
+              ! choose new orthogonal starting vector and try again.
+              rtemp = eps3 / ( rootn+one )
+              v( 1 ) = eps3
+              do i = 2, n
+                 v( i ) = rtemp
+              end do
+              v( n-its+1 ) = v( n-its+1 ) - eps3*rootn
+           end do
+           ! failure to find eigenvector in n iterations.
+           info = 1
+           120 continue
+           ! normalize eigenvector.
+           i = stdlib_iwamax( n, v, 1 )
+           call stdlib_wdscal( n, one / cabs1( v( i ) ), v, 1 )
+           return
+     end subroutine stdlib_wlaein
+
+     !> ZLAESY: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> ( ( A, B );( B, C ) )
+     !> provided the norm of the matrix of eigenvectors is larger than
+     !> some threshold value.
+     !> RT1 is the eigenvalue of larger absolute value, and RT2 of
+     !> smaller absolute value.  If the eigenvectors are computed, then
+     !> on return ( CS1, SN1 ) is the unit eigenvector for RT1, hence
+     !> [  CS1     SN1   ] . [ A  B ] . [ CS1    -SN1   ] = [ RT1  0  ]
+     !> [ -SN1     CS1   ]   [ B  C ]   [ SN1     CS1   ]   [  0  RT2 ]
+
+     pure subroutine stdlib_wlaesy( a, b, c, rt1, rt2, evscal, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: a, b, c
+           complex(qp), intent(out) :: cs1, evscal, rt1, rt2, sn1
+       ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1_qp
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           real(qp) :: babs, evnorm, tabs, z
+           complex(qp) :: s, t, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! special case:  the matrix is actually diagonal.
+           ! to avoid divide by zero later, we treat this case separately.
+           if( abs( b )==zero ) then
+              rt1 = a
+              rt2 = c
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+                 cs1 = zero
+                 sn1 = one
+              else
+                 cs1 = one
+                 sn1 = zero
+              end if
+           else
+              ! compute the eigenvalues and eigenvectors.
+              ! the characteristic equation is
+                 ! lambda **2 - (a+c) lambda + (a*c - b*b)
+              ! and we solve it using the quadratic formula.
+              s = ( a+c )*half
+              t = ( a-c )*half
+              ! take the square root carefully to avoid over/under flow.
+              babs = abs( b )
+              tabs = abs( t )
+              z = max( babs, tabs )
+              if( z>zero )t = z*sqrt( ( t / z )**2+( b / z )**2 )
+              ! compute the two eigenvalues.  rt1 and rt2 are exchanged
+              ! if necessary so that rt1 will have the greater magnitude.
+              rt1 = s + t
+              rt2 = s - t
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+              end if
+              ! choose cs1 = 1 and sn1 to satisfy the first equation, then
+              ! scale the components of this eigenvector so that the matrix
+              ! of eigenvectors x satisfies  x * x**t = i .  (no scaling is
+              ! done if the norm of the eigenvalue matrix is less than thresh.)
+              sn1 = ( rt1-a ) / b
+              tabs = abs( sn1 )
+              if( tabs>one ) then
+                 t = tabs*sqrt( ( one / tabs )**2+( sn1 / tabs )**2 )
+              else
+                 t = sqrt( cone+sn1*sn1 )
+              end if
+              evnorm = abs( t )
+              if( evnorm>=thresh ) then
+                 evscal = cone / t
+                 cs1 = evscal
+                 sn1 = sn1*evscal
+              else
+                 evscal = zero
+              end if
+           end if
+           return
+     end subroutine stdlib_wlaesy
+
+     !> ZLAEV2: computes the eigendecomposition of a 2-by-2 Hermitian matrix
+     !> [  A         B  ]
+     !> [  CONJG(B)  C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  CONJG(SN1) ] [    A     B ] [ CS1 -CONJG(SN1) ] = [ RT1  0  ]
+     !> [-SN1     CS1     ] [ CONJG(B) C ] [ SN1     CS1     ]   [  0  RT2 ].
+
+     pure subroutine stdlib_wlaev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(out) :: cs1, rt1, rt2
+           complex(qp), intent(in) :: a, b, c
+           complex(qp), intent(out) :: sn1
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: t
+           complex(qp) :: w
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg
+           ! Executable Statements 
+           if( abs( b )==zero ) then
+              w = one
+           else
+              w = conjg( b ) / abs( b )
+           end if
+           call stdlib_qlaev2( real( a,KIND=qp), abs( b ), real( c,KIND=qp), rt1, rt2, cs1, t )
+                     
+           sn1 = w*t
+           return
+     end subroutine stdlib_wlaev2
+
+     !> ZLAG2C: converts a COMPLEX*16 matrix, SA, to a COMPLEX matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> ZLAG2C checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_wlag2c( m, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           complex(dp), intent(out) :: sa(ldsa,*)
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: rmax
+           ! Intrinsic Functions 
+           intrinsic :: real,aimag
+           ! Executable Statements 
+           rmax = stdlib_dlamch( 'O' )
+           do j = 1, n
+              do i = 1, m
+                 if( ( real( a( i, j ),KIND=qp)<-rmax ) .or.( real( a( i, j ),KIND=qp)>rmax ) &
+                           .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) then
+                    info = 1
+                    go to 30
+                 end if
+                 sa( i, j ) = a( i, j )
+              end do
+           end do
+           info = 0
+           30 continue
+           return
+     end subroutine stdlib_wlag2c
+
+     !> ZLAGS2: computes 2-by-2 unitary matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**H *A*Q = U**H *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**H*B*Q = V**H *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**H *A*Q = U**H *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**H *B*Q = V**H *( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> where
+     !> U = (   CSU    SNU ), V = (  CSV    SNV ),
+     !> ( -SNU**H  CSU )      ( -SNV**H CSV )
+     !> Q = (   CSQ    SNQ )
+     !> ( -SNQ**H  CSQ )
+     !> The rows of the transformed A and B are parallel. Moreover, if the
+     !> input 2-by-2 matrix A is not zero, then the transformed (1,1) entry
+     !> of A is not zero. If the input matrices A and B are both not zero,
+     !> then the transformed (2,2) element of B is not zero, except when the
+     !> first rows of input A and B are parallel and the second rows are
+     !> zero.
+
+     pure subroutine stdlib_wlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(qp), intent(in) :: a1, a3, b1, b3
+           real(qp), intent(out) :: csq, csu, csv
+           complex(qp), intent(in) :: a2, b2
+           complex(qp), intent(out) :: snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: a, aua11, aua12, aua21, aua22, avb12, avb11, avb21, avb22, csl, csr, d, fb,&
+                      fc, s1, s2, snl, snr, ua11r, ua22r, vb11r, vb22r
+           complex(qp) :: b, c, d1, r, t, ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( t ) = abs( real( t,KIND=qp) ) + abs( aimag( t ) )
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              fb = abs( b )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(1,d1).
+              d1 = one
+              if( fb/=zero )d1 = b / fb
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_qlasv2( a, fb, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + d1*snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + d1*snr*b3
+                 aua12 = abs( csl )*abs1( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs1( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**h *a and v**h *b
+                 if( ( abs( ua11r )+abs1( ua12 ) )==zero ) then
+                    call stdlib_wlartg( -cmplx( vb11r,KIND=qp), conjg( vb12 ), csq, snq,r )
+                              
+                 else if( ( abs( vb11r )+abs1( vb12 ) )==zero ) then
+                    call stdlib_wlartg( -cmplx( ua11r,KIND=qp), conjg( ua12 ), csq, snq,r )
+                              
+                 else if( aua12 / ( abs( ua11r )+abs1( ua12 ) )<=avb12 /( abs( vb11r )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_wlartg( -cmplx( ua11r,KIND=qp), conjg( ua12 ), csq, snq,r )
+                              
+                 else
+                    call stdlib_wlartg( -cmplx( vb11r,KIND=qp), conjg( vb12 ), csq, snq,r )
+                              
+                 end if
+                 csu = csl
+                 snu = -d1*snl
+                 csv = csr
+                 snv = -d1*snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -conjg( d1 )*snl*a1
+                 ua22 = -conjg( d1 )*snl*a2 + csl*a3
+                 vb21 = -conjg( d1 )*snr*b1
+                 vb22 = -conjg( d1 )*snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs1( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs1( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua21 )+abs1( ua22 ) )==zero ) then
+                    call stdlib_wlartg( -conjg( vb21 ), conjg( vb22 ), csq, snq,r )
+                 else if( ( abs1( vb21 )+abs( vb22 ) )==zero ) then
+                    call stdlib_wlartg( -conjg( ua21 ), conjg( ua22 ), csq, snq,r )
+                 else if( aua22 / ( abs1( ua21 )+abs1( ua22 ) )<=avb22 /( abs1( vb21 )+abs1( vb22 &
+                           ) ) ) then
+                    call stdlib_wlartg( -conjg( ua21 ), conjg( ua22 ), csq, snq,r )
+                 else
+                    call stdlib_wlartg( -conjg( vb21 ), conjg( vb22 ), csq, snq,r )
+                 end if
+                 csu = snl
+                 snu = d1*csl
+                 csv = snr
+                 snv = d1*csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              fc = abs( c )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(d1,1).
+              d1 = one
+              if( fc/=zero )d1 = c / fc
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_qlasv2( a, fc, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -d1*snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -d1*snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs1( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs1( b2 )
+                 ! zero (2,1) elements of u**h *a and v**h *b.
+                 if( ( abs1( ua21 )+abs( ua22r ) )==zero ) then
+                    call stdlib_wlartg( cmplx( vb22r,KIND=qp), vb21, csq, snq, r )
+                 else if( ( abs1( vb21 )+abs( vb22r ) )==zero ) then
+                    call stdlib_wlartg( cmplx( ua22r,KIND=qp), ua21, csq, snq, r )
+                 else if( aua21 / ( abs1( ua21 )+abs( ua22r ) )<=avb21 /( abs1( vb21 )+abs( vb22r &
+                           ) ) ) then
+                    call stdlib_wlartg( cmplx( ua22r,KIND=qp), ua21, csq, snq, r )
+                 else
+                    call stdlib_wlartg( cmplx( vb22r,KIND=qp), vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -conjg( d1 )*snr
+                 csv = csl
+                 snv = -conjg( d1 )*snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11 = csr*a1 + conjg( d1 )*snr*a2
+                 ua12 = conjg( d1 )*snr*a3
+                 vb11 = csl*b1 + conjg( d1 )*snl*b2
+                 vb12 = conjg( d1 )*snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs1( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs1( b2 )
+                 ! zero (1,1) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua11 )+abs1( ua12 ) )==zero ) then
+                    call stdlib_wlartg( vb12, vb11, csq, snq, r )
+                 else if( ( abs1( vb11 )+abs1( vb12 ) )==zero ) then
+                    call stdlib_wlartg( ua12, ua11, csq, snq, r )
+                 else if( aua11 / ( abs1( ua11 )+abs1( ua12 ) )<=avb11 /( abs1( vb11 )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_wlartg( ua12, ua11, csq, snq, r )
+                 else
+                    call stdlib_wlartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = conjg( d1 )*csr
+                 csv = snl
+                 snv = conjg( d1 )*csl
+              end if
+           end if
+           return
+     end subroutine stdlib_wlags2
+
+     !> ZLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_wlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b + a**t * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b + a**h * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j ) +conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) + conjg( du( n-1 ) )*x( n-1, j ) + conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + conjg( du( i-1 ) )*x( i-1, j ) + conjg( d( i ) )&
+                                    *x( i, j ) + conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b - a**t *x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b - a**h *x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j ) -conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) - conjg( du( n-1 ) )*x( n-1, j ) - conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - conjg( du( i-1 ) )*x( i-1, j ) - conjg( d( i ) )&
+                                    *x( i, j ) - conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wlagtm
+
+     !> ZLAHEF: computes a partial factorization of a complex Hermitian
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method. The
+     !> partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> ZLAHEF is an auxiliary routine called by ZHETRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlahef( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(qp) :: absakk, alpha, colmax, r1, rowmax, t
+           complex(qp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              ! copy column k of a to column kw of w and update it
+              call stdlib_wcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=qp)
+              if( k<n ) then
+                 call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=qp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_wcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    w( imax, kw-1 ) = real( a( imax, imax ),KIND=qp)
+                    call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    call stdlib_wlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                    if( k<n ) then
+                       call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( imax,&
+                                  kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=qp)
+                    end if
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, kw-1 ),KIND=qp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_wlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_wcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=qp)
+                       call stdlib_wdscal( k-1, r1, a( 1, k ), 1 )
+                       ! (2) conjugate column w(kw)
+                       call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = conjg( d21 )*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_wlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_wswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              ! copy column k of a to column k of w and update it
+              w( k, k ) = real( a( k, k ),KIND=qp)
+              if( k<n )call stdlib_wcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              w( k, k ) = real( w( k, k ),KIND=qp)
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=qp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_wlacgv( imax-k, w( k, k+1 ), 1 )
+                    w( imax, k+1 ) = real( a( imax, imax ),KIND=qp)
+                    if( imax<n )call stdlib_wcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 ), &
+                              1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    w( imax, k+1 ) = real( w( imax, k+1 ),KIND=qp)
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, k+1 ),KIND=qp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_wlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_wcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=qp)
+                       call stdlib_wdscal( n-k, r1, a( k+1, k ), 1 )
+                       ! (2) conjugate column w(k)
+                       call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = conjg( d21 )*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_wlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_wswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlahef
+
+     !> DLAHEF_AA factorizes a panel of a complex hermitian matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_wlahef_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(qp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_whetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j, j:n) - h(j:n, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:n, j) has been initialized to be a(j, j:n)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_wlacgv( j-k1, a( 1, j ), 1 )
+                 call stdlib_wgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+                 call stdlib_wlacgv( j-k1, a( 1, j ), 1 )
+              end if
+              ! copy h(i:n, i) into work
+              call stdlib_wcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:n) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:n) stores u(j-1, j:n)
+                 alpha = -conjg( a( k-1, j ) )
+                 call stdlib_waxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = real( work( 1 ),KIND=qp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l(j, (j+1):n)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):n) stores u(j, (j+1):n)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_waxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_iwamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:n) with a(i1+1:n, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_wswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    call stdlib_wlacgv( i2-i1, a( j1+i1-1, i1+1 ), lda )
+                    call stdlib_wlacgv( i2-i1-1, a( j1+i1, i2 ), 1 )
+                    ! swap a(i1, i2+1:n) with a(i2, i2+1:n)
+                    if( i2<m )call stdlib_wswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_wswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_wswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j:n, j),
+                    call stdlib_wcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_wcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_wscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_wlaset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_whetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j:n, j) - h(j:n, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:n, j) has been initialized to be a(j:n, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_wlacgv( j-k1, a( j, 1 ), lda )
+                 call stdlib_wgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+                 call stdlib_wlacgv( j-k1, a( j, 1 ), lda )
+              end if
+              ! copy h(j:n, j) into work
+              call stdlib_wcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:n, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -conjg( a( j, k-1 ) )
+                 call stdlib_waxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = real( work( 1 ),KIND=qp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l((j+1):n, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):n, j-1) = l((j+1):n, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_waxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_iwamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:n, i1) with a(i2, i1+1:n)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_wswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    call stdlib_wlacgv( i2-i1, a( i1+1, j1+i1-1 ), 1 )
+                    call stdlib_wlacgv( i2-i1-1, a( i2, j1+i1 ), lda )
+                    ! swap a(i2+1:n, i1) with a(i2+1:n, i2)
+                    if( i2<m )call stdlib_wswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_wswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_wswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j+1:n, j),
+                    call stdlib_wcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_wcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_wscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_wlaset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_wlahef_aa
+
+     !> ZLAHEF_RK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLAHEF_RK is an auxiliary routine called by ZHETRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlahef_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: w(ldw,*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, k, kk, kkw, kp, kstep, kw, p
+           real(qp) :: absakk, alpha, colmax, dtemp, r1, rowmax, t, sfmin
+           complex(qp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_wcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=qp)
+              if( k<n ) then
+                 call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=qp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=qp)
+                 if( k>1 )call stdlib_wcopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_wcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=qp)
+                       call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_wlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=qp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=qp)
+                    call stdlib_wcopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_wlacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_wcopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_wlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_wcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=qp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_wdscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_wlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=qp)
+              if( k<n )call stdlib_wcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=qp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=qp)
+                 if( k<n )call stdlib_wcopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_wlacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=qp)
+                       if( imax<n )call stdlib_wcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=qp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=qp)
+                    call stdlib_wcopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_wlacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_wcopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_wswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_wlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_wcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=qp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_wdscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_wlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlahef_rk
+
+     !> ZLAHEF_ROOK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting
+     !> method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> ZLAHEF_ROOK is an auxiliary routine called by ZHETRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlahef_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, jp1, jp2, k, kk, kkw, kp, kstep, kw, &
+                     p
+           real(qp) :: absakk, alpha, colmax, dtemp, r1, rowmax, t, sfmin
+           complex(qp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_wcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=qp)
+              if( k<n ) then
+                 call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=qp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=qp)
+                 if( k>1 )call stdlib_wcopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_wcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=qp)
+                       call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_wlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=qp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=qp)
+                    call stdlib_wcopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_wlacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_wcopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_wlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_wcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=qp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_wdscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_wlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_wlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in of rows in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j+1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_wswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = jj + 1
+                 if( kstep==2 .and. jp1/=jj .and. j<=n )call stdlib_wswap( n-j+1, a( jp1, j ), &
+                           lda, a( jj, j ), lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=qp)
+              if( k<n )call stdlib_wcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=qp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=qp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=qp)
+                 if( k<n )call stdlib_wcopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_wlacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=qp)
+                       if( imax<n )call stdlib_wcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=qp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=qp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=qp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=qp)
+                    call stdlib_wcopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_wlacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_wcopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_wswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=qp)
+                    call stdlib_wcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_wlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_wcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=qp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=qp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_wdscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=qp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_wlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_wlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=qp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j-1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_wswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = jj -1
+                 if( kstep==2 .and. jp1/=jj .and. j>=1 )call stdlib_wswap( j, a( jp1, 1 ), lda, a(&
+                            jj, 1 ), lda )
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlahef_rook
+
+     !> ZLAHQR: is an auxiliary routine called by CHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by CHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_wlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, info &
+               )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: w(*)
+        ! =========================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0_qp
+           real(qp), parameter :: rone = 1.0_qp
+           real(qp), parameter :: dat1 = 3.0_qp/4.0_qp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           
+           ! Local Scalars 
+           complex(qp) :: cdum, h11, h11s, h22, sc, sum, t, t1, temp, u, v2, x, y
+           real(qp) :: aa, ab, ba, bb, h10, h21, rtemp, s, safmax, safmin, smlnum, sx, t2, tst, &
+                     ulp
+           integer(ilp) :: i, i1, i2, its, itmax, j, jhi, jlo, k, l, m, nh, nz, kdefl
+           ! Local Arrays 
+           complex(qp) :: v(2)
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              w( ilo ) = h( ilo, ilo )
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = czero
+              h( j+3, j ) = czero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = czero
+           ! ==== ensure that subdiagonal entries are real ====
+           if( wantt ) then
+              jlo = 1
+              jhi = n
+           else
+              jlo = ilo
+              jhi = ihi
+           end if
+           do i = ilo + 1, ihi
+              if( aimag( h( i, i-1 ) )/=rzero ) then
+                 ! ==== the following redundant normalization
+                 ! .    avoids problems with both gradual and
+                 ! .    sudden underflow in abs(h(i,i-1)) ====
+                 sc = h( i, i-1 ) / cabs1( h( i, i-1 ) )
+                 sc = conjg( sc ) / abs( sc )
+                 h( i, i-1 ) = abs( h( i, i-1 ) )
+                 call stdlib_wscal( jhi-i+1, sc, h( i, i ), ldh )
+                 call stdlib_wscal( min( jhi, i+1 )-jlo+1, conjg( sc ),h( jlo, i ), 1 )
+                 if( wantz )call stdlib_wscal( ihiz-iloz+1, conjg( sc ), z( iloz, i ), 1 )
+              end if
+           end do
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=qp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo, or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           30 continue
+           if( i<ilo )go to 150
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           l = ilo
+           loop_130: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( cabs1( h( k, k-1 ) )<=smlnum )go to 50
+                 tst = cabs1( h( k-1, k-1 ) ) + cabs1( h( k, k ) )
+                 if( tst==czero ) then
+                    if( k-2>=ilo )tst = tst + abs( real( h( k-1, k-2 ),KIND=qp) )
+                    if( k+1<=ihi )tst = tst + abs( real( h( k+1, k ),KIND=qp) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some examples.  ====
+                 if( abs( real( h( k, k-1 ),KIND=qp) )<=ulp*tst ) then
+                    ab = max( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    ba = min( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    aa = max( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 50
+                 end if
+              end do
+              50 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = czero
+              end if
+              ! exit from loop if a submatrix of order 1 has split off.
+              if( l>=i )go to 140
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( i, i-1 ),KIND=qp) )
+                 t = s + h( i, i )
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( l+1, l ),KIND=qp) )
+                 t = s + h( l, l )
+              else
+                 ! wilkinson's shift.
+                 t = h( i, i )
+                 u = sqrt( h( i-1, i ) )*sqrt( h( i, i-1 ) )
+                 s = cabs1( u )
+                 if( s/=rzero ) then
+                    x = half*( h( i-1, i-1 )-t )
+                    sx = cabs1( x )
+                    s = max( s, cabs1( x ) )
+                    y = s*sqrt( ( x / s )**2+( u / s )**2 )
+                    if( sx>rzero ) then
+                       if( real( x / sx,KIND=qp)*real( y,KIND=qp)+aimag( x / sx )*aimag( y )&
+                                 <rzero )y = -y
+                    end if
+                    t = t - u*stdlib_wladiv( u, ( x+y ) )
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 1, l + 1, -1
+                 ! determine the effect of starting the single-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.
+                 h11 = h( m, m )
+                 h22 = h( m+1, m+1 )
+                 h11s = h11 - t
+                 h21 = real( h( m+1, m ),KIND=qp)
+                 s = cabs1( h11s ) + abs( h21 )
+                 h11s = h11s / s
+                 h21 = h21 / s
+                 v( 1 ) = h11s
+                 v( 2 ) = h21
+                 h10 = real( h( m, m-1 ),KIND=qp)
+                 if( abs( h10 )*abs( h21 )<=ulp*( cabs1( h11s )*( cabs1( h11 )+cabs1( h22 ) ) ) )&
+                           go to 70
+              end do
+              h11 = h( l, l )
+              h22 = h( l+1, l+1 )
+              h11s = h11 - t
+              h21 = real( h( l+1, l ),KIND=qp)
+              s = cabs1( h11s ) + abs( h21 )
+              h11s = h11s / s
+              h21 = h21 / s
+              v( 1 ) = h11s
+              v( 2 ) = h21
+              70 continue
+              ! single-shift qr step
+              loop_120: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge cone step toward the bottom of the active
+                 ! submatrix.
+                 ! v(2) is always real before the call to stdlib_wlarfg, and hence
+                 ! after the call t2 ( = t1*v(2) ) is also real.
+                 if( k>m )call stdlib_wcopy( 2, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_wlarfg( 2, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = czero
+                 end if
+                 v2 = v( 2 )
+                 t2 = real( t1*v2,KIND=qp)
+                 ! apply g from the left to transform the rows of the matrix
+                 ! in columns k to i2.
+                 do j = k, i2
+                    sum = conjg( t1 )*h( k, j ) + t2*h( k+1, j )
+                    h( k, j ) = h( k, j ) - sum
+                    h( k+1, j ) = h( k+1, j ) - sum*v2
+                 end do
+                 ! apply g from the right to transform the columns of the
+                 ! matrix in rows i1 to min(k+2,i).
+                 do j = i1, min( k+2, i )
+                    sum = t1*h( j, k ) + t2*h( j, k+1 )
+                    h( j, k ) = h( j, k ) - sum
+                    h( j, k+1 ) = h( j, k+1 ) - sum*conjg( v2 )
+                 end do
+                 if( wantz ) then
+                    ! accumulate transformations in the matrix z
+                    do j = iloz, ihiz
+                       sum = t1*z( j, k ) + t2*z( j, k+1 )
+                       z( j, k ) = z( j, k ) - sum
+                       z( j, k+1 ) = z( j, k+1 ) - sum*conjg( v2 )
+                    end do
+                 end if
+                 if( k==m .and. m>l ) then
+                    ! if the qr step was started at row m > l because two
+                    ! consecutive small subdiagonals were found, then extra
+                    ! scaling must be performed to ensure that h(m,m-1) remains
+                    ! real.
+                    temp = cone - t1
+                    temp = temp / abs( temp )
+                    h( m+1, m ) = h( m+1, m )*conjg( temp )
+                    if( m+2<=i )h( m+2, m+1 ) = h( m+2, m+1 )*temp
+                    do j = m, i
+                       if( j/=m+1 ) then
+                          if( i2>j )call stdlib_wscal( i2-j, temp, h( j, j+1 ), ldh )
+                          call stdlib_wscal( j-i1, conjg( temp ), h( i1, j ), 1 )
+                          if( wantz ) then
+                             call stdlib_wscal( nz, conjg( temp ), z( iloz, j ),1 )
+                          end if
+                       end if
+                    end do
+                 end if
+              end do loop_120
+              ! ensure that h(i,i-1) is real.
+              temp = h( i, i-1 )
+              if( aimag( temp )/=rzero ) then
+                 rtemp = abs( temp )
+                 h( i, i-1 ) = rtemp
+                 temp = temp / rtemp
+                 if( i2>i )call stdlib_wscal( i2-i, conjg( temp ), h( i, i+1 ), ldh )
+                 call stdlib_wscal( i-i1, temp, h( i1, i ), 1 )
+                 if( wantz ) then
+                    call stdlib_wscal( nz, temp, z( iloz, i ), 1 )
+                 end if
+              end if
+           end do loop_130
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           140 continue
+           ! h(i,i-1) is negligible: cone eigenvalue has converged.
+           w( i ) = h( i, i )
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 30
+           150 continue
+           return
+     end subroutine stdlib_wlahqr
+
+     !> ZLAHR2: reduces the first NB columns of A complex general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an unitary similarity transformation
+     !> Q**H * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*V**H, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by ZGEHRD.
+
+     pure subroutine stdlib_wlahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**h
+                 call stdlib_wlacgv( i-1, a( k+i-1, 1 ), lda )
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-k, i-1, -cone, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, cone, a( k+1, i ), 1 )
+                 call stdlib_wlacgv( i-1, a( k+i-1, 1 ), lda )
+                 ! apply i - v * t**h * v**h to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**h * b1
+                 call stdlib_wcopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_wtrmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, &
+                           t( 1, nb ), 1 )
+                 ! w := w + v2**h * b2
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ),lda, a( &
+                           k+i, i ), 1, cone, t( 1, nb ), 1 )
+                 ! w := t**h * w
+                 call stdlib_wtrmv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, &
+                           nb ), 1 )
+                 ! b2 := b2 - v2*w
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -cone,a( k+i, 1 ),lda, t( 1, nb &
+                           ), 1, cone, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_wtrmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_waxpy( i-1, -cone, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_wlarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = cone
+              ! compute  y(k+1:n,i)
+              call stdlib_wgemv( 'NO TRANSPOSE', n-k, n-k-i+1,cone, a( k+1, i+1 ),lda, a( k+i, i )&
+                        , 1, czero, y( k+1, i ), 1 )
+              call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ), lda,a( k+&
+                        i, i ), 1, czero, t( 1, i ), 1 )
+              call stdlib_wgemv( 'NO TRANSPOSE', n-k, i-1, -cone,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        cone, y( k+1, i ), 1 )
+              call stdlib_wscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_wscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_wtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_wlacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,cone, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, cone,a( 1,&
+                      2+nb ), lda, a( k+1+nb, 1 ), lda, cone, y,ldy )
+           call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,cone, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_wlahr2
+
+     !> ZLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then ZLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L       0  ]
+     !> Lhat = [ w**H gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**H and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ conjg(alpha) ]
+     !> [ conjg(gamma) ]
+     !> where  alpha =  x**H * w.
+
+     pure subroutine stdlib_wlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(qp), intent(in) :: sest
+           real(qp), intent(out) :: sestpr
+           complex(qp), intent(out) :: c, s
+           complex(qp), intent(in) :: gamma
+           ! Array Arguments 
+           complex(qp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: absalp, absest, absgam, b, eps, norma, s1, s2, scl, t, test, tmp, zeta1, &
+                     zeta2
+           complex(qp) :: alpha, cosine, sine
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,sqrt
+           ! Executable Statements 
+           eps = stdlib_qlamch( 'EPSILON' )
+           alpha = stdlib_wdotc( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=qp)
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s2*scl
+                    s = ( alpha / s2 ) / scl
+                    c = ( gamma / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s1*scl
+                    s = ( alpha / s1 ) / scl
+                    c = ( gamma / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = real( c / ( b+sqrt( b*b+c ) ),KIND=qp)
+                 else
+                    t = real( sqrt( b*b+c ) - b,KIND=qp)
+                 end if
+                 sine = -( alpha / absest ) / t
+                 cosine = -( gamma / absest ) / ( one+t )
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=qp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -conjg( gamma )
+                    cosine = conjg( alpha )
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=qp)
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / scl )
+                    s = -( conjg( gamma ) / s2 ) / scl
+                    c = ( conjg( alpha ) / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest / scl
+                    s = -( conjg( gamma ) / s1 ) / scl
+                    c = ( conjg( alpha ) / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 norma = max( one+zeta1*zeta1+zeta1*zeta2,zeta1*zeta2+zeta2*zeta2 )
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = real( c / ( b+sqrt( abs( b*b-c ) ) ),KIND=qp)
+                    sine = ( alpha / absest ) / ( one-t )
+                    cosine = -( gamma / absest ) / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = -c / ( b+sqrt( b*b+c ) )
+                    else
+                       t = b - sqrt( b*b+c )
+                    end if
+                    sine = -( alpha / absest ) / t
+                    cosine = -( gamma / absest ) / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=qp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_wlaic1
+
+     !> ZLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_wlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(qp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jcol, jrow, m, n, nlp1
+           real(qp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_wdrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_wcopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_wcopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_wcopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_wdscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_100: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_qlamc3( poles( i, 2 ), &
+                                    dsigj )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_qlamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    rwork( 1 ) = negone
+                    temp = stdlib_qnrm2( k, rwork, 1 )
+                    ! since b and bx are complex, the following call to stdlib_qgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_qgemv( 't', k, nrhs, one, bx, ldbx, work, 1, zero,
+          ! $                     b( j, 1 ), ldb )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( bx( jrow, jcol ),KIND=qp)
+                       end do
+                    end do
+                    call stdlib_qgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( bx( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_qgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       b( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=qp)
+                    end do
+                    call stdlib_wlascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_100
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_wlacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_wcopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 loop_180: do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_qlamc3( dsigj, -poles( i+1,2 ) )-difr( i,&
+                                     1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_qlamc3( dsigj, -poles( i,2 ) )-difl( i ) &
+                                    ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    ! since b and bx are complex, the following call to stdlib_qgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_qgemv( 't', k, nrhs, one, b, ldb, work, 1, zero,
+          ! $                     bx( j, 1 ), ldbx )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( b( jrow, jcol ),KIND=qp)
+                       end do
+                    end do
+                    call stdlib_qgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_qgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       bx( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=qp)
+                                 
+                    end do
+                 end do loop_180
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_wcopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_wdrot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_wlacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb, bx( k+1, 1 &
+                        ),ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_wcopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_wcopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_wcopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_wdrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_wlals0
+
+     !> ZLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, ZLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by ZLALSA.
+
+     pure subroutine stdlib_wlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, rwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, jcol, jimag, jreal, jrow, lf, ll, lvl, lvl2, &
+                     nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_qlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 170.
+           if( icompq==1 ) then
+              go to 170
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_qlasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           loop_130: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_qgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_qgemm( 't', 'n', nl, nrhs, nl, one, u( nlf, 1 ), ldu,
+           ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1 ), nl )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1+nl*nrhs ),nl )
+              jreal = 0
+              jimag = nl*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_qgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_qgemm( 't', 'n', nr, nrhs, nr, one, u( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1 ), nr )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1+nr*nrhs ),nr )
+              jreal = 0
+              jimag = nr*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+           end do loop_130
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_wcopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_wlals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), rwork,info )
+              end do
+           end do
+           go to 330
+           ! icompq = 1: applying back the right singular vector factors.
+           170 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_190: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_wlals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), rwork,info )
+              end do
+           end do loop_190
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_qlasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           loop_320: do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_qgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_qgemm( 't', 'n', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,
+          ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero, rwork( 1 ),nlp1 )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero,rwork( 1+nlp1*nrhs ), nlp1 )
+              jreal = 0
+              jimag = nlp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_qgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_qgemm( 't', 'n', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero, rwork( 1 ),nrp1 )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero,rwork( 1+nrp1*nrhs ), nrp1 )
+              jreal = 0
+              jimag = nrp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+           end do loop_320
+           330 continue
+           return
+     end subroutine stdlib_wlalsa
+
+     !> ZLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_wlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, rwork, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(qp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           irwb, irwib, irwrb, irwu, irwvt, irwwrk, iwk, j, jcol, jimag, jreal, jrow, k, nlvl, &
+           nm1, nrwork, nsize, nsub, perm, poles, s, sizei, smlszp, sqre, st, st1, u, vt, &
+                     z
+           real(qp) :: cs, eps, orgnrm, rcnd, r, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALSD', -info )
+              return
+           end if
+           eps = stdlib_qlamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_wlaset( 'A', 1, nrhs, czero, czero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_wlascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_qlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_wdrot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    rwork( i*2-1 ) = cs
+                    rwork( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = rwork( j*2-1 )
+                       sn = rwork( j*2 )
+                       call stdlib_wdrot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_qlanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_wlaset( 'A', n, nrhs, czero, czero, b, ldb )
+              return
+           end if
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_qlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              irwu = 1
+              irwvt = irwu + n*n
+              irwwrk = irwvt + n*n
+              irwrb = irwwrk
+              irwib = irwrb + n*nrhs
+              irwb = irwib + n*nrhs
+              call stdlib_qlaset( 'A', n, n, zero, one, rwork( irwu ), n )
+              call stdlib_qlaset( 'A', n, n, zero, one, rwork( irwvt ), n )
+              call stdlib_qlasdq( 'U', 0, n, n, n, 0, d, e, rwork( irwvt ), n,rwork( irwu ), n, &
+                        rwork( irwwrk ), 1,rwork( irwwrk ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              ! in the real version, b is passed to stdlib_qlasdq and multiplied
+              ! internally by q**h. here b is complex and that product is
+              ! computed below in two steps (real and imaginary parts).
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+              tol = rcnd*abs( d( stdlib_iqamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_wlaset( 'A', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_wlascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              ! since b is complex, the following call to stdlib_qgemm is performed
+              ! in two steps (real and imaginary parts). that is for v * b
+              ! (in the real version of the code v**h is stored in work).
+              ! call stdlib_qgemm( 't', 'n', n, nrhs, n, one, work, n, b, ldb, zero,
+          ! $               work( nwork ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_qgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                 end do
+              end do
+              ! unscale.
+              call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_qlasrt( 'D', n, d, info )
+              call stdlib_wlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=qp) / real( smlsiz+1,KIND=qp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           nrwork = givnum + 2*nlvl*n
+           bx = 1
+           irwrb = nrwork
+           irwib = irwrb + smlsiz*nrhs
+           irwb = irwib + smlsiz*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_240: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_wcopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_wcopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_qlasdq.
+                    call stdlib_qlaset( 'A', nsize, nsize, zero, one,rwork( vt+st1 ), n )
+                    call stdlib_qlaset( 'A', nsize, nsize, zero, one,rwork( u+st1 ), n )
+                    call stdlib_qlasdq( 'U', 0, nsize, nsize, nsize, 0, d( st ),e( st ), rwork( &
+                    vt+st1 ), n, rwork( u+st1 ),n, rwork( nrwork ), 1, rwork( nrwork ),info )
+                              
+                    if( info/=0 ) then
+                       return
+                    end if
+                    ! in the real version, b is passed to stdlib_qlasdq and multiplied
+                    ! internally by q**h. here b is complex and that product is
+                    ! computed below in two steps (real and imaginary parts).
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = real( b( jrow, jcol ),KIND=qp)
+                       end do
+                    end do
+                    call stdlib_qgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwrb ), nsize )
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_qgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwib ), nsize )
+                    jreal = irwrb - 1
+                    jimag = irwib - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          jreal = jreal + 1
+                          jimag = jimag + 1
+                          b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                       end do
+                    end do
+                    call stdlib_wlacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_qlasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), rwork( u+&
+                    st1 ), n, rwork( vt+st1 ),iwork( k+st1 ), rwork( difl+st1 ),rwork( difr+st1 ),&
+                     rwork( z+st1 ),rwork( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), &
+                     n, iwork( perm+st1 ),rwork( givnum+st1 ), rwork( c+st1 ),rwork( s+st1 ), &
+                               rwork( nrwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_wlalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), &
+                     rwork( difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), &
+                     iwork( givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ),&
+                                rwork( s+st1 ),rwork( nrwork ), iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_240
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_iqamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_wlaset( 'A', 1, nrhs, czero, czero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_wlascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           loop_320: do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_wcopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 ! since b and bx are complex, the following call to stdlib_qgemm
+                 ! is performed in two steps (real and imaginary parts).
+                 ! call stdlib_qgemm( 't', 'n', nsize, nrhs, nsize, one,
+          ! $                  rwork( vt+st1 ), n, rwork( bxst ), n, zero,
+          ! $                  b( st, 1 ), ldb )
+                 j = bxst - n - 1
+                 jreal = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jreal = jreal + 1
+                       rwork( jreal ) = real( work( j+jrow ),KIND=qp)
+                    end do
+                 end do
+                 call stdlib_qgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwrb ), nsize )
+                 j = bxst - n - 1
+                 jimag = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jimag = jimag + 1
+                       rwork( jimag ) = aimag( work( j+jrow ) )
+                    end do
+                 end do
+                 call stdlib_qgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwib ), nsize )
+                 jreal = irwrb - 1
+                 jimag = irwib - 1
+                 do jcol = 1, nrhs
+                    do jrow = st, st + nsize - 1
+                       jreal = jreal + 1
+                       jimag = jimag + 1
+                       b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=qp)
+                    end do
+                 end do
+              else
+                 call stdlib_wlalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), rwork( &
+                  difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                  givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ), rwork( s+&
+                            st1 ),rwork( nrwork ), iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do loop_320
+           ! unscale and sort the singular values.
+           call stdlib_qlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_qlasrt( 'D', n, d, info )
+           call stdlib_wlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_wlalsd
+
+     !> ZLAMSWLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (ZLASWLQ)
+
+     pure subroutine stdlib_wlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_wgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_wtpmlqt('L','C',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_wtpmlqt('L','C',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1),ldt, c(1,&
+                           1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_wgemlqt('L','C',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(nb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_wgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_wtpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_wtpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_wtpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               ctr = ctr - 1
+               call stdlib_wtpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1, ctr * k + 1), &
+                         ldt, c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_wgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ii=n-kk+1
+              call stdlib_wgemlqt('R','C',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              ctr = 1
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_wtpmlqt('R','C',m , nb-k, k, 0,mb, a(1,i), lda,t(1,ctr *k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_wtpmlqt('R','C',m , kk, k, 0,mb, a(1,ii), lda,t(1, ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_wlamswlq
+
+     !> ZLAMTSQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (ZLATSQR)
+
+     pure subroutine stdlib_wlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = m * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_wgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_wtpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1, ctr * k + 1),ldt ,&
+                           c(1,1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_wtpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, &
+                          c(1,1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_wgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_wgemqrt('L','C',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_wtpmqrt('L','C',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_wtpmqrt('L','C',kk , n, k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_wtpmqrt('R','C',m , kk, k, 0, nb, a(ii,1), lda,t(1,ctr * k + 1), ldt,&
+                            c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_wtpmqrt('R','C',m , mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_wgemqrt('R','C',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_wgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_wtpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_wtpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_wlamtsqr
+
+     !> ZLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(qp) function stdlib_wlangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(qp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_wlassq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlangb = value
+           return
+     end function stdlib_wlangb
+
+     !> ZLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex matrix A.
+
+     real(qp) function stdlib_wlange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_qisnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_wlassq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlange = value
+           return
+     end function stdlib_wlange
+
+     !> ZLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex tridiagonal matrix A.
+
+     pure real(qp) function stdlib_wlangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_qisnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_qisnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_qisnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_qisnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_wlassq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_wlassq( n-1, dl, 1, scale, sum )
+                 call stdlib_wlassq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_wlangt = anorm
+           return
+     end function stdlib_wlangt
+
+     !> ZLANHB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n hermitian band matrix A,  with k super-diagonals.
+
+     real(qp) function stdlib_wlanhb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( ab( k+1, j ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( ab( 1, j ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    do i = 2, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( ab( k+1, j ),KIND=qp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ab( 1, j ),KIND=qp) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_wlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_wlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              do j = 1, n
+                 if( real( ab( l, j ),KIND=qp)/=zero ) then
+                    absa = abs( real( ab( l, j ),KIND=qp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlanhb = value
+           return
+     end function stdlib_wlanhb
+
+     !> ZLANHE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A.
+
+     real(qp) function stdlib_wlanhe( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j - 1
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( a( j, j ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( a( j, j ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    do i = j + 1, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( a( j, j ),KIND=qp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( a( j, j ),KIND=qp) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_wlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_wlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              do i = 1, n
+                 if( real( a( i, i ),KIND=qp)/=zero ) then
+                    absa = abs( real( a( i, i ),KIND=qp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlanhe = value
+           return
+     end function stdlib_wlanhe
+
+     !> ZLANHF:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian matrix A in RFP format.
+
+     real(qp) function stdlib_wlanhf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(0:*)
+           complex(qp), intent(in) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(qp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_wlanhf = zero
+              return
+           else if( n==1 ) then
+              stdlib_wlanhf = abs(real(a(0),KIND=qp))
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='c' or 'c' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'C' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is n by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do i = 1, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = k + j - 1
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = k + j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       do j = 0, k - 2
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j + 1
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do j = k, n - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       do j = k, n - 1
+                          do i = 0, j - k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j - k
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j - k + 1
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j - k + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k)
+                       temp = abs( real( a( j+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       temp = abs( real( a( j+1+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do i = 2, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j + 1
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = k + j
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j+1; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = k + j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       i = n
+                       ! -> u(k-1,k-1)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n+1
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k) at a(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do j = k + 1, n
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       do j = k + 1, n - 1
+                          do i = 0, j - k - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                          i = j - k - 1
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          i = j - k
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=qp) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          do i = j - k + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = n
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! u(k,k) at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=qp) )
+                       if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+             ! find normi(a) ( = norm1(a), since a is hermitian).
+              if( ifm==1 ) then
+                 ! a is 'n'
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( real( a( i+j*lda ),KIND=qp) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( real( a( 0+j*lda ),KIND=qp) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=qp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( real( a( 0+j*lda ),KIND=qp) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-1-k
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=qp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( real( a( 0 ),KIND=qp) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=qp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=qp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_qisnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_wlassq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_wlassq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k - 1
+                       ! -> u(k,k) at a(k-1,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=qp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       aa = real( a( l ),KIND=qp)
+                       ! u(n-1,n-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_wlassq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 1, k - 2
+                          call stdlib_wlassq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       aa = real( a( 0 ),KIND=qp)
+                       ! l(0,0) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(k,k) at a(0,1)
+                       do i = 1, k - 1
+                          aa = real( a( l ),KIND=qp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 2
+                          call stdlib_wlassq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_wlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_wlassq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda - lda
+                       ! -> u(k-1,k-1) at a(0,k-1)
+                       aa = real( a( l ),KIND=qp)
+                       ! u(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k)
+                       do j = k, n - 1
+                          aa = real( a( l ),KIND=qp)
+                          ! -> u(j-k,j-k)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_wlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_wlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_wlassq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(0,0) at a(0,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=qp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! l(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k-1 + (k-1)*lda or l(k-1,k-1) at a(k-1,k-1)
+                       aa = real( a( l ),KIND=qp)
+                       ! l(k-1,k-1) at a(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_wlassq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                       ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_wlassq( k+j, a( 0+j*lda ), 1, scale, s )
+                       ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k
+                       ! -> u(k,k) at a(k,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=qp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_wlassq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_wlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=qp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 1
+                          call stdlib_wlassq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                       ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_wlassq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_wlassq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                       ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda
+                       ! -> u(k,k) at a(0,k)
+                       aa = real( a( l ),KIND=qp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k+1)
+                       do j = k + 1, n - 1
+                          aa = real( a( l ),KIND=qp)
+                          ! -> u(j-k-1,j-k-1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l=k-1+n*lda
+                       ! -> u(k-1,k-1) at a(k-1,n)
+                       aa = real( a( l ),KIND=qp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_wlassq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                       ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_wlassq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_wlassq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                       ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       aa = real( a( l ),KIND=qp)
+                       ! l(k,k) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(0,0) at a(0,1)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=qp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=qp)
+                          ! l(k+i+1,k+i+1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k - 1 + k*lda or l(k-1,k-1) at a(k-1,k)
+                       aa = real( a( l ),KIND=qp)
+                       ! l(k-1,k-1) at a(k-1,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_wlanhf = value
+           return
+     end function stdlib_wlanhf
+
+     !> ZLANHP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A,  supplied in packed form.
+
+     real(qp) function stdlib_wlanhp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 0
+                 do j = 1, n
+                    do i = k + 1, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                    sum = abs( real( ap( k ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    sum = abs( real( ap( k ),KIND=qp) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    do i = k + 1, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( real( ap( k ),KIND=qp) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ap( k ),KIND=qp) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_wlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_wlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=qp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=qp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlanhp = value
+           return
+     end function stdlib_wlanhp
+
+     !> ZLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(qp) function stdlib_wlanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_wlassq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlanhs = value
+           return
+     end function stdlib_wlanhs
+
+     !> ZLANHT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian tridiagonal matrix A.
+
+     pure real(qp) function stdlib_wlanht( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           complex(qp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum =  abs( d( i ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_qisnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_wlassq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_qlassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_wlanht = anorm
+           return
+     end function stdlib_wlanht
+
+     !> ZLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(qp) function stdlib_wlansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_wlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_wlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_wlassq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlansb = value
+           return
+     end function stdlib_wlansb
+
+     !> ZLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A,  supplied in packed form.
+
+     real(qp) function stdlib_wlansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_wlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_wlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=qp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=qp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( aimag( ap( k ) )/=zero ) then
+                    absa = abs( aimag( ap( k ) ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlansp = value
+           return
+     end function stdlib_wlansp
+
+     !> ZLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A.
+
+     real(qp) function stdlib_wlansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_wlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_wlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_wlassq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlansy = value
+           return
+     end function stdlib_wlansy
+
+     !> ZLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(qp) function stdlib_wlantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_wlassq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_wlassq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_wlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_wlassq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlantb = value
+           return
+     end function stdlib_wlantb
+
+     !> ZLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(qp) function stdlib_wlantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_wlassq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_wlassq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_wlassq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_wlassq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlantp = value
+           return
+     end function stdlib_wlantp
+
+     !> ZLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(qp) function stdlib_wlantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(qp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_qisnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_wlassq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_wlassq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_wlassq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_wlassq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_wlantr = value
+           return
+     end function stdlib_wlantr
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_wlapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(qp), intent(out) :: ssmin
+           ! Array Arguments 
+           complex(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: ssmax
+           complex(qp) :: a11, a12, a22, c, tau
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_wlarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = cone
+           c = -conjg( tau )*stdlib_wdotc( n, x, incx, y, incy )
+           call stdlib_waxpy( n, c, x, incx, y, incy )
+           call stdlib_wlarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_qlas2( abs( a11 ), abs( a12 ), abs( a22 ), ssmin, ssmax )
+           return
+     end subroutine stdlib_wlapll
+
+     !> ZLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_wlapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           complex(qp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_wlapmr
+
+     !> ZLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_wlapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, in, j
+           complex(qp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_wlapmt
+
+     !> ZLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_wlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(qp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*), r(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_wlaqgb
+
+     !> ZLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_wlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(qp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*), r(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_wlaqge
+
+     !> ZLAQHB: equilibrates a Hermitian band matrix A
+     !> using the scaling factors in the vector S.
+
+     pure subroutine stdlib_wlaqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(out) :: s(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j - 1
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                    ab( kd+1, j ) = cj*cj*real( ab( kd+1, j ),KIND=qp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    ab( 1, j ) = cj*cj*real( ab( 1, j ),KIND=qp)
+                    do i = j + 1, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqhb
+
+     !> ZLAQHE: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_wlaqhe( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=qp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=qp)
+                    do i = j + 1, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqhe
+
+     !> ZLAQHP: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_wlaqhp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    ap( jc+j-1 ) = cj*cj*real( ap( jc+j-1 ),KIND=qp)
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    ap( jc ) = cj*cj*real( ap( jc ),KIND=qp)
+                    do i = j + 1, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqhp
+
+     !> ZLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_wlaqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: vn1(*), vn2(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(qp) :: temp, temp2, tol3z
+           complex(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_qlamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_iqamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_wswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_wlarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_wlarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**h to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = cone
+                 call stdlib_wlarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,conjg( tau( i ) ), a(&
+                            offpi, i+1 ), lda,work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_qznrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_wlaqp2
+
+     !> ZLAQPS: computes a step of QR factorization with column pivoting
+     !> of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_wlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(qp), intent(inout) :: vn1(*), vn2(*)
+           complex(qp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*)
+           complex(qp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(qp) :: temp, temp2, tol3z
+           complex(qp) :: akk
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min,nint,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_qlamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_iqamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_wswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_wswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**h.
+              if( k>1 ) then
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+                 call stdlib_wgemv( 'NO TRANSPOSE', m-rk+1, k-1, -cone, a( rk, 1 ),lda, f( k, 1 ),&
+                            ldf, cone, a( rk, k ), 1 )
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_wlarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_wlarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = cone
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**h*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), &
+                           lda, a( rk, k ), 1, czero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = czero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**h
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', m-rk+1, k-1, -tau( k ),a( rk, 1 ), lda,&
+                            a( rk, k ), 1, czero,auxv( 1 ), 1 )
+                 call stdlib_wgemv( 'NO TRANSPOSE', n, k-1, cone, f( 1, 1 ), ldf,auxv( 1 ), 1, &
+                           cone, f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**h.
+              if( k<n ) then
+                 call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', 1, n-k,k, -cone, a( rk,&
+                            1 ), lda, f( k+1, 1 ), ldf,cone, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=qp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**h.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-rk, n-kb,kb, -cone, a( &
+                        rk+1, 1 ), lda, f( kb+1, 1 ), ldf,cone, a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           60 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_qznrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_dnrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_qlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 60
+           end if
+           return
+     end subroutine stdlib_wlaqps
+
+     !> ZLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_wlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(qp), parameter :: wilk1 = 0.75_qp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_wlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(qp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(qp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(qp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,max,min,mod,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_wlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_wlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_wlaqr3 ====
+              call stdlib_wlaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_wlaqr5, stdlib_wlaqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+                 return
+              end if
+              ! ==== stdlib_wlahqr/stdlib_wlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_wlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_wlaqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_wlaqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_wlaqr4 or
+                       ! .    stdlib_wlahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_wlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_wlaqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, work, lwork, inf )
+                          else
+                             call stdlib_wlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_wlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+     end subroutine stdlib_wlaqr0
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, ZLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - s1*I)*(H - s2*I)
+     !> scaling to avoid overflows and most underflows.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_wlaqr1( n, h, ldh, s1, s2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(qp), intent(in) :: s1, s2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: h(ldh,*)
+           complex(qp), intent(out) :: v(*)
+        ! ================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0_qp
+           
+           
+           ! Local Scalars 
+           complex(qp) :: cdum, h21s, h31s
+           real(qp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) )
+              if( s==rzero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 )
+              end if
+           else
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) ) +cabs1( h( 3, 1 ) )
+              if( s==czero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+                 v( 3 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s ) +h( 1, 2 )*h21s + h( 1, 3 )&
+                           *h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 ) + h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-s1-s2 ) + h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_wlaqr1
+
+     !> ZLAQR2: is identical to ZLAQR3 except that it avoids
+     !> recursion by calling ZLAHQR instead of ZLAQR4.
+     !> Aggressive early deflation:
+     !> ZLAQR2 accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_wlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0_qp
+           real(qp), parameter :: rone = 1.0_qp
+           
+           
+           ! Local Scalars 
+           complex(qp) :: beta, cdum, s, tau
+           real(qp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_wgehrd ====
+              call stdlib_wgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_wunmhr ====
+              call stdlib_wunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_wlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_wcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_wlaset( 'A', jw, jw, czero, cone, v, ldv )
+           call stdlib_wlahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                     infqr )
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_wtrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_wtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_wtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_wcopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_wlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_wlaset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_wlarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_wlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_wlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_wgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_wlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_wcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_wunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_wgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_wlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_wgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_wlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_wgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_wlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+     end subroutine stdlib_wlaqr2
+
+     !> Aggressive early deflation:
+     !> ZLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_wlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0_qp
+           real(qp), parameter :: rone = 1.0_qp
+           
+           
+           ! Local Scalars 
+           complex(qp) :: beta, cdum, s, tau
+           real(qp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwk3, lwkopt, nmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_wgehrd ====
+              call stdlib_wgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_wunmhr ====
+              call stdlib_wunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_wlaqr4 ====
+              call stdlib_wlaqr4( .true., .true., jw, 1, jw, t, ldt, sh, 1, jw, v,ldv, work, -1, &
+                        infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_wlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_wcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_wlaset( 'A', jw, jw, czero, cone, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'ZLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_wlaqr4( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        work, lwork, infqr )
+           else
+              call stdlib_wlahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        infqr )
+           end if
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_wtrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_wtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_wtrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_wcopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_wlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_wlaset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_wlarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_wlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_wlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_wgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_wlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_wcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_wunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_wgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_wlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_wgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_wlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_wgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_wlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+     end subroutine stdlib_wlaqr3
+
+     !> ZLAQR4: implements one level of recursion for ZLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by ZLAQR0 and, for large enough
+     !> deflation window size, it may be called by ZLAQR3.  This
+     !> subroutine is identical to ZLAQR0 except that it calls ZLAQR2
+     !> instead of ZLAQR3.
+     !> ZLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_wlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(qp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(qp), parameter :: wilk1 = 0.75_qp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_wlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(qp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(qp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(qp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,max,min,mod,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_wlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_wlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_wlaqr2 ====
+              call stdlib_wlaqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_wlaqr5, stdlib_wlaqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+                 return
+              end if
+              ! ==== stdlib_wlahqr/stdlib_wlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_wlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_wlaqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_wlaqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_wlahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_wlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_wlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( ks )&
+                                    , 1, 1, zdum,1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_wlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=qp)
+     end subroutine stdlib_wlaqr4
+
+     !> ZLAQR5:, called by ZLAQR0, performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_wlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts, s,h, ldh, iloz, &
+               ihiz, z, ldz, v, ldv, u, ldu, nv,wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(qp), intent(inout) :: h(ldh,*), s(*), z(ldz,*)
+           complex(qp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(qp), parameter :: rzero = 0.0_qp
+           real(qp), parameter :: rone = 1.0_qp
+           
+           
+           ! Local Scalars 
+           complex(qp) :: alpha, beta, cdum, refsum
+           real(qp) :: h11, h12, h21, h22, safmax, safmin, scl, smlnum, tst1, tst2, ulp
+           integer(ilp) :: i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, krcol,&
+                      m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,mod
+           ! Local Arrays 
+           complex(qp) :: vt(3)
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by cone.  ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = czero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_wlaset( 'ALL', kdu, kdu, czero, cone, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    cone column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_wlaqr1( 2, h( k+1, k+1 ), ldh, s( 2*m22-1 ),s( 2*m22 ), v( 1, &
+                                 m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_wlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_wlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = czero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = conjg( v( 1, m22 ) )*( h( k+1, j )+conjg( v( 2, m22 ) )*h( k+2, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop ) then
+                       if( h( k+1, k )/=czero ) then
+                          tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                          if( tst1==rzero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                          end if
+                          if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) ) then
+                             h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( &
+                                       k+1, k ) = czero
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_wlaqr1( 3, h( ktop, ktop ), ldh, s( 2*m-1 ),s( 2*m ), v( 1, m )&
+                                  )
+                       alpha = v( 1, m )
+                       call stdlib_wlarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually czero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*conjg( v( 2, m ) )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) -refsum*conjg( v( 3, m ) )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge cone step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_wlarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=czero .or. h( k+3, k+1 )/=czero .or. h( k+3, k+2 )==czero &
+                                 ) then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = czero
+                          h( k+3, k ) = czero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new cone. ====
+                          call stdlib_wlaqr1( 3, h( k+1, k+1 ), ldh, s( 2*m-1 ),s( 2*m ), vt )
+                                    
+                          alpha = vt( 1 )
+                          call stdlib_wlarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = conjg( vt( 1 ) )*( h( k+1, k )+conjg( vt( 2 ) )*h( k+2, k ) )
+                                    
+                          if( cabs1( h( k+2, k )-refsum*vt( 2 ) )+cabs1( refsum*vt( 3 ) )>ulp*( &
+                          cabs1( h( k, k ) )+cabs1( h( k+1,k+1 ) )+cabs1( h( k+2, k+2 ) ) ) ) &
+                                    then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old cone with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new cone. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                       h( j, k+3 ) = h( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum =  conjg( v( 1, m ) )*( h( k+1, k+1 )+conjg( v( 2, m ) )*h( k+2, k+1 )+&
+                              conjg( v( 3, m ) )*h( k+3, k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=czero ) then
+                       tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                       if( tst1==rzero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                       end if
+                       if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( k+1,&
+                                     k ) = czero
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = conjg( v( 1, m ) )*( h( k+1, j )+conjg( v( 2, m ) )*h( k+2, j )+&
+                                 conjg( v( 3, m ) )*h( k+3, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m ) )
+                          u( j, kms+3 ) = u( j, kms+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                          z( j, k+3 ) = z( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_wgemm( 'C', 'N', nu, jlen, nu, cone, u( k1, k1 ),ldu, h( incol+k1,&
+                               jcol ), ldh, czero, wh,ldwh )
+                    call stdlib_wlacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_wgemm( 'N', 'N', jlen, nu, nu, cone,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, czero, wv, ldwv )
+                    call stdlib_wlacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_wgemm( 'N', 'N', jlen, nu, nu, cone,z( jrow, incol+k1 ), ldz, &
+                                 u( k1, k1 ),ldu, czero, wv, ldwv )
+                       call stdlib_wlacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_wlaqr5
+
+     !> ZLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_wlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqsb
+
+     !> ZLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_wlaqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqsp
+
+     !> ZLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_wlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: thresh = 0.1e+0_qp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_qlamch( 'SAFE MINIMUM' ) / stdlib_qlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_wlaqsy
+
+     !> ZLAQZ0: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by ZGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices, P and S are an upper triangular
+     !> matrices.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from ZGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the unitary factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_wlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alpha, &
+               beta, q, ldq, z,ldz, work, lwork, rwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           complex(qp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), &
+                     alpha( * ), beta( * ), work( * )
+           real(qp), intent( out ) :: rwork( * )
+           
+           
+           ! local scalars
+           real(qp) :: smlnum, ulp, safmin, safmax, c1, tempr
+           complex(qp) :: eshift, s1, temp
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_qeflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=qp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=qp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_whgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q,&
+                         ldq, z, ldz, work, lwork, rwork,info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_wlaqz2
+           nw = max( nwr, nmin )
+           call stdlib_wlaqz2( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_qeflated, alpha,beta, work, nw, work, nw, work, -1, rwork, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_wlaqz3
+           call stdlib_wlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alpha,beta, a, lda, b, &
+                     ldb, q, ldq, z, ldz, work, nbr,work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=qp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_wlaset( 'FULL', n, n, czero, cone, q,ldq )
+           if( iwantz==3 ) call stdlib_wlaset( 'FULL', n, n, czero, cone, z,ldz )
+           ! get machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 30*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+abs( &
+                        a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = czero
+                 istop = istop-1
+                 ld = 0
+                 eshift = czero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart ) )+&
+                        abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = czero
+                 istart = istart+1
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = czero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 tempr = zero
+                 if( k < istop ) then
+                    tempr = tempr+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    tempr = tempr+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*tempr ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_wlartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = czero
+                       call stdlib_wrot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_wrot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_wrot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_wlartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = czero
+                          call stdlib_wrot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_wrot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_wrot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, conjg( s1 ) )
+                                       
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_wlartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = czero
+                       call stdlib_wrot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_wrot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_wrot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, conjg(&
+                                     s1 ) )
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = czero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using qz_small because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_wlaqz2( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_qeflated,alpha, beta, work, nw, work( nw**2+1 ), nw,work( &
+                         2*nw**2+1 ), lwork-2*nw**2, rwork, rec,aed_info )
+              if ( n_qeflated > 0 ) then
+                 istop = istop-n_qeflated
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( 100*n_qeflated > nibble*( n_qeflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_qeflated-n_undeflated+1
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=qp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+cone/( safmin*real( maxit,KIND=qp) )
+                 end if
+                 alpha( shiftpos ) = cone
+                 beta( shiftpos ) = eshift
+                 ns = 1
+              end if
+              ! time for a qz sweep
+              call stdlib_wlaqz3( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alpha( &
+              shiftpos ), beta( shiftpos ), a, lda, b,ldb, q, ldq, z, ldz, work, nblock, work( &
+                        nblock**2+1 ), nblock, work( 2*nblock**2+1 ),lwork-2*nblock**2, sweep_info )
+           end do
+           ! call stdlib_whgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_whgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_whgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q, &
+                  ldq, z, ldz, work, lwork, rwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_wlaqz0
+
+     !> ZLAQZ1: chases a 1x1 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_wlaqz1( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           
+           ! local variables
+           real(qp) :: c
+           complex(qp) :: s, temp
+           if( k+1 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              call stdlib_wlartg( b( ihi, ihi ), b( ihi, ihi-1 ), c, s, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = czero
+              call stdlib_wrot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c, s )
+                        
+              call stdlib_wrot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_wrot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c, s )
+                           
+              end if
+           else
+              ! normal operation, move bulge down
+              ! apply transformation from the right
+              call stdlib_wlartg( b( k+1, k+1 ), b( k+1, k ), c, s, temp )
+              b( k+1, k+1 ) = temp
+              b( k+1, k ) = czero
+              call stdlib_wrot( k+2-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c, s )
+                        
+              call stdlib_wrot( k-istartm+1, b( istartm, k+1 ), 1, b( istartm, k ),1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_wrot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c, s )
+                           
+              end if
+              ! apply transformation from the left
+              call stdlib_wlartg( a( k+1, k ), a( k+2, k ), c, s, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = czero
+              call stdlib_wrot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda, c,s )
+              call stdlib_wrot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb, c,s )
+              if ( ilq ) then
+                 call stdlib_wrot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c, conjg(&
+                            s ) )
+              end if
+           end if
+     end subroutine stdlib_wlaqz1
+
+     !> ZLAQZ2: performs AED
+
+     recursive subroutine stdlib_wlaqz2( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alpha, beta, qc, ldqc, zc, ldzc,work, lwork, rwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           complex(qp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), &
+                     alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           complex(qp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(out) :: rwork(*)
+           
+           
+           ! local scalars
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, ztgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(qp) ::smlnum, ulp, safmin, safmax, c1, tempr
+           complex(qp) :: s, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = czero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_wlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alpha, beta, qc, ldqc, zc,ldzc, work, -1, rwork, rec+1, qz_small_info )
+           lworkreq = int( work( 1 ),KIND=ilp)+2*jw**2
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ2', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_qlabad( safmin, safmax )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=qp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alpha( kwtop ) = a( kwtop, kwtop )
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = czero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_wlacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_wlacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_wlaset( 'FULL', jw, jw, czero, cone, qc, ldqc )
+           call stdlib_wlaset( 'FULL', jw, jw, czero, cone, zc, ldzc )
+           call stdlib_wlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alpha, beta, qc, ldqc, zc,ldzc, work( 2*jw**2+1 ), lwork-2*jw**2, rwork,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_wlacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_wlacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == czero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                    ! try to deflate eigenvalue
+                    tempr = abs( a( kwbot, kwbot ) )
+                    if( tempr == zero ) then
+                       tempr = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*tempr, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_wtgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                                 kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, ztgexc_info )
+                       k2 = k2+1
+                    end if
+                    k = k+1
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              k = k+1
+           end do
+           if ( kwtop /= ilo .and. s /= czero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 ) *conjg( qc( 1,1:jw-nd ) )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_wlartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = czero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_wrot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_wrot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_wrot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, conjg( &
+                           s1 ) )
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 ! move bulge down and remove it
+                 do k2 = k, kwbot-1
+                    call stdlib_wlaqz1( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b, &
+                              ldb, jw, kwtop, qc, ldqc,jw, kwtop, zc, ldzc )
+                 end do
+                 k = k-1
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_wgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, czero, work, jw )
+              call stdlib_wlacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_wgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, czero, work, jw )
+              call stdlib_wlacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_wgemm( 'N', 'N', n, jw, jw, cone, q( 1, kwtop ), ldq, qc,ldqc, czero, &
+                        work, n )
+              call stdlib_wlacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_wgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, a( istartm,kwtop ), lda, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_wlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop )&
+                       , lda )
+              call stdlib_wgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_wlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop )&
+                       , ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_wgemm( 'N', 'N', n, jw, jw, cone, z( 1, kwtop ), ldz, zc,ldzc, czero, &
+                        work, n )
+              call stdlib_wlacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_wlaqz2
+
+     !> ZLAQZ3: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_wlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_qesired, alpha,&
+                beta, a, lda, b, ldb,q, ldq, z, ldz, qc, ldqc, zc, ldzc, work,lwork, info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_qesired, ldqc, ldzc
+           complex(qp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), qc( &
+                     ldqc, * ), zc( ldzc, * ), work( * ),alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: info
+           
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(qp) :: safmin, safmax, c, scale
+           complex(qp) :: temp, temp2, temp3, s
+           info = 0
+           if ( nblock_qesired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_qesired
+              return
+           else if ( lwork < n*nblock_qesired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ3', -info )
+              return
+           end if
+           ! executable statements
+           ! get machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_qlabad( safmin, safmax )
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ns = nshifts
+           npos = max( nblock_qesired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_wlaset( 'FULL', ns+1, ns+1, czero, cone, qc, ldqc )
+           call stdlib_wlaset( 'FULL', ns, ns, czero, cone, zc, ldzc )
+           do i = 1, ns
+              ! introduce the shift
+              scale = sqrt( abs( alpha( i ) ) ) * sqrt( abs( beta( i ) ) )
+              if( scale >= safmin .and. scale <= safmax ) then
+                 alpha( i ) = alpha( i )/scale
+                 beta( i ) = beta( i )/scale
+              end if
+              temp2 = beta( i )*a( ilo, ilo )-alpha( i )*b( ilo, ilo )
+              temp3 = beta( i )*a( ilo+1, ilo )
+              if ( abs( temp2 ) > safmax .or.abs( temp3 ) > safmax ) then
+                 temp2 = cone
+                 temp3 = czero
+              end if
+              call stdlib_wlartg( temp2, temp3, c, s, temp )
+              call stdlib_wrot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c,s )
+              call stdlib_wrot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c,s )
+              call stdlib_wrot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c,conjg( s ) )
+              ! chase the shift down
+              do j = 1, ns-i
+                 call stdlib_wlaqz1( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ilo, ilo+&
+                        ns ), lda, czero, work, sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ilo, ilo+&
+                        ns ), ldb, czero, work, sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+              call stdlib_wgemm( 'N', 'N', n, sheight, sheight, cone, q( 1, ilo ),ldq, qc, ldqc, &
+                        czero, work, n )
+              call stdlib_wlacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ilo ), lda, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ilo ), ldb, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_wgemm( 'N', 'N', n, swidth, swidth, cone, z( 1, ilo ),ldz, zc, ldzc, &
+                        czero, work, n )
+              call stdlib_wlacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_wlaset( 'FULL', ns+np, ns+np, czero, cone, qc, ldqc )
+              call stdlib_wlaset( 'FULL', ns+np, ns+np, czero, cone, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -1
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_wlaqz1( .true., .true., k+i+j, istartb, istopb, ihi,a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, czero, work,sheight )
+                 call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, czero, work,sheight )
+                 call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_wgemm( 'N', 'N', n, nblock, nblock, cone, q( 1, k+1 ),ldq, qc, ldqc, &
+                           czero, work, n )
+                 call stdlib_wlacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, k ), lda, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, k ), ldb, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_wgemm( 'N', 'N', n, nblock, nblock, cone, z( 1, k ),ldz, zc, ldzc, &
+                           czero, work, n )
+                 call stdlib_wlacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_wlaset( 'FULL', ns, ns, czero, cone, qc, ldqc )
+           call stdlib_wlaset( 'FULL', ns+1, ns+1, czero, cone, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i, ihi-1
+                 call stdlib_wlaqz1( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, czero, work, sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_wgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, czero, work, sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_wgemm( 'N', 'N', n, ns, ns, cone, q( 1, ihi-ns+1 ), ldq,qc, ldqc, czero,&
+                         work, n )
+              call stdlib_wlacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ihi-ns ), &
+                        lda, zc, ldzc, czero, work,sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_wgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ihi-ns ), &
+                        ldb, zc, ldzc, czero, work,sheight )
+              call stdlib_wlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+              call stdlib_wgemm( 'N', 'N', n, ns+1, ns+1, cone, z( 1, ihi-ns ), ldz,zc, ldzc, &
+                        czero, work, n )
+              call stdlib_wlacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_wlaqz3
+
+     !> ZLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_wlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(qp), intent(in) :: gaptol, lambda, pivmin
+           real(qp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(qp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(qp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real
+           ! Executable Statements 
+           eps = stdlib_qlamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_qisnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_qisnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_qisnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = cone
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + real( z( i )*z( i ),KIND=qp)
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + real( z( i )*z( i ),KIND=qp)
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=qp)
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=qp)
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_wlar1v
+
+     !> ZLAR2V: applies a vector of complex plane rotations with real cosines
+     !> from both sides to a sequence of 2-by-2 complex Hermitian matrices,
+     !> defined by the elements of the vectors x, y and z. For i = 1,2,...,n
+     !> (       x(i)  z(i) ) :=
+     !> ( conjg(z(i)) y(i) )
+     !> (  c(i) conjg(s(i)) ) (       x(i)  z(i) ) ( c(i) -conjg(s(i)) )
+     !> ( -s(i)       c(i)  ) ( conjg(z(i)) y(i) ) ( s(i)        c(i)  )
+
+     pure subroutine stdlib_wlar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*)
+           complex(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(qp) :: ci, sii, sir, t1i, t1r, t5, t6, xi, yi, zii, zir
+           complex(qp) :: si, t2, t3, t4, zi
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = real( x( ix ),KIND=qp)
+              yi = real( y( ix ),KIND=qp)
+              zi = z( ix )
+              zir = real( zi,KIND=qp)
+              zii = aimag( zi )
+              ci = c( ic )
+              si = s( ic )
+              sir = real( si,KIND=qp)
+              sii = aimag( si )
+              t1r = sir*zir - sii*zii
+              t1i = sir*zii + sii*zir
+              t2 = ci*zi
+              t3 = t2 - conjg( si )*xi
+              t4 = conjg( t2 ) + si*yi
+              t5 = ci*xi + t1r
+              t6 = ci*yi - t1r
+              x( ix ) = ci*t5 + ( sir*real( t4,KIND=qp)+sii*aimag( t4 ) )
+              y( ix ) = ci*t6 - ( sir*real( t3,KIND=qp)-sii*aimag( t3 ) )
+              z( ix ) = ci*t3 + conjg( si )*cmplx( t6, t1i,KIND=qp)
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_wlar2v
+
+     !> ZLARCM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by M and real; B is M by N and complex;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_wlarcm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: b(ldb,*)
+           complex(qp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( b( i, j ),KIND=qp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_qgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( b( i, j ) )
+              end do
+           end do
+           call stdlib_qgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=qp),rwork( l+( j-1 )*m+i-1 ),KIND=qp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_wlarcm
+
+     !> ZLARF: applies a complex elementary reflector H to a complex M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H, supply conjg(tau) instead
+     !> tau.
+
+     pure subroutine stdlib_wlarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           complex(qp), intent(in) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(in) :: v(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=czero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-czero row in v.
+              do while( lastv>0 .and. v( i )==czero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-czero column in c(1:lastv,:).
+                 lastc = stdlib_ilawlc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-czero row in c(:,1:lastv).
+                 lastc = stdlib_ilawlr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_qp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**h * v(1:lastv,1)
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', lastv, lastc, cone,c, ldc, v, incv, &
+                           czero, work, 1 )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**h
+                 call stdlib_wgerc( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_wgemv( 'NO TRANSPOSE', lastc, lastv, cone, c, ldc,v, incv, czero, &
+                           work, 1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**h
+                 call stdlib_wgerc( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_wlarf
+
+     !> ZLARFB: applies a complex block reflector H or its transpose H**H to a
+     !> complex M-by-N matrix C, from either the left or the right.
+
+     pure subroutine stdlib_wlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_wcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_wlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h * v2
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c( k+1, 1 ), ldc,v( k+1, 1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v( k+1, 1 ), ldv, work,ldwork, cone, c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_wcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v( k+1, 1 ),ldv, cone, c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_wcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_wlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( m-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v, ldv, work, ldwork,cone, c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( m-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_wcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( n-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c, ldc, &
+                                 v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v, ldv, cone,c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( n-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_wcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_wlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h * v2**h
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone,c( k+1, 1 ), ldc, v( 1, k+1 ), ldv, cone,work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2**h * w**h
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone,v( 1, k+1 ), ldv, work, ldwork, cone,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_wcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c( 1, k+1 ), ldc,v( 1, k+1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v( 1, k+1 ), ldv, cone,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_wcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_wlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( 1, m-k+1 ), ldv, work,ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1**h
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone, c,ldc, v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1**h * w**h
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone, v,ldv, work, ldwork, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_wcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( 1, n-k+1 ), ldv, work,ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**h
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v, ldv, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wlarfb
+
+     !> ZLARFB_GETT: applies a complex Householder block reflector H from the
+     !> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_wlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(in) :: t(ldt,*)
+           complex(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_wcopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**h) * w2 = (a1**h) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_wtrmm( 'L', 'L', 'C', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**h) * b2 = w2 + (b1**h) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_wgemm( 'C', 'N', k, n-k, m, cone, b, ldb,b( 1, k+1 ), ldb, cone, &
+                           work, ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_wtrmm( 'L', 'U', 'N', 'N', k, n-k, cone, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_wgemm( 'N', 'N', m, n-k, k, -cone, b, ldb,work, ldwork, cone, b( 1, &
+                           k+1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_wtrmm( 'L', 'L', 'N', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_wcopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = czero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**h) * w1 = (a1**h) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_wtrmm( 'L', 'L', 'C', 'U', k, k, cone, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_wtrmm( 'L', 'U', 'N', 'N', k, k, cone, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', m, k, -cone, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_wtrmm( 'L', 'L', 'N', 'U', k, k, cone, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_wlarfb_gett
+
+     !> ZLARFG: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, with beta real, and x is an
+     !> (n-1)-element complex vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+     !> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+
+     pure subroutine stdlib_wlarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(qp), intent(inout) :: alpha
+           complex(qp), intent(out) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(qp) :: alphi, alphr, beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_qznrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=qp)
+           alphi = aimag( alpha )
+           if( xnorm==zero .and. alphi==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_qlapy3( alphr, alphi, xnorm ), alphr )
+              safmin = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'E' )
+              rsafmn = one / safmin
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_wdscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alphi = alphi*rsafmn
+                 alphr = alphr*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_qznrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=qp)
+                 beta = -sign( stdlib_qlapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              tau = cmplx( ( beta-alphr ) / beta, -alphi / beta,KIND=qp)
+              alpha = stdlib_wladiv( cmplx( one,KIND=qp), alpha-beta )
+              call stdlib_wscal( n-1, alpha, x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_wlarfg
+
+     !> ZLARFGP: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is real and non-negative, and
+     !> x is an (n-1)-element complex vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+
+     subroutine stdlib_wlarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(qp), intent(inout) :: alpha
+           complex(qp), intent(out) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(qp) :: alphi, alphr, beta, bignum, smlnum, xnorm
+           complex(qp) :: savealpha
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_qznrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=qp)
+           alphi = aimag( alpha )
+           if( xnorm==zero ) then
+              ! h  =  [1-alpha/abs(alpha) 0; 0 i], sign chosen so alpha >= 0.
+              if( alphi==zero ) then
+                 if( alphr>=zero ) then
+                    ! when tau.eq.zero, the vector is special-cased to be
+                    ! all zeros in the application routines.  we do not need
+                    ! to clear it.
+                    tau = zero
+                 else
+                    ! however, the application routines rely on explicit
+                    ! zero checks when tau.ne.zero, and we must clear x.
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    alpha = -alpha
+                 end if
+              else
+                 ! only "reflecting" the diagonal entry to be real and non-negative.
+                 xnorm = stdlib_qlapy2( alphr, alphi )
+                 tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=qp)
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = zero
+                 end do
+                 alpha = xnorm
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_qlapy3( alphr, alphi, xnorm ), alphr )
+              smlnum = stdlib_qlamch( 'S' ) / stdlib_qlamch( 'E' )
+              bignum = one / smlnum
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_wdscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alphi = alphi*bignum
+                 alphr = alphr*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_qznrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=qp)
+                 beta = sign( stdlib_qlapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alphr = alphi * (alphi/real( alpha,KIND=qp))
+                 alphr = alphr + xnorm * (xnorm/real( alpha,KIND=qp))
+                 tau = cmplx( alphr/beta, -alphi/beta,KIND=qp)
+                 alpha = cmplx( -alphr, alphi,KIND=qp)
+              end if
+              alpha = stdlib_wladiv( cmplx( one,KIND=qp), alpha )
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero (or two or whatever makes a nonnegative real number for beta).
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 alphr = real( savealpha,KIND=qp)
+                 alphi = aimag( savealpha )
+                 if( alphi==zero ) then
+                    if( alphr>=zero ) then
+                       tau = zero
+                    else
+                       tau = two
+                       do j = 1, n-1
+                          x( 1 + (j-1)*incx ) = zero
+                       end do
+                       beta = real( -savealpha,KIND=qp)
+                    end if
+                 else
+                    xnorm = stdlib_qlapy2( alphr, alphi )
+                    tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=qp)
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    beta = xnorm
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_wscal( n-1, alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_wlarfgp
+
+     !> ZLARFT: forms the triangular factor T of a complex block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+
+     pure subroutine stdlib_wlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(qp), intent(out) :: t(ldt,*)
+           complex(qp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( prevlastv, i )
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * conjg( v( i , j ) )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**h * v(i:j,i)
+                       call stdlib_wgemv( 'CONJUGATE TRANSPOSE', j-i, i-1,-tau( i ), v( i+1, 1 ), &
+                                 ldv,v( i+1, i ), 1, cone, t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**h
+                       call stdlib_wgemm( 'N', 'C', i-1, 1, j-i, -tau( i ),v( 1, i+1 ), ldv, v( i,&
+                                  i+1 ), ldv,cone, t( 1, i ), ldt )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_wtrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                  end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * conjg( v( n-k+i , j ) )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**h * v(j:n-k+i,i)
+                          call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-k+i-j, k-i,-tau( i ), v( j, &
+                                    i+1 ), ldv, v( j, i ),1, cone, t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**h
+                          call stdlib_wgemm( 'N', 'C', k-i, 1, n-k+i-j, -tau( i ),v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,cone, t( i+1, i ), ldt )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_wtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wlarft
+
+     !> ZLARFX: applies a complex elementary reflector H to a complex m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_wlarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           complex(qp), intent(in) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(in) :: v(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           complex(qp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, &
+                     v6, v7, v8, v9
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( tau==czero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_wlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              v10 = conjg( v( 10 ) )
+              t10 = tau*conjg( v10 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_wlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              v10 = v( 10 )
+              t10 = tau*conjg( v10 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+           410 continue
+           return
+     end subroutine stdlib_wlarfx
+
+     !> ZLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n Hermitian matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_wlarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           complex(qp), intent(in) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(in) :: v(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(qp) :: alpha
+           ! Executable Statements 
+           if( tau==czero )return
+           ! form  w:= c * v
+           call stdlib_whemv( uplo, n, cone, c, ldc, v, incv, czero, work, 1 )
+           alpha = -chalf*tau*stdlib_wdotc( n, work, 1, v, incv )
+           call stdlib_waxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_wher2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_wlarfy
+
+     !> ZLARGV: generates a vector of complex plane rotations with real
+     !> cosines, determined by elements of the complex vectors x and y.
+     !> For i = 1,2,...,n
+     !> (        c(i)   s(i) ) ( x(i) ) = ( r(i) )
+     !> ( -conjg(s(i))  c(i) ) ( y(i) ) = (   0  )
+     !> where c(i)**2 + ABS(s(i))**2 = 1
+     !> The following conventions are used (these are the same as in ZLARTG,
+     !> but differ from the BLAS1 routine ZROTG):
+     !> If y(i)=0, then c(i)=1 and s(i)=0.
+     !> If x(i)=0, then c(i)=0 and s(i) is chosen so that r(i) is real.
+
+     pure subroutine stdlib_wlargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(out) :: c(*)
+           complex(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i, ic, ix, iy, j
+           real(qp) :: cs, d, di, dr, eps, f2, f2s, g2, g2s, safmin, safmn2, safmx2, scale
+           complex(qp) :: f, ff, fs, g, gs, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,log,max,sqrt
+           ! Statement Functions 
+           real(qp) :: abs1, abssq
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Statement Function Definitions 
+           abs1( ff ) = max( abs( real( ff,KIND=qp) ), abs( aimag( ff ) ) )
+           abssq( ff ) = real( ff,KIND=qp)**2 + aimag( ff )**2
+           ! Executable Statements 
+           ! if( first ) then
+              ! first = .false.
+              safmin = stdlib_qlamch( 'S' )
+              eps = stdlib_qlamch( 'E' )
+              safmn2 = stdlib_qlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_qlamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+           ! end if
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_60: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              ! use identical algorithm as in stdlib_wlartg
+              scale = max( abs1( f ), abs1( g ) )
+              fs = f
+              gs = g
+              count = 0
+              if( scale>=safmx2 ) then
+              10 continue
+                 count = count + 1
+                 fs = fs*safmn2
+                 gs = gs*safmn2
+                 scale = scale*safmn2
+                 if( scale>=safmx2 .and. count < 20 )go to 10
+              else if( scale<=safmn2 ) then
+                 if( g==czero ) then
+                    cs = one
+                    sn = czero
+                    r = f
+                    go to 50
+                 end if
+                 20 continue
+                 count = count - 1
+                 fs = fs*safmx2
+                 gs = gs*safmx2
+                 scale = scale*safmx2
+                 if( scale<=safmn2 )go to 20
+              end if
+              f2 = abssq( fs )
+              g2 = abssq( gs )
+              if( f2<=max( g2, one )*safmin ) then
+                 ! this is a rare case: f is very small.
+                 if( f==czero ) then
+                    cs = zero
+                    r = stdlib_qlapy2( real( g,KIND=qp), aimag( g ) )
+                    ! do complex/real division explicitly with two real
+                    ! divisions
+                    d = stdlib_qlapy2( real( gs,KIND=qp), aimag( gs ) )
+                    sn = cmplx( real( gs,KIND=qp) / d, -aimag( gs ) / d,KIND=qp)
+                    go to 50
+                 end if
+                 f2s = stdlib_qlapy2( real( fs,KIND=qp), aimag( fs ) )
+                 ! g2 and g2s are accurate
+                 ! g2 is at least safmin, and g2s is at least safmn2
+                 g2s = sqrt( g2 )
+                 ! error in cs from underflow in f2s is at most
+                 ! unfl / safmn2 .lt. sqrt(unfl*eps) .lt. eps
+                 ! if max(g2,one)=g2, then f2 .lt. g2*safmin,
+                 ! and so cs .lt. sqrt(safmin)
+                 ! if max(g2,one)=one, then f2 .lt. safmin
+                 ! and so cs .lt. sqrt(safmin)/safmn2 = sqrt(eps)
+                 ! therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s
+                 cs = f2s / g2s
+                 ! make sure abs(ff) = 1
+                 ! do complex/real division explicitly with 2 real divisions
+                 if( abs1( f )>one ) then
+                    d = stdlib_qlapy2( real( f,KIND=qp), aimag( f ) )
+                    ff = cmplx( real( f,KIND=qp) / d, aimag( f ) / d,KIND=qp)
+                 else
+                    dr = safmx2*real( f,KIND=qp)
+                    di = safmx2*aimag( f )
+                    d = stdlib_qlapy2( dr, di )
+                    ff = cmplx( dr / d, di / d,KIND=qp)
+                 end if
+                 sn = ff*cmplx( real( gs,KIND=qp) / g2s, -aimag( gs ) / g2s,KIND=qp)
+                 r = cs*f + sn*g
+              else
+                 ! this is the most common case.
+                 ! neither f2 nor f2/g2 are less than safmin
+                 ! f2s cannot overflow, and it is accurate
+                 f2s = sqrt( one+g2 / f2 )
+                 ! do the f2s(real)*fs(complex) multiply with two real
+                 ! multiplies
+                 r = cmplx( f2s*real( fs,KIND=qp), f2s*aimag( fs ),KIND=qp)
+                 cs = one / f2s
+                 d = f2 + g2
+                 ! do complex/real division explicitly with two real divisions
+                 sn = cmplx( real( r,KIND=qp) / d, aimag( r ) / d,KIND=qp)
+                 sn = sn*conjg( gs )
+                 if( count/=0 ) then
+                    if( count>0 ) then
+                       do j = 1, count
+                          r = r*safmx2
+                       end do
+                    else
+                       do j = 1, -count
+                          r = r*safmn2
+                       end do
+                    end if
+                 end if
+              end if
+              50 continue
+              c( ic ) = cs
+              y( iy ) = sn
+              x( ix ) = r
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_60
+           return
+     end subroutine stdlib_wlargv
+
+     !> ZLARNV: returns a vector of n random complex numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_wlarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           complex(qp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(qp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, iv
+           ! Local Arrays 
+           real(qp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,exp,log,min,sqrt
+           ! Executable Statements 
+           do 60 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              ! call stdlib_qlaruv to generate 2*il realnumbers from a uniform (0,1,KIND=qp)
+              ! distribution (2*il <= lv)
+              call stdlib_qlaruv( iseed, 2*il, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( u( 2*i-1 ), u( 2*i ),KIND=qp)
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( two*u( 2*i-1 )-one,two*u( 2*i )-one,KIND=qp)
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*exp( cmplx( zero, twopi*u( 2*i ),&
+                              KIND=qp) )
+                 end do
+              else if( idist==4 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit disk
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( u( 2*i-1 ) )*exp( cmplx( zero, twopi*u( 2*i ),KIND=qp) )
+                              
+                 end do
+              else if( idist==5 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit circle
+                 do i = 1, il
+                    x( iv+i-1 ) = exp( cmplx( zero, twopi*u( 2*i ),KIND=qp) )
+                 end do
+              end if
+              60 continue
+           return
+     end subroutine stdlib_wlarnv
+
+     !> ZLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by DLARRE.
+
+     pure subroutine stdlib_wlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(in) :: minrgp, pivmin, vl, vu
+           real(qp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(qp), intent(in) :: gers(*)
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           integer(ilp) :: indin1, indin2
+           real(qp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           intrinsic :: cmplx
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indin1 = 3*n + 1
+           indin2 = 4*n + 1
+           indwrk = 5*n + 1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_wlaset( 'FULL', n, zusedw, czero, czero,z(1,zusedl), ldz )
+           eps = stdlib_qlamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = cmplx( one, zero,KIND=qp)
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_qcopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       do k = 1, in - 1
+                          d( ibegin+k-1 ) = real( z( ibegin+k-1,j ),KIND=qp)
+                          l( ibegin+k-1 ) = real( z( ibegin+k-1,j+1 ),KIND=qp)
+                       end do
+                       d( iend ) = real( z( iend, j ),KIND=qp)
+                       sigma = real( z( iend, j+1 ),KIND=qp)
+                       ! set the corresponding entries in z to zero
+                       call stdlib_wlaset( 'FULL', in, 2, czero, czero,z( ibegin, j), ldz )
+                                 
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_qlarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_qlarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_qlarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_qlarrf needs lwork = 2*n
+                          call stdlib_qlarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,work( indin1 ), work( indin2 ),work( indwrk ), iinfo )
+                                    
+                          ! in the complex case, stdlib_qlarrf cannot write
+                          ! the new rrr directly into z and needs an intermediate
+                          ! workspace
+                          do k = 1, in-1
+                             z( ibegin+k-1, newftt ) =cmplx( work( indin1+k-1 ), zero,KIND=qp)
+                                       
+                             z( ibegin+k-1, newftt+1 ) =cmplx( work( indin2+k-1 ), zero,KIND=qp)
+                                       
+                          end do
+                          z( iend, newftt ) =cmplx( work( indin1+in-1 ), zero,KIND=qp)
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_qlarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = cmplx( ssigma, zero,KIND=qp)
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=qp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_qlarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_wlar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_wlar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_wdscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_wlarrv
+
+     !> !
+     !>
+     !> ZLARTG: generates a plane rotation so that
+     !> [  C         S  ] . [ F ]  =  [ R ]
+     !> [ -conjg(S)  C  ]   [ G ]     [ 0 ]
+     !> where C is real and C**2 + |S|**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> sgn(x) = {  x / |x|,   x != 0
+     !> {  1,         x = 0
+     !> R = sgn(F) * sqrt(|F|**2 + |G|**2)
+     !> C = |F| / sqrt(|F|**2 + |G|**2)
+     !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2)
+     !> When F and G are real, the formulas simplify to C = F/R and
+     !> S = G/R, and the returned values of C, S, and R should be
+     !> identical to those returned by DLARTG.
+     !> The algorithm used to compute these quantities incorporates scaling
+     !> to avoid overflow or underflow in computing the square root of the
+     !> sum of squares.
+     !> This is a faster version of the BLAS1 routine ZROTG, except for
+     !> the following differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0, then C=0 and S is chosen so that R is real.
+     !> Below, wp=>dp stands for quad precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_wlartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(qp), intent(out) :: c
+        complex(qp), intent(in) :: f, g
+        complex(qp), intent(out) :: r, s
+        ! Local Scalars 
+        real(qp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(qp) :: fs, gs, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(qp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=qp)**2 + aimag( t )**2
+        ! Executable Statements 
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=qp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=qp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=qp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        return
+     end subroutine stdlib_wlartg
+
+     !> ZLARTV: applies a vector of complex plane rotations with real cosines
+     !> to elements of the complex vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (        c(i)   s(i) ) ( x(i) )
+     !> ( y(i) )    ( -conjg(s(i))  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_wlartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*)
+           complex(qp), intent(in) :: s(*)
+           complex(qp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           complex(qp) :: xi, yi
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - conjg( s( ic ) )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_wlartv
+
+     !> ZLARZ: applies a complex elementary reflector H to a complex
+     !> M-by-N matrix C, from either the left or the right. H is represented
+     !> in the form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+     !> tau.
+     !> H is a product of k elementary reflectors as returned by ZTZRZF.
+
+     pure subroutine stdlib_wlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           complex(qp), intent(in) :: tau
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(in) :: v(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=czero ) then
+                 ! w( 1:n ) = conjg( c( 1, 1:n ) )
+                 call stdlib_wcopy( n, c, ldc, work, 1 )
+                 call stdlib_wlacgv( n, work, 1 )
+                 ! w( 1:n ) = conjg( w( 1:n ) + c( m-l+1:m, 1:n )**h * v( 1:l ) )
+                 call stdlib_wgemv( 'CONJUGATE TRANSPOSE', l, n, cone, c( m-l+1, 1 ),ldc, v, incv,&
+                            cone, work, 1 )
+                 call stdlib_wlacgv( n, work, 1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_waxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**h
+                 call stdlib_wgeru( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=czero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_wcopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_wgemv( 'NO TRANSPOSE', m, l, cone, c( 1, n-l+1 ), ldc,v, incv, cone, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_waxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**h
+                 call stdlib_wgerc( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_wlarz
+
+     !> ZLARZB: applies a complex block reflector H or its transpose H**H
+     !> to a complex distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_wlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           complex(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**h * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**h
+              do j = 1, k
+                 call stdlib_wcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**h * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_wgemm( 'TRANSPOSE', 'CONJUGATE TRANSPOSE', n, k, l,cone, c( m-&
+                        l+1, 1 ), ldc, v, ldv, cone, work,ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_wtrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, cone, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**h
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**h * w( 1:n, 1:k )**h
+              if( l>0 )call stdlib_wgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -cone, v, ldv,work, &
+                        ldwork, cone, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**h
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_wcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**h
+              if( l>0 )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, cone,c( 1, n-l+1 )&
+                        , ldc, v, ldv, cone, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * conjg( t )  or
+                              ! w( 1:m, 1:k ) * t**h
+              do j = 1, k
+                 call stdlib_wlacgv( k-j+1, t( j, j ), 1 )
+              end do
+              call stdlib_wtrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, cone, t,ldt, work, &
+                        ldwork )
+              do j = 1, k
+                 call stdlib_wlacgv( k-j+1, t( j, j ), 1 )
+              end do
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * conjg( v( 1:k, 1:l ) )
+              do j = 1, l
+                 call stdlib_wlacgv( k, v( 1, j ), 1 )
+              end do
+              if( l>0 )call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -cone,work, &
+                        ldwork, v, ldv, cone, c( 1, n-l+1 ), ldc )
+              do j = 1, l
+                 call stdlib_wlacgv( k, v( 1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_wlarzb
+
+     !> ZLARZT: forms the triangular factor T of a complex block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_wlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(qp), intent(out) :: t(ldt,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==czero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = czero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**h
+                    call stdlib_wlacgv( n, v( i, 1 ), ldv )
+                    call stdlib_wgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, czero,t( i+1, i ), 1 )
+                    call stdlib_wlacgv( n, v( i, 1 ), ldv )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_wtrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_wlarzt
+
+     !> ZLASCL: multiplies the M by N complex matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_wlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(qp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(qp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_qisnan(cfrom) ) then
+              info = -4
+           else if( stdlib_qisnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower chalf of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper chalf of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_wlascl
+
+     !> ZLASET: initializes a 2-D array A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_wlaset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           complex(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(qp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the diagonal to beta and the strictly upper triangular
+              ! part of the array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the diagonal to beta and the strictly lower triangular
+              ! part of the array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else
+              ! set the array to beta on the diagonal and alpha on the
+              ! offdiagonal.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( m, n )
+                 a( i, i ) = beta
+              end do
+           end if
+           return
+     end subroutine stdlib_wlaset
+
+     !> ZLASR: applies a sequence of real plane rotations to a complex matrix
+     !> A, from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_wlasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(qp), intent(in) :: c(*), s(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(qp) :: ctemp, stemp
+           complex(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wlasr
+
+     !> !
+     !>
+     !> ZLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_wlassq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(qp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        complex(qp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(qp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=qp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_wlassq
+
+     !> ZLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a complexx M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_wlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<=0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_wgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_wgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_wtplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1, ctr * m + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_wtplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1, ctr * m + 1), &
+                       ldt,work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_wlaswlq
+
+     !> ZLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_wlaswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           complex(qp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_wlaswp
+
+     !> ZLASYF: computes a partial factorization of a complex symmetric matrix
+     !> A using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) ( D    0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) ( 0   A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**T denotes the transpose of U.
+     !> ZLASYF is an auxiliary routine called by ZSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(qp) :: absakk, alpha, colmax, rowmax
+           complex(qp) :: d11, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_wcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_wcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w(&
+                               imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_wcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_wcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = cone / a( k, k )
+                    call stdlib_wscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_wswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_wcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_wcopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_wcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_wcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = cone / a( k, k )
+                       call stdlib_wscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_wswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlasyf
+
+     !> DLATRF_AA factorizes a panel of a complex symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_wlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(qp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_wsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_wgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_wcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_waxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_waxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_iwamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_wswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_wswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_wswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_wswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_wcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_wcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_wscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_wlaset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_wsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_wgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_wcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_waxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_waxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_iwamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_wswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_wswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_wswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_wswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_wcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_wcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_wscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_wlaset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_wlasyf_aa
+
+     !> ZLASYF_RK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLASYF_RK is an auxiliary routine called by ZSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, rowmax, sfmin, dtemp
+           complex(qp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_wcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_wcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_wcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_wcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_wswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_wswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_wcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_wcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_wswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_wscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_wcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_wcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_wcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_wcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_wswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_wswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_wcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_wcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_wswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_wscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlasyf_rk
+
+     !> ZLASYF_ROOK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLASYF_ROOK is an auxiliary routine called by ZSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_wlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(qp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(qp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt,aimag,real
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_wcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_wcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_wcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_wgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_wcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_wcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_wswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_wswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_wcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_wcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_wswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_wswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_wcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_wscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_wswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_wswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_wcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_wcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_wcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_wgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_wcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_wcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_wcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_wswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_wswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_wcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_wcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_wswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_wswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_wcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_wscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_wgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ), ldw,cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_wswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_wswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_wlasyf_rook
+
+     !> ZLAT2C: converts a COMPLEX*16 triangular matrix, SA, to a COMPLEX
+     !> triangular matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> ZLAT2C checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_wlat2c( uplo, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, n
+           ! Array Arguments 
+           complex(dp), intent(out) :: sa(ldsa,*)
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(qp) :: rmax
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,aimag
+           ! Executable Statements 
+           rmax = stdlib_dlamch( 'O' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    if( ( real( a( i, j ),KIND=qp)<-rmax ) .or.( real( a( i, j ),KIND=qp)>rmax ) &
+                    .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) &
+                              then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    if( ( real( a( i, j ),KIND=qp)<-rmax ) .or.( real( a( i, j ),KIND=qp)>rmax ) &
+                    .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) &
+                              then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           50 continue
+           return
+     end subroutine stdlib_wlat2c
+
+     !> ZLATBS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine ZTBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_wlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(inout) :: cnorm(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(qp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(qp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=qp) / 2._qp ) +abs( aimag( zdum ) / 2._qp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_qzasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_qzasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_wtbsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_wtbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_wdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_wdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_waxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_iwamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_waxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_iwamax( n-j, x( j+1 ), 1 )
+                       xmax = cabs1( x( i ) )
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotu to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_wdotu( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_wdotu( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotc to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_wdotc( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_wdotc( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( kd+i-jlen, j ) )*uscal )*x( j-jlen-1+i )
+                                       
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( i+1, j ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_wlatbs
+
+     !> ZLATDF: computes the contribution to the reciprocal Dif-estimate
+     !> by solving for x in Z * x = b, where b is chosen such that the norm
+     !> of x is as large as possible. It is assumed that LU decomposition
+     !> of Z has been computed by ZGETC2. On entry RHS = f holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by ZGETC2 has the form
+     !> Z = P * L * U * Q, where P and Q are permutation matrices. L is lower
+     !> triangular with unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_wlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(qp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(qp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(qp) :: rtemp, scale, sminu, splus
+           complex(qp) :: bm, bp, pmone, temp
+           ! Local Arrays 
+           real(qp) :: rwork(maxdim)
+           complex(qp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_wlaswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -cone
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + cone
+                 bm = rhs( j ) - cone
+                 splus = one
+                 ! lockahead for l- part rhs(1:n-1) = +-1
+                 ! splus and smin computed more efficiently than in bsolve[1].
+                 splus = splus + real( stdlib_wdotc( n-j, z( j+1, j ), 1, z( j+1,j ), 1 ),KIND=qp)
+                           
+                 sminu = real( stdlib_wdotc( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 ),KIND=qp)
+                 splus = splus*real( rhs( j ),KIND=qp)
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens we
+                    ! choose -1, thereafter +1. this is a simple way to get
+                    ! good estimates of matrices like byers well-known example
+                    ! (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = cone
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_waxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u- part, lockahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_wcopy( n-1, rhs, 1, work, 1 )
+              work( n ) = rhs( n ) + cone
+              rhs( n ) = rhs( n ) - cone
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = cone / z( i, i )
+                 work( i ) = work( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    work( i ) = work( i ) - work( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( work( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_wcopy( n, work, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_wlaswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_wlassq( n, rhs, 1, rdscal, rdsum )
+              return
+           end if
+           ! entry ijob = 2
+           ! compute approximate nullvector xm of z
+           call stdlib_wgecon( 'I', n, z, ldz, one, rtemp, work, rwork, info )
+           call stdlib_wcopy( n, work( n+1 ), 1, xm, 1 )
+           ! compute rhs
+           call stdlib_wlaswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+           temp = cone / sqrt( stdlib_wdotc( n, xm, 1, xm, 1 ) )
+           call stdlib_wscal( n, temp, xm, 1 )
+           call stdlib_wcopy( n, xm, 1, xp, 1 )
+           call stdlib_waxpy( n, cone, rhs, 1, xp, 1 )
+           call stdlib_waxpy( n, -cone, xm, 1, rhs, 1 )
+           call stdlib_wgesc2( n, z, ldz, rhs, ipiv, jpiv, scale )
+           call stdlib_wgesc2( n, z, ldz, xp, ipiv, jpiv, scale )
+           if( stdlib_qzasum( n, xp, 1 )>stdlib_qzasum( n, rhs, 1 ) )call stdlib_wcopy( n, xp, 1, &
+                     rhs, 1 )
+           ! compute the sum of squares
+           call stdlib_wlassq( n, rhs, 1, rdscal, rdsum )
+           return
+     end subroutine stdlib_wlatdf
+
+     !> ZLATPS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, A**H denotes the conjugate transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine ZTPSV is called. If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_wlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(inout) :: cnorm(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(qp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(qp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=qp) / 2._qp ) +abs( aimag( zdum ) / 2._qp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_qzasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_qzasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_wtpsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_wtpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_wdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_wdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_waxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_iwamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_waxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_iwamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_wdotu( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_wdotu( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_wdotc( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_wdotc( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( ap( ip-j+i ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( conjg( ap( ip+i ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_wlatps
+
+     !> ZLATRD: reduces NB rows and columns of a complex Hermitian matrix A to
+     !> Hermitian tridiagonal form by a unitary similarity
+     !> transformation Q**H * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by ZHETRD.
+
+     pure subroutine stdlib_wlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(qp), intent(out) :: e(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    a( i, i ) = real( a( i, i ),KIND=qp)
+                    call stdlib_wlacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_wgemv( 'NO TRANSPOSE', i, n-i, -cone, a( 1, i+1 ),lda, w( i, iw+1 &
+                              ), ldw, cone, a( 1, i ), 1 )
+                    call stdlib_wlacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_wgemv( 'NO TRANSPOSE', i, n-i, -cone, w( 1, iw+1 ),ldw, a( i, i+1 &
+                              ), lda, cone, a( 1, i ), 1 )
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i ) = real( a( i, i ),KIND=qp)
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    alpha = a( i-1, i )
+                    call stdlib_wlarfg( i-1, alpha, a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = real( alpha,KIND=qp)
+                    a( i-1, i ) = cone
+                    ! compute w(1:i-1,i)
+                    call stdlib_whemv( 'UPPER', i-1, cone, a, lda, a( 1, i ), 1,czero, w( 1, iw ),&
+                               1 )
+                    if( i<n ) then
+                       call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,w( 1, iw+1 ), ldw,&
+                                  a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_wgemv( 'NO TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                       call stdlib_wgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,a( 1, i+1 ), lda, &
+                                 a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_wgemv( 'NO TRANSPOSE', i-1, n-i, -cone,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_wscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -chalf*tau( i-1 )*stdlib_wdotc( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_waxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              loop_20: do i = 1, nb
+                 ! update a(i:n,i)
+                 a( i, i ) = real( a( i, i ),KIND=qp)
+                 call stdlib_wlacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, cone, a( i, i ), 1 )
+                 call stdlib_wlacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_wgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, cone, a( i, i ), 1 )
+                 call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                 a( i, i ) = real( a( i, i ),KIND=qp)
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    alpha = a( i+1, i )
+                    call stdlib_wlarfg( n-i, alpha, a( min( i+2, n ), i ), 1,tau( i ) )
+                    e( i ) = real( alpha,KIND=qp)
+                    a( i+1, i ) = cone
+                    ! compute w(i+1:n,i)
+                    call stdlib_whemv( 'LOWER', n-i, cone, a( i+1, i+1 ), lda,a( i+1, i ), 1, &
+                              czero, w( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,w( i+1, 1 ), ldw, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-i, i-1, -cone, a( i+1, 1 ),lda, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-i, i-1, -cone, w( i+1, 1 ),ldw, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_wscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -chalf*tau( i )*stdlib_wdotc( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_waxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_wlatrd
+
+     !> ZLATRS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, A**H denotes the
+     !> conjugate transpose of A, x and b are n-element vectors, and s is a
+     !> scaling factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> ZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_wlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           real(qp), intent(inout) :: cnorm(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(qp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(qp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=qp) / 2._qp ) +abs( aimag( zdum ) / 2._qp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_qlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_qzasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_qzasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_iqamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_qscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_wtrsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_wtrsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_wdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_wdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_waxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_iwamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_waxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_iwamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_wdotu( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_wdotu( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_wladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_wdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=qp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_wdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_wdotc( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_wdotc( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=qp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_wdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_wdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_wladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_wladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_qscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_wlatrs
+
+     !> ZLATRZ: factors the M-by-(M+L) complex upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z by means
+     !> of unitary transformations, where  Z is an (M+L)-by-(M+L) unitary
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_wlatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_wlacgv( l, a( i, n-l+1 ), lda )
+              alpha = conjg( a( i, i ) )
+              call stdlib_wlarfg( l+1, alpha, a( i, n-l+1 ), lda, tau( i ) )
+              tau( i ) = conjg( tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_wlarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,conjg( tau( i ) ), a( &
+                        1, i ), lda, work )
+              a( i, i ) = conjg( alpha )
+           end do
+           return
+     end subroutine stdlib_wlatrz
+
+     !> ZLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a complex M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_wlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_wgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_wgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_wtpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1, ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_wtpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1,ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_wlatsqr
+
+     !> ZLAUNHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine ZLAUNHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_wlaunhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUNHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZLAUNHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_wlaunhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_wlaunhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wlaunhr_col_getrfnp
+
+     !> ZLAUNHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> ZLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine ZLAUNHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, ZLAUNHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without ZLAUNHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_wlaunhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(qp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           complex(qp) :: z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUNHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=qp) ),KIND=qp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=qp) ),KIND=qp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_qlamch('S')
+              ! construct the subdiagonal elements of l
+              if( cabs1( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_wscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_wlaunhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_wtrsm( 'R', 'U', 'N', 'N', m-n1, n1, cone, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_wtrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_wgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_wlaunhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_wlaunhr_col_getrfnp2
+
+     !> ZLAUU2: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wlauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(qp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**h.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=qp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_wdotc( n-i, a( i, i+1 ), lda,a( i, i+1 ), &
+                              lda ),KIND=qp)
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_wgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, cmplx( aii,KIND=qp),a( 1, i ), 1 )
+                    call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                 else
+                    call stdlib_wdscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**h * l.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=qp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_wdotc( n-i, a( i+1, i ), 1,a( i+1, i ), 1 )&
+                              ,KIND=qp)
+                    call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                    call stdlib_wgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1,cmplx( aii,KIND=qp), a( i, 1 ), lda )
+                    call stdlib_wlacgv( i-1, a( i, 1 ), lda )
+                 else
+                    call stdlib_wdscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wlauu2
+
+     !> ZLAUUM: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wlauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_wlauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**h.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', i-1, &
+                              ib, cone, a( i, i ), lda,a( 1, i ), lda )
+                    call stdlib_wlauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',i-1, ib, n-i-ib+1,&
+                                  cone, a( 1, i+ib ),lda, a( i, i+ib ), lda, cone, a( 1, i ),lda )
+                       call stdlib_wherk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**h * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_wtrmm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', ib, i-1,&
+                               cone, a( i, i ), lda,a( i, 1 ), lda )
+                    call stdlib_wlauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', ib,i-1, n-i-ib+1,&
+                                  cone, a( i+ib, i ), lda,a( i+ib, 1 ), lda, cone, a( i, 1 ), lda )
+                       call stdlib_wherk( 'LOWER', 'CONJUGATE TRANSPOSE', ib,n-i-ib+1, one, a( i+&
+                                 ib, i ), lda, one,a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wlauum
+
+     !> ZPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite band matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> ZPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_wlatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scalel, rwork,info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_wlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_wlatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_wlatbs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scaleu, rwork,info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iwamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_wdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_wpbcon
+
+     !> ZPBEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_wpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(out) :: s(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ab( j, 1 ),KIND=qp)
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = real( ab( j, i ),KIND=qp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_wpbequ
+
+     !> ZPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_wpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_whbmv( uplo, n, kd, -cone, ab, ldab, x( 1, j ), 1, cone,work, 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ab( kd+1, k ),KIND=qp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ab( 1, k ),KIND=qp) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_wpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wpbrfs
+
+     !> ZPBSTF: computes a split Cholesky factorization of a complex
+     !> Hermitian positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with ZHBGST.
+     !> The factorization has the form  A = S**H*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_wpbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_wdscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_wher( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_wdscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_wlacgv( km, ab( kd, j+1 ), kld )
+                    call stdlib_wher( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_wlacgv( km, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_wdscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_wlacgv( km, ab( km+1, j-km ), kld )
+                 call stdlib_wher( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+                 call stdlib_wlacgv( km, ab( km+1, j-km ), kld )
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_wdscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_wher( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_wpbstf
+
+     !> ZPBSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_wpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_wpbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_wpbsv
+
+     !> ZPBSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wpbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(inout) :: s(*)
+           complex(qp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_wpbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_wlaqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_wcopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_wcopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_wpbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanhb( '1', uplo, n, kd, ab, ldab, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wpbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, rwork,info )
+           ! compute the solution matrix x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wpbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_wpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wpbsvx
+
+     !> ZPBTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**H is the conjugate transpose
+     !> of U, and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wpbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_wdscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_wlacgv( kn, ab( kd, j+1 ), kld )
+                    call stdlib_wher( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_wlacgv( kn, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_wdscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_wher( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_wpbtf2
+
+     !> ZPBTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_wpbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           complex(qp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'ZPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_wpbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_wpotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_wtrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i2, cone,ab( kd+1, i ), ldab-1,ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_wherk( 'UPPER', 'CONJUGATE TRANSPOSE', i2, ib,-one, ab( kd+&
+                                    1-ib, i+ib ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_wtrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i3, cone,ab( kd+1, i ), ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_wgemm( 'CONJUGATE TRANSPOSE','NO TRANSPOSE', i2, &
+                          i3, ib, -cone,ab( kd+1-ib, i+ib ), ldab-1, work,ldwork, cone, ab( 1+ib, &
+                                    i+kd ),ldab-1 )
+                          ! update a33
+                          call stdlib_wherk( 'UPPER', 'CONJUGATE TRANSPOSE', i3, ib,-one, work, &
+                                    ldwork, one,ab( kd+1, i+kd ), ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_wpotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_wtrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i2,ib, cone, ab( 1, i ), ldab-1,ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_wherk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_wtrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i3,ib, cone, ab( 1, i ), ldab-1, work,ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_wgemm( 'NO TRANSPOSE','CONJUGATE TRANSPOSE', i3, &
+                          i2, ib,-cone, work, ldwork, ab( 1+ib, i ),ldab-1, cone, ab( 1+kd-ib, i+&
+                                    ib ),ldab-1 )
+                          ! update a33
+                          call stdlib_wherk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_wpbtrf
+
+     !> ZPBTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**H *U or A = L*L**H computed by ZPBTRF.
+
+     pure subroutine stdlib_wpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              do j = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**h.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**h *x = b, overwriting b with x.
+                 call stdlib_wtbsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_wpbtrs
+
+     !> ZPFTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wpftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_wpotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'R', 'L', 'C', 'N', n2, n1, cone, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_wherk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_wpotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_wpotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'L', 'L', 'N', 'N', n1, n2, cone, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_wherk( 'U', 'C', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_wpotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_wpotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'L', 'U', 'C', 'N', n1, n2, cone, a( 0 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_wherk( 'L', 'C', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_wpotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_wpotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'R', 'U', 'N', 'N', n2, n1, cone, a( n2*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_wherk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_wpotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_wpotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'R', 'L', 'C', 'N', k, k, cone, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_wherk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_wpotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_wpotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'L', 'L', 'N', 'N', k, k, cone, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_wherk( 'U', 'C', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_wpotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_wpotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'L', 'U', 'C', 'N', k, k, cone, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_wherk( 'L', 'C', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_wpotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_wpotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_wtrsm( 'R', 'U', 'N', 'N', k, k, cone,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_wherk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_wpotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wpftrf
+
+     !> ZPFTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPFTRF.
+
+     pure subroutine stdlib_wpftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_wtftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_wlauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_wherk( 'L', 'C', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_wtrmm( 'L', 'U', 'N', 'N', n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_wlauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_wlauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_wherk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_wtrmm( 'R', 'U', 'C', 'N', n1, n2, cone, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_wlauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_wlauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_wherk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_wtrmm( 'R', 'L', 'N', 'N', n1, n2, cone, a( 1 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_wlauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_wlauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_wherk( 'U', 'C', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_wtrmm( 'L', 'L', 'C', 'N', n2, n1, cone, a( n1*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_wlauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_wlauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_wherk( 'L', 'C', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_wtrmm( 'L', 'U', 'N', 'N', k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_wlauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_wlauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_wherk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_wtrmm( 'R', 'U', 'C', 'N', k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_wlauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_wlauum( 'U', k, a( k ), k, info )
+                    call stdlib_wherk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_wtrmm( 'R', 'L', 'N', 'N', k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_wlauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_wlauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_wherk( 'U', 'C', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_wtrmm( 'L', 'L', 'C', 'N', k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_wlauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wpftri
+
+     !> ZPFTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H*U or A = L*L**H computed by ZPFTRF.
+
+     pure subroutine stdlib_wpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(0:*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_wtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_wtfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+           else
+              call stdlib_wtfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_wtfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_wpftrs
+
+     !> ZPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite matrix using the
+     !> Cholesky factorization A = U**H*U or A = L*L**H computed by ZPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wpocon( uplo, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_wlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_wlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_wlatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_wlatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iwamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_wdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_wpocon
+
+     !> ZPOEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_wpoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(out) :: s(*)
+           complex(qp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=qp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=qp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_wpoequ
+
+     !> ZPOEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from ZPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_wpoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int,real,aimag
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_qlamch( 'B' )
+           tmp = -0.5e+0_qp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=qp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=qp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_wpoequb
+
+     !> ZPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_wporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_whemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=qp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=qp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_wpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wporfs
+
+     !> ZPOSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H* U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and  L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_wposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_wpotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_wposv
+
+     !> ZPOSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(inout) :: s(*)
+           complex(qp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_wpoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_wlaqhe( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              call stdlib_wlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_wpotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanhe( '1', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wpocon( uplo, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wpotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_wporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wposvx
+
+     !> ZPOTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wpotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( a( j, j ),KIND=qp) - real( stdlib_wdotc( j-1, a( 1, j ), 1,a( 1, j ),&
+                            1 ),KIND=qp)
+                 if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_wgemv( 'TRANSPOSE', j-1, n-j, -cone, a( 1, j+1 ),lda, a( 1, j ), &
+                              1, cone, a( j, j+1 ), lda )
+                    call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_wdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( a( j, j ),KIND=qp) - real( stdlib_wdotc( j-1, a( j, 1 ), lda,a( j, 1 &
+                           ), lda ),KIND=qp)
+                 if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_wgemv( 'NO TRANSPOSE', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 )&
+                              , lda, cone, a( j+1, j ), 1 )
+                    call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_wdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_wpotf2
+
+     !> ZPOTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wpotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_wpotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**h *u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_wherk( 'UPPER', 'CONJUGATE TRANSPOSE', jb, j-1,-one, a( 1, j ), &
+                              lda, one, a( j, j ), lda )
+                    call stdlib_wpotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_wgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', jb,n-j-jb+1, j-1,&
+                                  -cone, a( 1, j ), lda,a( 1, j+jb ), lda, cone, a( j, j+jb ),lda )
+                       call stdlib_wtrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', jb, &
+                                 n-j-jb+1, cone, a( j, j ),lda, a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**h.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_wherk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_wpotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j-jb+1, jb, j-1,&
+                                  -cone, a( j+jb, 1 ),lda, a( j, 1 ), lda, cone, a( j+jb, j ),lda )
+                       call stdlib_wtrsm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', n-j-&
+                                 jb+1, jb, cone, a( j, j ),lda, a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_wpotrf
+
+     !> ZPOTRF2: computes the Cholesky factorization of a Hermitian
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then call itself to factor A22.
+
+     pure recursive subroutine stdlib_wpotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              ajj = real( a( 1, 1 ),KIND=qp)
+              if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( ajj )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_wpotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**h*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_wtrsm( 'L', 'U', 'C', 'N', n1, n2, cone,a( 1, 1 ), lda, a( 1, n1+1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_wherk( uplo, 'C', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_wpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**h
+              else
+                 ! update and scale a21
+                 call stdlib_wtrsm( 'R', 'L', 'C', 'N', n2, n1, cone,a( 1, 1 ), lda, a( n1+1, 1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_wherk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_wpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wpotrf2
+
+     !> ZPOTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPOTRF.
+
+     pure subroutine stdlib_wpotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_wtrtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**h or inv(l)**h * inv(l).
+           call stdlib_wlauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_wpotri
+
+     !> ZPOTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H * U or A = L * L**H computed by ZPOTRF.
+
+     pure subroutine stdlib_wpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              ! solve u**h *x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a*x = b where a = l*l**h.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+              ! solve l**h *x = b, overwriting b with x.
+              call stdlib_wtrsm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+           end if
+           return
+     end subroutine stdlib_wpotrs
+
+     !> ZPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite packed matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> ZPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wppcon( uplo, n, ap, anorm, rcond, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(qp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_wlatps( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_wlatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_wlatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_wlatps( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_iwamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_wdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_wppcon
+
+     !> ZPPEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_wppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(qp), intent(out) :: s(*)
+           complex(qp), intent(in) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(qp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ap( 1 ),KIND=qp)
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = real( ap( jj ),KIND=qp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = real( ap( jj ),KIND=qp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_wppequ
+
+     !> ZPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_wpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_whpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=qp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=qp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wpptrs( uplo, n, 1, afp, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_wpptrs( uplo, n, 1, afp, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wpptrs( uplo, n, 1, afp, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wpprfs
+
+     !> ZPPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_wppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_wpptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wpptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_wppsv
+
+     !> ZPPSVX: uses the Cholesky factorization A = U**H * U or A = L * L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(inout) :: s(*)
+           complex(qp), intent(inout) :: afp(*), ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(qp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_qlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_wppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_wlaqhp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h * u or a = l * l**h.
+              call stdlib_wcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_wpptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wppcon( uplo, n, afp, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wpptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_wpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, rwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wppsvx
+
+     !> ZPPTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_wpptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_wtpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',j-1, ap, &
+                           ap( jc ), 1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( ap( jj ),KIND=qp) - real( stdlib_wdotc( j-1,ap( jc ), 1, ap( jc ), 1 &
+                           ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l * l**h.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ap( jj ),KIND=qp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_wdscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_whpr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_wpptrf
+
+     !> ZPPTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPPTRF.
+
+     pure subroutine stdlib_wpptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_wtptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**h.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_whpr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = real( ap( jj ),KIND=qp)
+                 call stdlib_wdscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**h * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = real( stdlib_wdotc( n-j+1, ap( jj ), 1, ap( jj ), 1 ),KIND=qp)
+                 if( j<n )call stdlib_wtpmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j, ap( &
+                           jjn ), ap( jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_wpptri
+
+     !> ZPPTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**H * U or A = L * L**H computed by ZPPTRF.
+
+     pure subroutine stdlib_wpptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h * u.
+              do i = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_wtpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_wtpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**h.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_wtpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**h *x = y, overwriting b with x.
+                 call stdlib_wtpsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_wpptrs
+
+     !> ZPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_wpstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: ztemp
+           real(qp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           do i = 1, n
+              work( i ) = real( a( i, i ),KIND=qp)
+           end do
+           pvt = maxloc( work( 1:n ), 1 )
+           ajj = real( a( pvt, pvt ),KIND=qp)
+           if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 200
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              dstop = n * stdlib_qlamch( 'EPSILON' ) * ajj
+           else
+              dstop = tol
+           end if
+           ! set first chalf of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**h* u
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),KIND=qp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=qp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_wswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_wswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    do i = j + 1, pvt - 1
+                       ztemp = conjg( a( j, i ) )
+                       a( j, i ) = conjg( a( i, pvt ) )
+                       a( i, pvt ) = ztemp
+                    end do
+                    a( j, pvt ) = conjg( a( j, pvt ) )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_wgemv( 'TRANS', j-1, n-j, -cone, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              cone, a( j, j+1 ), lda )
+                    call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_wdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_150
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+              loop_180: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),KIND=qp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=qp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_wswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_wswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    do i = j + 1, pvt - 1
+                       ztemp = conjg( a( i, j ) )
+                       a( i, j ) = conjg( a( pvt, i ) )
+                       a( pvt, i ) = ztemp
+                    end do
+                    a( pvt, j ) = conjg( a( pvt, j ) )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_wgemv( 'NO TRANS', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 ), &
+                              lda, cone, a( j+1, j ), 1 )
+                    call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_wdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_180
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_wpstf2
+
+     !> ZPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_wpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(qp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           real(qp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(qp) :: ztemp
+           real(qp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'ZPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_wpstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 230
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              do i = 1, n
+                 work( i ) = real( a( i, i ),KIND=qp)
+              end do
+              pvt = maxloc( work( 1:n ), 1 )
+              ajj = real( a( pvt, pvt ),KIND=qp)
+              if( ajj<=zero.or.stdlib_qisnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 230
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 dstop = n * stdlib_qlamch( 'EPSILON' ) * ajj
+              else
+                 dstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**h * u
+                 loop_160: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_150: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),&
+                                       KIND=qp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=qp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_wswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_wswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          do i = j + 1, pvt - 1
+                             ztemp = conjg( a( j, i ) )
+                             a( j, i ) = conjg( a( i, pvt ) )
+                             a( i, pvt ) = ztemp
+                          end do
+                          a( j, pvt ) = conjg( a( j, pvt ) )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_wgemv( 'TRANS', j-k, n-j, -cone, a( k, j+1 ),lda, a( k, j ),&
+                                     1, cone, a( j, j+1 ),lda )
+                          call stdlib_wlacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_wdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_150
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_wherk( 'UPPER', 'CONJ TRANS', n-j+1, jb, -one,a( k, j ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_160
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+                 loop_210: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_200: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),&
+                                       KIND=qp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=qp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_qisnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_wswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_wswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          do i = j + 1, pvt - 1
+                             ztemp = conjg( a( i, j ) )
+                             a( i, j ) = conjg( a( pvt, i ) )
+                             a( pvt, i ) = ztemp
+                          end do
+                          a( pvt, j ) = conjg( a( pvt, j ) )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_wgemv( 'NO TRANS', n-j, j-k, -cone,a( j+1, k ), lda, a( j, &
+                                    k ), lda, cone,a( j+1, j ), 1 )
+                          call stdlib_wlacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_wdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_200
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_wherk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_210
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 230
+           220 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           230 continue
+           return
+     end subroutine stdlib_wpstrf
+
+     !> ZPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**H or A = U**H*D*U computed by
+     !> ZPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wptcon( n, d, e, anorm, rcond, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(qp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+           ! solve m(l) * x = e.
+           rwork( 1 ) = one
+           do i = 2, n
+              rwork( i ) = one + rwork( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**h * x = b.
+           rwork( n ) = rwork( n ) / d( n )
+           do i = n - 1, 1, -1
+              rwork( i ) = rwork( i ) / d( i ) + rwork( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_iqamax( n, rwork, 1 )
+           ainvnm = abs( rwork( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_wptcon
+
+     !> ZPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using DPTTRF and then calling ZBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band positive definite Hermitian matrix
+     !> can also be found if ZHETRD, ZHPTRD, or ZHBTRD has been used to
+     !> reduce this matrix to tridiagonal form.  (The reduction to
+     !> tridiagonal form, however, may preclude the possibility of obtaining
+     !> high relative accuracy in the small eigenvalues of the original
+     !> matrix, if these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_wpteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: z(ldz,*)
+        ! ====================================================================
+           
+           ! Local Arrays 
+           complex(qp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = cone
+              return
+           end if
+           if( icompz==2 )call stdlib_wlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! call stdlib_qpttrf to factor the matrix.
+           call stdlib_qpttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_wbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_wbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_wpteqr
+
+     !> ZPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_wptrfs( uplo, n, nrhs, d, e, df, ef, b, ldb, x, ldx,ferr, berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(in) :: d(*), df(*)
+           complex(qp), intent(in) :: b(ldb,*), e(*), ef(*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ix, j, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(qp) :: bi, cx, dx, ex, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_100: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( upper ) then
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = e( 1 )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = conjg( e( i-1 ) )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = e( i )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = conjg( e( n-1 ) )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              else
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = conjg( e( 1 ) )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = e( i-1 )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = conjg( e( i ) )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = e( n-1 )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wpttrs( uplo, n, 1, df, ef, work, n, info )
+                 call stdlib_waxpy( n, cmplx( one,KIND=qp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              ix = stdlib_iqamax( n, rwork, 1 )
+              ferr( j ) = rwork( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+              ! solve m(l) * x = e.
+              rwork( 1 ) = one
+              do i = 2, n
+                 rwork( i ) = one + rwork( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**h * x = b.
+              rwork( n ) = rwork( n ) / df( n )
+              do i = n - 1, 1, -1
+                 rwork( i ) = rwork( i ) / df( i ) +rwork( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_iqamax( n, rwork, 1 )
+              ferr( j ) = ferr( j )*abs( rwork( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_100
+           return
+     end subroutine stdlib_wptrfs
+
+     !> ZPTSV: computes the solution to a complex system of linear equations
+     !> A*X = B, where A is an N-by-N Hermitian positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**H, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_wptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*)
+           complex(qp), intent(inout) :: b(ldb,*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+           call stdlib_wpttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wpttrs( 'LOWER', n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_wptsv
+
+     !> ZPTSVX: uses the factorization A = L*D*L**H to compute the solution
+     !> to a complex system of linear equations A*X = B, where A is an
+     !> N-by-N Hermitian positive definite tridiagonal matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_wptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(qp), intent(in) :: d(*)
+           real(qp), intent(inout) :: df(*)
+           complex(qp), intent(in) :: b(ldb,*), e(*)
+           complex(qp), intent(inout) :: ef(*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+              call stdlib_qcopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_wcopy( n-1, e, 1, ef, 1 )
+              call stdlib_wpttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlanht( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wptcon( n, df, ef, anorm, rcond, rwork, info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wpttrs( 'LOWER', n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_wptrfs( 'LOWER', n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wptsvx
+
+     !> ZPTTRF: computes the L*D*L**H factorization of a complex Hermitian
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**H *D*U.
+
+     pure subroutine stdlib_wpttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*)
+           complex(qp), intent(inout) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(qp) :: eii, eir, f, g
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'ZPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**h (or u**h *d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              eir = real( e( i ),KIND=qp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=qp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+           end do
+           loop_20: do i = i4 + 1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ! solve for e(i) and d(i+1).
+              eir = real( e( i ),KIND=qp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=qp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+              if( d( i+1 )<=zero ) then
+                 info = i + 1
+                 go to 30
+              end if
+              ! solve for e(i+1) and d(i+2).
+              eir = real( e( i+1 ),KIND=qp)
+              eii = aimag( e( i+1 ) )
+              f = eir / d( i+1 )
+              g = eii / d( i+1 )
+              e( i+1 ) = cmplx( f, g,KIND=qp)
+              d( i+2 ) = d( i+2 ) - f*eir - g*eii
+              if( d( i+2 )<=zero ) then
+                 info = i + 2
+                 go to 30
+              end if
+              ! solve for e(i+2) and d(i+3).
+              eir = real( e( i+2 ),KIND=qp)
+              eii = aimag( e( i+2 ) )
+              f = eir / d( i+2 )
+              g = eii / d( i+2 )
+              e( i+2 ) = cmplx( f, g,KIND=qp)
+              d( i+3 ) = d( i+3 ) - f*eir - g*eii
+              if( d( i+3 )<=zero ) then
+                 info = i + 3
+                 go to 30
+              end if
+              ! solve for e(i+3) and d(i+4).
+              eir = real( e( i+3 ),KIND=qp)
+              eii = aimag( e( i+3 ) )
+              f = eir / d( i+3 )
+              g = eii / d( i+3 )
+              e( i+3 ) = cmplx( f, g,KIND=qp)
+              d( i+4 ) = d( i+4 ) - f*eir - g*eii
+           end do loop_20
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           30 continue
+           return
+     end subroutine stdlib_wpttrf
+
+     !> ZPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H *D* U or A = L*D*L**H computed by ZPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_wpttrs( uplo, n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: iuplo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           upper = ( uplo=='U' .or. uplo=='U' )
+           if( .not.upper .and. .not.( uplo=='L' .or. uplo=='L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'ZPTTRS', uplo, n, nrhs, -1, -1 ) )
+           end if
+           ! decode uplo
+           if( upper ) then
+              iuplo = 1
+           else
+              iuplo = 0
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_wptts2( iuplo, n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_wptts2( iuplo, n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_wpttrs
+
+     !> ZPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H *D*U or A = L*D*L**H computed by ZPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_wptts2( iuplo, n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: iuplo, ldb, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(in) :: d(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_wdscal( nrhs, 1._qp / d( 1 ), b, ldb )
+              return
+           end if
+           if( iuplo==1 ) then
+              ! solve a * x = b using the factorization a = u**h *d*u,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 10 continue
+                 ! solve u**h * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                 end do
+                 ! solve d * u * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*e( i )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**h * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                    end do
+                    ! solve d * u * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+                    end do
+                 end do
+              end if
+           else
+              ! solve a * x = b using the factorization a = l*d*l**h,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 80 continue
+                 ! solve l * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                 end do
+                 ! solve d * l**h * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*conjg( e( i ) )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 80
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                    end do
+                    ! solve d * l**h * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) -b( i+1, j )*conjg( e( i ) )
+                    end do
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wptts2
+
+     !> ZROT:   applies a plane rotation, where the cos (C) is real and the
+     !> sin (S) is complex, and the vectors CX and CY are complex.
+
+     pure subroutine stdlib_wrot( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(qp), intent(in) :: c
+           complex(qp), intent(in) :: s
+           ! Array Arguments 
+           complex(qp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(qp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              stemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - conjg( s )*cx( ix )
+              cx( ix ) = stemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              stemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - conjg( s )*cx( i )
+              cx( i ) = stemp
+           end do
+           return
+     end subroutine stdlib_wrot
+
+     !> ZSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric packed matrix A using the
+     !> factorization A = U*D*U**T or A = L*D*L**T computed by ZSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wspcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_wsptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_wspcon
+
+     !> ZSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_wspmv( uplo, n, alpha, ap, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*), x(*)
+           complex(qp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           complex(qp) :: temp1, temp2
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 6
+           else if( incy==0 ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when ap contains the upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    k = kk
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do k = kk, kk + j - 2
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  y  when ap contains the lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*ap( kk )
+                    k = kk + 1
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                    kk = kk + ( n-j+1 )
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*ap( kk )
+                    ix = jx
+                    iy = jy
+                    do k = kk + 1, kk + n - j
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + ( n-j+1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wspmv
+
+     !> ZSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_wspr( uplo, n, alpha, x, incx, ap )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, n
+           complex(qp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           complex(qp) :: temp
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when upper triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       k = kk
+                       do i = 1, j - 1
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( j )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do k = kk, kk + j - 2
+                          ap( k ) = ap( k ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( jx )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    jx = jx + incx
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       ap( kk ) = ap( kk ) + temp*x( j )
+                       k = kk + 1
+                       do i = j + 1, n
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    kk = kk + n - j + 1
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ap( kk ) = ap( kk ) + temp*x( jx )
+                       ix = jx
+                       do k = kk + 1, kk + n - j
+                          ix = ix + incx
+                          ap( k ) = ap( k ) + x( ix )*temp
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    jx = jx + incx
+                    kk = kk + n - j + 1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wspr
+
+     !> ZSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_wsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_wspmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_wsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wsprfs
+
+     !> ZSPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_wspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_wsptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_wspsv
+
+     !> ZSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(inout) :: afp(*)
+           complex(qp), intent(in) :: ap(*), b(ldb,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_wcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_wsptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlansp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wspcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wsptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_wsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_wspsvx
+
+     !> ZSPTRF: computes the factorization of a complex symmetric matrix A
+     !> stored in packed format using the Bunch-Kaufman diagonal pivoting
+     !> method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_wsptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(qp) :: absakk, alpha, colmax, rowmax
+           complex(qp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_wswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / ap( kc+k-1 )
+                    call stdlib_wspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_wscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / ap( kc )
+                       call stdlib_wspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_wscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_wsptrf
+
+     !> ZSPTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSPTRF.
+
+     pure subroutine stdlib_wsptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           complex(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = cone / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_wdotu( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+k-1 )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_wdotu( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_wdotu( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_wcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_wdotu( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_wswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = cone / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - stdlib_wdotu( n-k, work, 1, ap( kc+1 ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+1 )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_wdotu( n-k, work, 1, ap( kc+1 ),1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_wdotu( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_wcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_wspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_wdotu( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_wswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_wsptri
+
+     !> ZSPTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSPTRF.
+
+     pure subroutine stdlib_wsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_wgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_wgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,ap( kc+k ), 1, cone, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc+1 ),&
+                               1, cone, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_wsptrs
+
+     !> ZSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLAED3 for details.
+
+     pure subroutine stdlib_wstedc( compz, n, d, e, z, ldz, work, lwork, rwork,lrwork, iwork, &
+               liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, ll, lrwmin, lwmin, m, smlsiz,&
+                      start
+           real(qp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max,mod,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'ZSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else if( n<=smlsiz ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 2*( n - 1 )
+              else if( icompz==1 ) then
+                 lgn = int( log( real( n,KIND=qp) ) / log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 lwmin = n*n
+                 lrwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                 liwmin = 6 + 6*n + 5*n*lgn
+              else if( icompz==2 ) then
+                 lwmin = 1
+                 lrwmin = 1 + 4*n + 2*n**2
+                 liwmin = 3 + 5*n
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEDC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_qsterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_qsterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_qsterf( n, d, e, info )
+              go to 70
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_wsteqr( compz, n, d, e, z, ldz, rwork, info )
+           else
+              ! if compz = 'i', we simply call stdlib_qstedc instead.
+              if( icompz==2 ) then
+                 call stdlib_qlaset( 'FULL', n, n, zero, one, rwork, n )
+                 ll = n*n + 1
+                 call stdlib_qstedc( 'I', n, d, e, rwork, n,rwork( ll ), lrwork-ll+1, iwork, &
+                           liwork, info )
+                 do j = 1, n
+                    do i = 1, n
+                       z( i, j ) = rwork( ( j-1 )*n+i )
+                    end do
+                 end do
+                 go to 70
+              end if
+              ! from now on, only option left to be handled is compz = 'v',
+              ! i.e. icompz = 1.
+              ! scale.
+              orgnrm = stdlib_qlanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 70
+              eps = stdlib_qlamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              30 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 40 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 40
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_qlanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_qlascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_qlascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    call stdlib_wlaed0( n, m, d( start ), e( start ), z( 1, start ),ldz, work, n, &
+                              rwork, iwork, info )
+                    if( info>0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 70
+                    end if
+                    ! scale back.
+                    call stdlib_qlascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    call stdlib_qsteqr( 'I', m, d( start ), e( start ), rwork, m,rwork( m*m+1 ), &
+                              info )
+                    call stdlib_wlacrm( n, m, z( 1, start ), ldz, rwork, m, work, n,rwork( m*m+1 )&
+                               )
+                    call stdlib_wlacpy( 'A', n, m, work, n, z( 1, start ), ldz )
+                    if( info>0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 70
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 30
+              end if
+              ! endwhile
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                i = ii - 1
+                k = i
+                p = d( i )
+                do j = ii, n
+                   if( d( j )<p ) then
+                      k = j
+                      p = d( j )
+                   end if
+                end do
+                if( k/=i ) then
+                   d( k ) = d( i )
+                   d( i ) = p
+                   call stdlib_wswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                end if
+              end do
+           end if
+           70 continue
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_wstedc
+
+     !> ZSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> ZSTEGR is a compatibility wrapper around the improved ZSTEMR routine.
+     !> See ZSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : ZSTEGR and ZSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_wstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*)
+           complex(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_wstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_wstegr
+
+     !> ZSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+     !> Although the eigenvectors are real, they are stored in a complex
+     !> array, which may be passed to ZUNMTR or ZUPMTR for back
+     !> transformation to the eigenvectors of a complex Hermitian matrix
+     !> which was reduced to tridiagonal form.
+
+     pure subroutine stdlib_wstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(qp), intent(in) :: d(*), e(*), w(*)
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(out) :: z(ldz,*)
+       ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: odm3 = 1.0e-3_qp
+           real(qp), parameter :: odm1 = 1.0e-1_qp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, jr, nblk, nrmchk
+           real(qp) :: dtpcrt, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, tol, xj, xjm, &
+                     ztr
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_qlamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_qlarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_180: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              dtpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_170: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_180
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 140
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_qlarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_qcopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_qcopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_qcopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_qlagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 120
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_qscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_qlagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 110
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ztr = zero
+                       do jr = 1, blksiz
+                          ztr = ztr + work( indrv1+jr )*real( z( b1-1+jr, i ),KIND=qp)
+                       end do
+                       do jr = 1, blksiz
+                          work( indrv1+jr ) = work( indrv1+jr ) -ztr*real( z( b1-1+jr, i ),&
+                                    KIND=qp)
+                       end do
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 110 continue
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<dtpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 130
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 120 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 130 continue
+                 scl = one / stdlib_qnrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_iqamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_qscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 140 continue
+                 do i = 1, n
+                    z( i, j ) = czero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = cmplx( work( indrv1+i ), zero,KIND=qp)
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_170
+           end do loop_180
+           return
+     end subroutine stdlib_wstein
+
+     !> ZSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.ZSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+     !> 2. LAPACK routines can be used to reduce a complex Hermitean matrix to
+     !> real symmetric tridiagonal form.
+     !> (Any complex Hermitean tridiagonal matrix has real values on its diagonal
+     !> and potentially complex numbers on its off-diagonals. By applying a
+     !> similarity transform with an appropriate diagonal matrix
+     !> diag(1,e^{i \phy_1}, ... , e^{i \phy_{n-1}}), the complex Hermitean
+     !> matrix can be transformed into a real symmetric matrix and complex
+     !> arithmetic can be entirely avoided.)
+     !> While the eigenvectors of the real symmetric tridiagonal matrix are real,
+     !> the eigenvectors of original complex Hermitean matrix have complex entries
+     !> in general.
+     !> Since LAPACK drivers overwrite the matrix data with the eigenvectors,
+     !> ZSTEMR accepts complex workspace to facilitate interoperability
+     !> with ZUNMTR or ZUPMTR.
+
+     pure subroutine stdlib_wstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: w(*), work(*)
+           complex(qp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: minrgp = 1.0e-3_qp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(qp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_qstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_qlarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_wlarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_qlarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           eps = stdlib_qlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_qlarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_qlae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_qlaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+              ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_qlarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_qlanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_qscal( n, scale, d, 1 )
+                 call stdlib_qscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_qlarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_qlarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_qcopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_qlarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_qlarre computes the eigenvalues to less than full precision.
+                 ! stdlib_wlarrv will refine the eigenvalue approximations, and we only
+                 ! need less accurate initial bisection in stdlib_qlarre.
+                 ! note: these settings do only affect the subset case and stdlib_qlarre
+                 rtol1 = sqrt(eps)
+                 rtol2 = max( sqrt(eps)*5.0e-3_qp, four * eps )
+              endif
+              call stdlib_qlarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_qlarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_wlarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_qlarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_wlarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_qlarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_qlarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_qscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_qlasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_wswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_wstemr
+
+     !> ZSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+
+     pure subroutine stdlib_wsteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(qp), intent(inout) :: d(*), e(*)
+           real(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(qp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = cone
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_qlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_qlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_wlaset( 'FULL', n, n, czero, cone, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_qlanst( 'I', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_qlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_qlae2 or stdlib_dlaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_qlaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_wlasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_qlae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_qlapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_qlartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_wlasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_qlae2 or stdlib_dlaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_qlaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_wlasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_qlae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_qlapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_qlartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_wlasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_qlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_qlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_qlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_qlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot==nmaxit ) then
+              do i = 1, n - 1
+                 if( e( i )/=zero )info = info + 1
+              end do
+              return
+           end if
+           go to 10
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_qlasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_wswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wsteqr
+
+     !> ZSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_wsytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_wsycon
+
+     !> ZSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_wsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(in) :: anorm
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(qp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_wsytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_wsycon_rook
+
+     !> ZSYCONV: converts A given by ZHETRF into L and D or vice-versa.
+     !> Get nondiagonal elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_wsyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           complex(qp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 i=n
+                 e(1)=czero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=czero
+                       a(i-1,i)=czero
+                       i=i-1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i-1
+                 end do
+                 ! convert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                          do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i,j)
+                            a(i,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       if( i < n) then
+                          do j= i+1,n
+                             temp=a(ip,j)
+                             a(ip,j)=a(i-1,j)
+                             a(i-1,j)=temp
+                          end do
+                       endif
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+                 ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+              ! a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 i=1
+                 e(n)=czero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=czero
+                       a(i+1,i)=czero
+                       i=i+1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i+1
+                 end do
+                 ! convert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(ip,j)
+                             a(ip,j)=a(i,j)
+                             a(i,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(ip,j)
+                             a(ip,j)=a(i+1,j)
+                             a(i+1,j)=temp
+                          end do
+                       endif
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+                 ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wsyconv
+
+     !> If parameter WAY = 'C':
+     !> ZSYCONVF: converts the factorization output format used in
+     !> ZSYTRF provided on entry in parameter A into the factorization
+     !> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in ZSYTRF into
+     !> the format used in ZSYTRF_RK (or ZSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> ZSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in ZSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) into the format used in ZSYTRF.
+     !> ZSYCONVF can also convert in Hermitian matrix case, i.e. between
+     !> formats used in ZHETRF and ZHETRF_RK (or ZHETRF_BK).
+
+     pure subroutine stdlib_wsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_wswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_wswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_wswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_wswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_wsyconvf
+
+     !> If parameter WAY = 'C':
+     !> ZSYCONVF_ROOK: converts the factorization output format used in
+     !> ZSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for ZSYTRF_ROOK and
+     !> ZSYTRF_RK (or ZSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> ZSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in ZSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for ZSYTRF_ROOK and
+     !> ZSYTRF_RK (or ZSYTRF_BK) is the same and is not converted.
+     !> ZSYCONVF_ROOK can also convert in Hermitian matrix case, i.e. between
+     !> formats used in ZHETRF_ROOK and ZHETRF_RK (or ZHETRF_BK).
+
+     pure subroutine stdlib_wsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_wswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_wswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_wswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_wswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_wswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_wswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_wsyconvf_rook
+
+     !> ZSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_wsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+           real(qp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(qp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(qp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log,max,min,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'ZSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_qp / s( j )
+           end do
+           tol = one / sqrt( 2.0_qp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + s( i ) * real( work( i ),KIND=qp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_wlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( real( work( i ),KIND=qp) - t*si )
+                 c0 = -(t*si)*si + 2 * real( work( i ),KIND=qp) * si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + real( work( i ),KIND=qp) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_qlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_qlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_wsyequb
+
+     !> ZSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_wsymv( uplo, n, alpha, a, lda, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           complex(qp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), x(*)
+           complex(qp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           complex(qp) :: temp1, temp2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( lda<max( 1, n ) ) then
+              info = 5
+           else if( incx==0 ) then
+              info = 7
+           else if( incy==0 ) then
+              info = 10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when a is stored in upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + temp1*a( j, j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do i = 1, j - 1
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*a( j, j ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           else
+              ! form  y  when a is stored in lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*a( j, j )
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*a( j, j )
+                    ix = jx
+                    iy = jy
+                    do i = j + 1, n
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wsymv
+
+     !> ZSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_wsyr( uplo, n, alpha, x, incx, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, lda, n
+           complex(qp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, kx
+           complex(qp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           else if( lda<max( 1, n ) ) then
+              info = 7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when a is stored in upper triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           else
+              ! form  a  when a is stored in lower triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = jx
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wsyr
+
+     !> ZSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_wsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_wcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_wsymv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_wsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_waxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_wsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_wsyrfs
+
+     !> ZSYSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_wsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_wsytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=qp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_wsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_wsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_wsytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsysv
+
+     !> ZSYSV computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_wsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_wsytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_wsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_wsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_wsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsysv_aa
+
+     !> ZSYSV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZSYTRF_RK is called to compute the factorization of a complex
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine ZSYTRS_3.
+
+     pure subroutine stdlib_wsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_wsytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=qp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**t)*(p**t) or
+           ! a = p*u*d*(u**t)*(p**t).
+           call stdlib_wsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_wsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsysv_rk
+
+     !> ZSYSV_ROOK: computes the solution to a complex system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZSYTRF_ROOK is called to compute the factorization of a complex
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling ZSYTRS_ROOK.
+
+     pure subroutine stdlib_wsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_wsytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=qp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_wsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_wsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsysv_rook
+
+     !> ZSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_wsysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: af(ldaf,*)
+           complex(qp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(qp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'ZSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_wlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_wsytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_wlansy( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_wsycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_wlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_wsytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_wsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_qlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsysvx
+
+     !> ZSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_wsyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(qp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_wswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_wswap( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_wsyswapr
+
+     !> ZSYTF2: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wsytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(qp) :: absakk, alpha, colmax, rowmax
+           complex(qp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_wswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / a( k, k )
+                    call stdlib_wsyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_wscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_qisnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_iwamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_wswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / a( k, k )
+                       call stdlib_wsyr( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_wscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_wsytf2
+
+     !> ZSYTF2_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_wsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(qp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_wswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_wswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_wswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_wswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_wswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_wsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_wscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_wswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_wswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_wswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_wswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_wswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_wsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_wscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_wsytf2_rk
+
+     !> ZSYTF2_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_wsytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: sevten = 17.0e+0_qp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(qp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(qp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=qp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_qlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_iwamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_iwamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_iwamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_wswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_wswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_wswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_wsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_wscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_iwamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_iwamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_iwamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_wswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_wswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_wswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_wsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_wscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_wsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_wsytf2_rook
+
+     !> ZSYTRF: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlasyf( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_wsytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_wsytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsytrf
+
+     !> ZSYTRF_AA: computes the factorization of a complex symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a complex symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'ZSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_wcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_wlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_wlasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_wswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = cone
+                    call stdlib_wcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_wscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_wgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_wgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j1-k2, j3 ), 1,cone, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_wgemm
+                       call stdlib_wgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-cone, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,cone, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_wcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_wcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_wlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_wlasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_wswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = cone
+                    call stdlib_wcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_wscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_wgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_wgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_wgemm
+                       call stdlib_wgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-cone, &
+                       work( j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda )
+                                 
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_wcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsytrf_aa
+
+     !> ZSYTRF_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_wsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_wsytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_wswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_wsytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_wswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsytrf_rk
+
+     !> ZSYTRF_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_wsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_wlasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_wsytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_wlasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_wlasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_wsytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wsytrf_rook
+
+     !> ZSYTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> ZSYTRF.
+
+     pure subroutine stdlib_wsytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_wdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_wcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_wdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_wswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_wdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_wcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_wdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_wswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_wsytri
+
+     !> ZSYTRI_ROOK: computes the inverse of a complex symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by ZSYTRF_ROOK.
+
+     pure subroutine stdlib_wsytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(qp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_wcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_wdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_wcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_wdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_wswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_wswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_wswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_wswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_wcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_wdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_wdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_wcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_wsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_wdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_wswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_wswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_wswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_wswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_wsytri_rook
+
+     !> ZSYTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF.
+
+     pure subroutine stdlib_wsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_wgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,a( 1, k+1 ), 1, cone, b(&
+                            k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_wsytrs
+
+     !> ZSYTRS2: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF and converted by ZSYCONV.
+
+     pure subroutine stdlib_wsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_wsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_wtrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_wscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_wtrsm('L','U','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_wtrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_wscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_wtrsm('L','L','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_wsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_wsytrs2
+
+     !> ZSYTRS_3: solves a system of linear equations A * X = B with a complex
+     !> symmetric matrix A using the factorization computed
+     !> by ZSYTRF_RK or ZSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_wsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*), e(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_wtrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_wscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_wtrsm( 'L', 'U', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_wtrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_wscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_wtrsm('L', 'L', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_wsytrs_3
+
+     !> ZSYTRS_AA: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by ZSYTRF_AA.
+
+     pure subroutine stdlib_wsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_wtrsm( 'L', 'U', 'T', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_wlacpy( 'F', 1, n, a( 1, 1 ), lda+1, work( n ), 1)
+              if( n>1 ) then
+                 call stdlib_wlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                 call stdlib_wlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_wgtsv( n, nrhs, work( 1 ), work( n ), work( 2*n ), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_wtrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (u \ (t \ (u**t \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_wtrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_wlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                 call stdlib_wlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                 call stdlib_wlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_wgtsv( n, nrhs, work( 1 ), work(n), work( 2*n ), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute (l**t \ b) -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_wtrsm( 'L', 'L', 'T', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wsytrs_aa
+
+     !> ZSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a complex symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF_ROOK.
+
+     pure subroutine stdlib_wsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(qp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_wgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_wgeru( k-2, nrhs,-cone, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_wgeru( k-2, nrhs,-cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 )&
+                              , ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, &
+                           cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, cone, &
+                              b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 ), 1, cone,&
+                               b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_wscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_wswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_wgeru( n-k-1, nrhs,-cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_wgeru( n-k-1, nrhs,-cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_wgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_wswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_wswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_wsytrs_rook
+
+     !> ZTBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_wtbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( n, 1 ),KIND=qp)
+           ! compute the 1-norm of the triangular matrix a or a**h.
+           anorm = stdlib_wlantb( norm, uplo, diag, n, kd, ab, ldab, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the 1-norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_wlatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_wlatbs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, kd, ab, ldab,&
+                               work, scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iwamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_wdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_wtbcon
+
+     !> ZTBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTBTRS or some other
+     !> means before entering this routine.  ZTBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_wtbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_wtbmv( uplo, trans, diag, n, kd, ab, ldab, work, 1 )
+              call stdlib_waxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wtbsv( uplo, transt, diag, n, kd, ab, ldab, work,1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wtbsv( uplo, transn, diag, n, kd, ab, ldab, work,1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_wtbrfs
+
+     !> ZTBTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by-NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_wtbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: ab(ldab,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==czero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==czero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_wtbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_wtbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> ZTFSM:  solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**H.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_wtfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           complex(qp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(0:*)
+           complex(qp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_qp,0e+0_qp)
+           if( alpha==czero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = czero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_wtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_wtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_wgemm( 'N', 'N', m2, n, m1, -cone, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_wtrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m ), m, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_wtrsm( 'L', 'L', 'C', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_wtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_wgemm( 'C', 'N', m1, n, m2, -cone, a( m1 ),m, b( m1, 0 ),&
+                                        ldb, alpha, b, ldb )
+                             call stdlib_wtrsm( 'L', 'L', 'C', diag, m1, n, cone,a( 0 ), m, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_wgemm( 'C', 'N', m2, n, m1, -cone, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m1, n, m2, -cone, a( 0 ), m,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, m1, n, cone,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_wtrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_wtrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_wgemm( 'C', 'N', m2, n, m1, -cone,a( m1*m1 ), m1, b, ldb,&
+                                        alpha,b( m1, 0 ), ldb )
+                             call stdlib_wtrsm( 'L', 'L', 'N', diag, m2, n, cone,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_wtrsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_wtrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_wgemm( 'N', 'N', m1, n, m2, -cone,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_wtrsm( 'L', 'U', 'N', diag, m1, n, cone,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_wgemm( 'N', 'N', m2, n, m1, -cone, a( 0 ), m2,b, ldb, alpha,&
+                                     b( m1, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, m2, n, cone,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_wgemm( 'C', 'N', m1, n, m2, -cone, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, m1, n, cone,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_wgemm( 'N', 'N', k, n, k, -cone, a( k+1 ),m+1, b, ldb, &
+                                    alpha, b( k, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, k, n, cone,a( 0 ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'C', 'N', k, n, k, -cone, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, k, n, cone,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_wgemm( 'C', 'N', k, n, k, -cone, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, k, n, cone,a( k ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', k, n, k, -cone, a( 0 ), m+1,b( k, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, k, n, cone,a( k+1 ), m+1, b, &
+                                    ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_wgemm( 'C', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, k, n, cone,a( 0 ), k, b( k, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_wgemm( 'N', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b( k, 0 )&
+                                    , ldb,alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, k, n, cone,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_wgemm( 'N', 'N', k, n, k, -cone, a( 0 ), k, b,ldb, alpha, b(&
+                                     k, 0 ), ldb )
+                          call stdlib_wtrsm( 'L', 'L', 'N', diag, k, n, cone,a( k*k ), k, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'L', 'L', 'C', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'C', 'N', k, n, k, -cone, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_wtrsm( 'L', 'U', 'N', diag, k, n, cone,a( k*( k+1 ) ), k, b,&
+                                     ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, n1, cone,a( 0 ), n, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, n1 ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n ), n, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n, alpha, b( 0, 0 ), ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, n1, cone,a( n2 ), n, b( 0, 0 &
+                                    ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1*n1 &
+                                    ), n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, n1, cone,a( 0 ), n1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, n2, cone,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, n1 ),ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, n2, cone,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, n1, cone,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, k, cone,a( 1 ), n+1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, k, cone,a( 0 ), n+1, b( 0, k &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, k ),ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, k, cone,a( k ), n+1, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, 0 ),ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, k, cone,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_wgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, k, cone,a( k ), k, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_wgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, k, cone,a( 0 ), k, b( 0, k ),&
+                                     ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_wtrsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_wgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_wtrsm( 'R', 'L', 'C', diag, m, k, cone,a( k*k ), k, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_wtrsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_wgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_wtrsm( 'R', 'U', 'C', diag, m, k, cone,a( ( k+1 )*k ), k, b(&
+                                     0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtfsm
+
+     !> ZTFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wtftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_wtrtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'L', 'N', diag, n2, n1, -cone, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_wtrtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'U', 'C', diag, n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_wtrtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'L', 'C', diag, n1, n2, -cone, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_wtrtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'U', 'N', diag, n1, n2, cone, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_wtrtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'U', 'N', diag, n1, n2, -cone, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_wtrtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'L', 'C', diag, n1, n2, cone, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_wtrtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'U', 'C', diag, n2, n1, -cone,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_wtrtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'L', 'N', diag, n2, n1, cone,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_wtrtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'L', 'N', diag, k, k, -cone, a( 1 ),n+1, a( k+1 ), n+&
+                              1 )
+                    call stdlib_wtrtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'U', 'C', diag, k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 &
+                              )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_wtrtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'L', 'C', diag, k, k, -cone, a( k+1 ),n+1, a( 0 ), n+&
+                              1 )
+                    call stdlib_wtrtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'U', 'N', diag, k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_wtrtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'U', 'N', diag, k, k, -cone, a( k ), k,a( k*( k+1 ) ),&
+                               k )
+                    call stdlib_wtrtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'L', 'C', diag, k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_wtrtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'R', 'U', 'C', diag, k, k, -cone,a( k*( k+1 ) ), k, a( 0 ),&
+                               k )
+                    call stdlib_wtrtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_wtrmm( 'L', 'L', 'N', diag, k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtftri
+
+     !> ZTFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_wtfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(out) :: ap(0:*)
+           complex(qp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Intrinsic Functions 
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = conjg( arf( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtfttp
+
+     !> ZTFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_wtfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(qp), intent(out) :: a(0:lda-1,0:*)
+           complex(qp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    a( 0, 0 ) = arf( 0 )
+                 else
+                    a( 0, 0 ) = conjg( arf( 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtfttr
+
+     !> ZTGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of complex matrices (S,P), where S and P are upper triangular.
+     !> Matrix pairs of this type are produced by the generalized Schur
+     !> factorization of a complex matrix pair (A,B):
+     !> A = Q*S*Z**H,  B = Q*P*Z**H
+     !> as computed by ZGGHRD + ZHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal elements of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the unitary factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_wtgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: p(ldp,*), s(lds,*)
+           complex(qp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, ilall, ilback, ilbbad, ilcomp, lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, im, iside, isrc, j, je, jr
+           real(qp) :: acoefa, acoeff, anorm, ascale, bcoefa, big, bignum, bnorm, bscale, dmin, &
+                     safmin, sbeta, scale, small, temp, ulp, xmax
+           complex(qp) :: bcoeff, ca, cb, d, salpha, sum, suma, sumb, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(qp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=qp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors
+           if( .not.ilall ) then
+              im = 0
+              do j = 1, n
+                 if( select( j ) )im = im + 1
+              end do
+           else
+              im = n
+           end if
+           ! check diagonal of b
+           ilbbad = .false.
+           do j = 1, n
+              if( aimag( p( j, j ) )/=zero )ilbbad = .true.
+           end do
+           if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_qlabad( safmin, big )
+           ulp = stdlib_qlamch( 'EPSILON' )*stdlib_qlamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part of a and b to check for possible overflow in the triangular
+           ! solver.
+           anorm = abs1( s( 1, 1 ) )
+           bnorm = abs1( p( 1, 1 ) )
+           rwork( 1 ) = zero
+           rwork( n+1 ) = zero
+           do j = 2, n
+              rwork( j ) = zero
+              rwork( n+j ) = zero
+              do i = 1, j - 1
+                 rwork( j ) = rwork( j ) + abs1( s( i, j ) )
+                 rwork( n+j ) = rwork( n+j ) + abs1( p( i, j ) )
+              end do
+              anorm = max( anorm, rwork( j )+abs1( s( j, j ) ) )
+              bnorm = max( bnorm, rwork( n+j )+abs1( p( j, j ) ) )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              loop_140: do je = 1, n
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig + 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=qp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vl( jr, ieig ) = czero
+                       end do
+                       vl( ieig, ieig ) = cone
+                       cycle loop_140
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                         ! h
+                       ! y  ( a a - b b ) = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=qp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=qp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                    ! h
+                    ! triangular solve of  (a a - b b)  y = 0
+                                            ! h
+                    ! (rowwise in  (a a - b b) , or columnwise in a a - b b)
+                    loop_100: do j = je + 1, n
+                       ! compute
+                             ! j-1
+                       ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                             ! k=je
+                       ! (scale if necessary)
+                       temp = one / xmax
+                       if( acoefa*rwork( j )+bcoefa*rwork( n+j )>bignum*temp ) then
+                          do jr = je, j - 1
+                             work( jr ) = temp*work( jr )
+                          end do
+                          xmax = one
+                       end if
+                       suma = czero
+                       sumb = czero
+                       do jr = je, j - 1
+                          suma = suma + conjg( s( jr, j ) )*work( jr )
+                          sumb = sumb + conjg( p( jr, j ) )*work( jr )
+                       end do
+                       sum = acoeff*suma - conjg( bcoeff )*sumb
+                       ! form x(j) = - sum / conjg( a*s(j,j) - b*p(j,j) )
+                       ! with scaling and perturbation of the denominator
+                       d = conjg( acoeff*s( j, j )-bcoeff*p( j, j ) )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=qp)
+                       if( abs1( d )<one ) then
+                          if( abs1( sum )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( sum )
+                             do jr = je, j - 1
+                                work( jr ) = temp*work( jr )
+                             end do
+                             xmax = temp*xmax
+                             sum = temp*sum
+                          end if
+                       end if
+                       work( j ) = stdlib_wladiv( -sum, d )
+                       xmax = max( xmax, abs1( work( j ) ) )
+                    end do loop_100
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_wgemv( 'N', n, n+1-je, cone, vl( 1, je ), ldvl,work( je ), 1, &
+                                 czero, work( n+1 ), 1 )
+                       isrc = 2
+                       ibeg = 1
+                    else
+                       isrc = 1
+                       ibeg = je
+                    end if
+                    ! copy and scale eigenvector into column of vl
+                    xmax = zero
+                    do jr = ibeg, n
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = ibeg, n
+                          vl( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       ibeg = n + 1
+                    end if
+                    do jr = 1, ibeg - 1
+                       vl( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_140
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              loop_250: do je = n, 1, -1
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig - 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=qp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vr( jr, ieig ) = czero
+                       end do
+                       vr( ieig, ieig ) = cone
+                       cycle loop_250
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                    ! ( a a - b b ) x  = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=qp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=qp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                    ! triangular solve of  (a a - b b) x = 0  (columnwise)
+                    ! work(1:j-1) contains sums w,
+                    ! work(j+1:je) contains x
+                    do jr = 1, je - 1
+                       work( jr ) = acoeff*s( jr, je ) - bcoeff*p( jr, je )
+                    end do
+                    work( je ) = cone
+                    loop_210: do j = je - 1, 1, -1
+                       ! form x(j) := - w(j) / d
+                       ! with scaling and perturbation of the denominator
+                       d = acoeff*s( j, j ) - bcoeff*p( j, j )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=qp)
+                       if( abs1( d )<one ) then
+                          if( abs1( work( j ) )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( work( j ) )
+                             do jr = 1, je
+                                work( jr ) = temp*work( jr )
+                             end do
+                          end if
+                       end if
+                       work( j ) = stdlib_wladiv( -work( j ), d )
+                       if( j>1 ) then
+                          ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                          if( abs1( work( j ) )>one ) then
+                             temp = one / abs1( work( j ) )
+                             if( acoefa*rwork( j )+bcoefa*rwork( n+j )>=bignum*temp ) then
+                                do jr = 1, je
+                                   work( jr ) = temp*work( jr )
+                                end do
+                             end if
+                          end if
+                          ca = acoeff*work( j )
+                          cb = bcoeff*work( j )
+                          do jr = 1, j - 1
+                             work( jr ) = work( jr ) + ca*s( jr, j ) -cb*p( jr, j )
+                          end do
+                       end if
+                    end do loop_210
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_wgemv( 'N', n, je, cone, vr, ldvr, work, 1,czero, work( n+1 ), &
+                                 1 )
+                       isrc = 2
+                       iend = n
+                    else
+                       isrc = 1
+                       iend = je
+                    end if
+                    ! copy and scale eigenvector into column of vr
+                    xmax = zero
+                    do jr = 1, iend
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = 1, iend
+                          vr( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       iend = 0
+                    end if
+                    do jr = iend + 1, n
+                       vr( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_250
+           end if
+           return
+     end subroutine stdlib_wtgevc
+
+     !> ZTGEX2: swaps adjacent diagonal 1 by 1 blocks (A11,B11) and (A22,B22)
+     !> in an upper triangular matrix pair (A, B) by an unitary equivalence
+     !> transformation.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_wtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: twenty = 2.0e+1_qp
+           integer(ilp), parameter :: ldst = 2
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, m
+           real(qp) :: cq, cz, eps, sa, sb, scale, smlnum, sum, thresha, threshb
+           complex(qp) :: cdum, f, g, sq, sz
+           ! Local Arrays 
+           complex(qp) :: s(ldst,ldst), t(ldst,ldst), work(8)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 )return
+           m = ldst
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block in (a, b)
+           call stdlib_wlacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_wlacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute the threshold for testing the acceptance of swapping.
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           scale = real( czero,KIND=qp)
+           sum = real( cone,KIND=qp)
+           call stdlib_wlacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_wlacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+           call stdlib_wlassq( m*m, work, 1, scale, sum )
+           sa = scale*sqrt( sum )
+           scale = real( czero,KIND=qp)
+           sum = real( cone,KIND=qp)
+           call stdlib_wlassq( m*m, work(m*m+1), 1, scale, sum )
+           sb = scale*sqrt( sum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*sa, smlnum )
+           threshb = max( twenty*eps*sb, smlnum )
+           ! compute unitary ql and rq that swap 1-by-1 and 1-by-1 blocks
+           ! using givens rotations and perform the swap tentatively.
+           f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+           g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+           sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+           sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+           call stdlib_wlartg( g, f, cz, sz, cdum )
+           sz = -sz
+           call stdlib_wrot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, cz, conjg( sz ) )
+           call stdlib_wrot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, cz, conjg( sz ) )
+           if( sa>=sb ) then
+              call stdlib_wlartg( s( 1, 1 ), s( 2, 1 ), cq, sq, cdum )
+           else
+              call stdlib_wlartg( t( 1, 1 ), t( 2, 1 ), cq, sq, cdum )
+           end if
+           call stdlib_wrot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, cq, sq )
+           call stdlib_wrot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, cq, sq )
+           ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                ! and  |t21| <= o(eps f-norm((b)))
+           weak = abs( s( 2, 1 ) )<=thresha .and.abs( t( 2, 1 ) )<=threshb
+           if( .not.weak )go to 20
+           if( wands ) then
+              ! strong stability test:
+                 ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                 ! and
+                 ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+              call stdlib_wlacpy( 'FULL', m, m, s, ldst, work, m )
+              call stdlib_wlacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+              call stdlib_wrot( 2, work, 1, work( 3 ), 1, cz, -conjg( sz ) )
+              call stdlib_wrot( 2, work( 5 ), 1, work( 7 ), 1, cz, -conjg( sz ) )
+              call stdlib_wrot( 2, work, 2, work( 2 ), 2, cq, -sq )
+              call stdlib_wrot( 2, work( 5 ), 2, work( 6 ), 2, cq, -sq )
+              do i = 1, 2
+                 work( i ) = work( i ) - a( j1+i-1, j1 )
+                 work( i+2 ) = work( i+2 ) - a( j1+i-1, j1+1 )
+                 work( i+4 ) = work( i+4 ) - b( j1+i-1, j1 )
+                 work( i+6 ) = work( i+6 ) - b( j1+i-1, j1+1 )
+              end do
+              scale = real( czero,KIND=qp)
+              sum = real( cone,KIND=qp)
+              call stdlib_wlassq( m*m, work, 1, scale, sum )
+              sa = scale*sqrt( sum )
+              scale = real( czero,KIND=qp)
+              sum = real( cone,KIND=qp)
+              call stdlib_wlassq( m*m, work(m*m+1), 1, scale, sum )
+              sb = scale*sqrt( sum )
+              strong = sa<=thresha .and. sb<=threshb
+              if( .not.strong )go to 20
+           end if
+           ! if the swap is accepted ("weakly" and "strongly"), apply the
+           ! equivalence transformations to the original matrix pair (a,b)
+           call stdlib_wrot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, cz,conjg( sz ) )
+           call stdlib_wrot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, cz,conjg( sz ) )
+           call stdlib_wrot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda, cq, sq )
+           call stdlib_wrot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb, cq, sq )
+           ! set  n1 by n2 (2,1) blocks to 0
+           a( j1+1, j1 ) = czero
+           b( j1+1, j1 ) = czero
+           ! accumulate transformations into q and z if requested.
+           if( wantz )call stdlib_wrot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, cz,conjg( sz ) )
+                     
+           if( wantq )call stdlib_wrot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, cq,conjg( sq ) )
+                     
+           ! exit with info = 0 if swap was successfully performed.
+           return
+           ! exit with info = 1 if swap was rejected.
+           20 continue
+           info = 1
+           return
+     end subroutine stdlib_wtgex2
+
+     !> ZTGEXC: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A,B), using an unitary equivalence transformation
+     !> (A, B) := Q * (A, B) * Z**H, so that the diagonal block of (A, B) with
+     !> row index IFST is moved to row ILST.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_wtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ifst, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(inout) :: ilst
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: here
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              here = ifst
+              10 continue
+              ! swap with next one below
+              call stdlib_wtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here + 1
+              if( here<ilst )go to 10
+              here = here - 1
+           else
+              here = ifst - 1
+              20 continue
+              ! swap with next one above
+              call stdlib_wtgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here - 1
+              if( here>=ilst )go to 20
+              here = here + 1
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_wtgexc
+
+     !> ZTGSEN: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A, B) (in terms of an unitary equivalence trans-
+     !> formation Q**H * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the pair (A,B). The leading
+     !> columns of Q and Z form unitary bases of the corresponding left and
+     !> right eigenspaces (deflating subspaces). (A, B) must be in
+     !> generalized Schur canonical form, that is, A and B are both upper
+     !> triangular.
+     !> ZTGSEN also computes the generalized eigenvalues
+     !> w(j)= ALPHA(j) / BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, the routine computes estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_wtgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alpha, beta, q, &
+               ldq, z, ldz, m, pl, pr, dif,work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(qp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: dif(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(qp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, ks, liwmin, lwmin, mn2, n1, n2
+           real(qp) :: dscale, dsum, rdscal, safmin
+           complex(qp) :: temp1, temp2
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -13
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSEN', -info )
+              return
+           end if
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              if( k<n ) then
+                 if( select( k ) )m = m + 1
+              else
+                 if( select( n ) )m = m + 1
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 2*m*( n-m ) )
+              liwmin = max( 1, n+2 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*m*( n-m ) )
+              liwmin = max( 1, 2*m*( n-m ), n+2 )
+           else
+              lwmin = 1
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -21
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -23
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_wlassq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_wlassq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 70
+           end if
+           ! get machine constant
+           safmin = stdlib_qlamch( 'S' )
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           do k = 1, n
+              swap = select( k )
+              if( swap ) then
+                 ks = ks + 1
+                 ! swap the k-th block to position ks. compute unitary q
+                 ! and z that will swap adjacent diagonal blocks in (a, b).
+                 if( k/=ks )call stdlib_wtgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, k,&
+                            ks, ierr )
+                 if( ierr>0 ) then
+                    ! swap is rejected: exit.
+                    info = 1
+                    if( wantp ) then
+                       pl = zero
+                       pr = zero
+                    end if
+                    if( wantd ) then
+                       dif( 1 ) = zero
+                       dif( 2 ) = zero
+                    end if
+                    go to 70
+                 end if
+              end if
+           end do
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l:
+                         ! a11 * r - l * a22 = a12
+                         ! b11 * r - l * b22 = b12
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              call stdlib_wlacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_wlacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              ijb = 0
+              call stdlib_wtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto
+              ! left and right eigenspaces
+              rdscal = zero
+              dsum = one
+              call stdlib_wlassq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_wlassq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu estimate.
+                 call stdlib_wtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl estimate.
+                 call stdlib_wtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_wlacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_wlacn2( mn2, work( mn2+1 ), work, dif( 1 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_wtgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_wtgsyl( 'C', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_wlacn2( mn2, work( mn2+1 ), work, dif( 2 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_wtgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_wtgsyl( 'C', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           ! if b(k,k) is complex, make it real and positive (normalization
+           ! of the generalized schur form) and store the generalized
+           ! eigenvalues of reordered pair (a, b)
+           do k = 1, n
+              dscale = abs( b( k, k ) )
+              if( dscale>safmin ) then
+                 temp1 = conjg( b( k, k ) / dscale )
+                 temp2 = b( k, k ) / dscale
+                 b( k, k ) = dscale
+                 call stdlib_wscal( n-k, temp1, b( k, k+1 ), ldb )
+                 call stdlib_wscal( n-k+1, temp1, a( k, k ), lda )
+                 if( wantq )call stdlib_wscal( n, temp2, q( 1, k ), 1 )
+              else
+                 b( k, k ) = cmplx( zero, zero,KIND=qp)
+              end if
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+           end do
+           70 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_wtgsen
+
+     !> ZTGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two complex upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine ZGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**H *A*Q = D1*( 0 R ),    V**H *B*Q = D2*( 0 R ),
+     !> where U, V and Q are unitary matrices.
+     !> R is a nonsingular upper triangular matrix, and D1
+     !> and D2 are ``diagonal'' matrices, which are of the following
+     !> structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the unitary transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_wtgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(qp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(qp), intent(out) :: alpha(*), beta(*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(qp), parameter :: hugenum = huge(zero)
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(qp) :: a1, a3, b1, b3, csq, csu, csv, error, gamma, rwk, ssmin
+           complex(qp) :: a2, b2, snq, snu, snv
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_wlaset( 'FULL', m, m, czero, cone, u, ldu )
+           if( initv )call stdlib_wlaset( 'FULL', p, p, czero, cone, v, ldv )
+           if( initq )call stdlib_wlaset( 'FULL', n, n, czero, cone, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = czero
+                    a3 = zero
+                    if( k+i<=m )a1 = real( a( k+i, n-l+i ),KIND=qp)
+                    if( k+j<=m )a3 = real( a( k+j, n-l+j ),KIND=qp)
+                    b1 = real( b( i, n-l+i ),KIND=qp)
+                    b3 = real( b( j, n-l+j ),KIND=qp)
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_wlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**h *a
+                    if( k+j<=m )call stdlib_wrot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, conjg( snu ) )
+                    ! update i-th and j-th rows of matrix b: v**h *b
+                    call stdlib_wrot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, conjg( snv ) &
+                              )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_wrot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_wrot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = czero
+                       b( i, n-l+j ) = czero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = czero
+                       b( j, n-l+i ) = czero
+                    end if
+                    ! ensure that the diagonal elements of a and b are real.
+                    if( k+i<=m )a( k+i, n-l+i ) = real( a( k+i, n-l+i ),KIND=qp)
+                    if( k+j<=m )a( k+j, n-l+j ) = real( a( k+j, n-l+j ),KIND=qp)
+                    b( i, n-l+i ) = real( b( i, n-l+i ),KIND=qp)
+                    b( j, n-l+j ) = real( b( j, n-l+j ),KIND=qp)
+                    ! update unitary matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_wrot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_wrot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_wrot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_wcopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_wcopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_wlapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = real( a( k+i, n-l+i ),KIND=qp)
+              b1 = real( b( i, n-l+i ),KIND=qp)
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 if( gamma<zero ) then
+                    call stdlib_wdscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_wdscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_qlartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_wdscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_wdscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_wcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_wcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_wtgsja
+
+     !> ZTGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B).
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+
+     pure subroutine stdlib_wtgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: dif(*), s(*)
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, k, ks, lwmin, n1, n2
+           real(qp) :: bignum, cond, eps, lnrm, rnrm, scale, smlnum
+           complex(qp) :: yhax, yhbx
+           ! Local Arrays 
+           complex(qp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 do k = 1, n
+                    if( select( k ) )m = m + 1
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*n
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ks = 0
+           loop_20: do k = 1, n
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_20
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 rnrm = stdlib_qznrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_qznrm2( n, vl( 1, ks ), 1 )
+                 call stdlib_wgemv( 'N', n, n, cmplx( one, zero,KIND=qp), a, lda,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=qp), work, 1 )
+                 yhax = stdlib_wdotc( n, work, 1, vl( 1, ks ), 1 )
+                 call stdlib_wgemv( 'N', n, n, cmplx( one, zero,KIND=qp), b, ldb,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=qp), work, 1 )
+                 yhbx = stdlib_wdotc( n, work, 1, vl( 1, ks ), 1 )
+                 cond = stdlib_qlapy2( abs( yhax ), abs( yhbx ) )
+                 if( cond==zero ) then
+                    s( ks ) = -one
+                 else
+                    s( ks ) = cond / ( rnrm*lnrm )
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_qlapy2( abs( a( 1, 1 ) ), abs( b( 1, 1 ) ) )
+                 else
+                    ! estimate the reciprocal condition number of the k-th
+                    ! eigenvectors.
+                    ! copy the matrix (a, b) to the array work and move the
+                    ! (k,k)th pair to the (1,1) position.
+                    call stdlib_wlacpy( 'FULL', n, n, a, lda, work, n )
+                    call stdlib_wlacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                    ifst = k
+                    ilst = 1
+                    call stdlib_wtgexc( .false., .false., n, work, n, work( n*n+1 ),n, dummy, 1, &
+                              dummy1, 1, ifst, ilst, ierr )
+                    if( ierr>0 ) then
+                       ! ill-conditioned problem - swap rejected.
+                       dif( ks ) = zero
+                    else
+                       ! reordering successful, solve generalized sylvester
+                       ! equation for r and l,
+                                  ! a22 * r - l * a11 = a12
+                                  ! b22 * r - l * b11 = b12,
+                       ! and compute estimate of difl[(a11,b11), (a22, b22)].
+                       n1 = 1
+                       n2 = n - n1
+                       i = n*n + 1
+                       call stdlib_wtgsyl( 'N', idifjb, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ), dummy,1, iwork, ierr )
+                    end if
+                 end if
+              end if
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_wtgsna
+
+     !> ZTGSY2: solves the generalized Sylvester equation
+     !> A * R - L * B = scale * C               (1)
+     !> D * R - L * E = scale * F
+     !> using Level 1 and 2 BLAS, where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively. A, B, D and E are upper triangular
+     !> (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Zx = scale * b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R  + D**H * L   = scale * C           (3)
+     !> R  * B**H + L  * E**H  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> = sigma_min(Z) using reverse communication with ZLACON.
+     !> ZTGSY2 also (IJOB >= 1) contributes to the computation in ZTGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of two matrix pairs in
+     !> ZTGSYL.
+
+     pure subroutine stdlib_wtgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(inout) :: rdscal, rdsum
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(qp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, k
+           real(qp) :: scaloc
+           complex(qp) :: alpha
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           complex(qp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSY2', -info )
+              return
+           end if
+           if( notran ) then
+              ! solve (i, j) - system
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = m, m - 1, ..., 1; j = 1, 2, ..., n
+              scale = one
+              scaloc = one
+              loop_30: do j = 1, n
+                 loop_20: do i = m, 1, -1
+                    ! build 2 by 2 system
+                    z( 1, 1 ) = a( i, i )
+                    z( 2, 1 ) = d( i, i )
+                    z( 1, 2 ) = -b( j, j )
+                    z( 2, 2 ) = -e( j, j )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z * x = rhs
+                    call stdlib_wgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    if( ijob==0 ) then
+                       call stdlib_wgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),c( 1, k ), 1 )
+                                       
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),f( 1, k ), 1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                    else
+                       call stdlib_wlatdf( ijob, ldz, z, ldz, rhs, rdsum, rdscal,ipiv, jpiv )
+                                 
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( i>1 ) then
+                       alpha = -rhs( 1 )
+                       call stdlib_waxpy( i-1, alpha, a( 1, i ), 1, c( 1, j ), 1 )
+                       call stdlib_waxpy( i-1, alpha, d( 1, i ), 1, f( 1, j ), 1 )
+                    end if
+                    if( j<n ) then
+                       call stdlib_waxpy( n-j, rhs( 2 ), b( j, j+1 ), ldb,c( i, j+1 ), ldc )
+                                 
+                       call stdlib_waxpy( n-j, rhs( 2 ), e( j, j+1 ), lde,f( i, j+1 ), ldf )
+                                 
+                    end if
+                 end do loop_20
+              end do loop_30
+           else
+              ! solve transposed (i, j) - system:
+                 ! a(i, i)**h * r(i, j) + d(i, i)**h * l(j, j) = c(i, j)
+                 ! r(i, i) * b(j, j) + l(i, j) * e(j, j)   = -f(i, j)
+              ! for i = 1, 2, ..., m, j = n, n - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_80: do i = 1, m
+                 loop_70: do j = n, 1, -1
+                    ! build 2 by 2 system z**h
+                    z( 1, 1 ) = conjg( a( i, i ) )
+                    z( 2, 1 ) = -conjg( b( j, j ) )
+                    z( 1, 2 ) = conjg( d( i, i ) )
+                    z( 2, 2 ) = -conjg( e( j, j ) )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z**h * x = rhs
+                    call stdlib_wgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    call stdlib_wgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                    if( scaloc/=one ) then
+                       do k = 1, n
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), c( 1, k ),1 )
+                                    
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    do k = 1, j - 1
+                       f( i, k ) = f( i, k ) + rhs( 1 )*conjg( b( k, j ) ) +rhs( 2 )*conjg( e( k, &
+                                 j ) )
+                    end do
+                    do k = i + 1, m
+                       c( k, j ) = c( k, j ) - conjg( a( i, k ) )*rhs( 1 ) -conjg( d( i, k ) )&
+                                 *rhs( 2 )
+                    end do
+                 end do loop_70
+              end do loop_80
+           end if
+           return
+     end subroutine stdlib_wtgsy2
+
+     !> ZTGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C            (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with complex entries. A, B, D and E are upper
+     !> triangular (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1
+     !> is an output scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve Zx = scale*b, where Z
+     !> is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]        (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Here Ix is the identity matrix of size x and X**H is the conjugate
+     !> transpose of X. Kron(X, Y) is the Kronecker product between the
+     !> matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H *y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R + D**H * L = scale * C           (3)
+     !> R * B**H + L * E**H = scale * -F
+     !> This case (TRANS = 'C') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using ZLACON.
+     !> If IJOB >= 1, ZTGSYL computes a Frobenius norm-based estimate of
+     !> Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z.
+     !> This is a level-3 BLAS algorithm.
+
+     pure subroutine stdlib_wtgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(qp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(qp), intent(inout) :: c(ldc,*), f(ldf,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_zcopy by calls to stdlib_zlaset.
+        ! sven hammarling, 1/5/02.
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, pq, q
+           real(qp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine  optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'ZTGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'ZTGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_wlaset( 'F', m, n, czero, czero, c, ldc )
+                 call stdlib_wlaset( 'F', m, n, czero, czero, f, ldf )
+              else if( ijob>=1 .and. notran ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              ! use unblocked level 2 solver
+              loop_30: do iround = 1, isolve
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 pq = m*n
+                 call stdlib_wtgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_wlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_wlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_wlaset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_wlaset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_wlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_wlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j) - subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+                 pq = 0
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       call stdlib_wtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + mb*nb
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),c( 1, k ), 1 )
+                                       
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),f( 1, k ), 1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_wscal( is-1, cmplx( scaloc, zero,KIND=qp),c( 1, k ), 1 )
+                                       
+                             call stdlib_wscal( is-1, cmplx( scaloc, zero,KIND=qp),f( 1, k ), 1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_wscal( m-ie, cmplx( scaloc, zero,KIND=qp),c( ie+1, k ), &
+                                       1 )
+                             call stdlib_wscal( m-ie, cmplx( scaloc, zero,KIND=qp),f( ie+1, k ), &
+                                       1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),c( 1, k ), 1 )
+                                       
+                             call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp),f( 1, k ), 1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i,j) and l(i,j) into remaining equation.
+                       if( i>1 ) then
+                          call stdlib_wgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=qp), a(&
+                           1, is ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=qp),c( 1, js ), &
+                                     ldc )
+                          call stdlib_wgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=qp), d(&
+                           1, is ), ldd,c( is, js ), ldc, cmplx( one, zero,KIND=qp),f( 1, js ), &
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_wgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=qp), f( &
+                          is, js ), ldf,b( js, je+1 ), ldb,cmplx( one, zero,KIND=qp), c( is, je+1 &
+                                    ),ldc )
+                          call stdlib_wgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=qp), f( &
+                          is, js ), ldf,e( js, je+1 ), lde,cmplx( one, zero,KIND=qp), f( is, je+1 &
+                                    ),ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=qp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_wlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_wlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_wlaset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_wlaset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_wlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_wlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                  ! a(i, i)**h * r(i, j) + d(i, i)**h * l(i, j) = c(i, j)
+                  ! r(i, j) * b(j, j)  + l(i, j) * e(j, j) = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_wtgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), c( 1, k ),1 )
+                                    
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), f( 1, k ),1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_wscal( is-1, cmplx( scaloc, zero,KIND=qp),c( 1, k ), 1 )
+                                    
+                          call stdlib_wscal( is-1, cmplx( scaloc, zero,KIND=qp),f( 1, k ), 1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_wscal( m-ie, cmplx( scaloc, zero,KIND=qp),c( ie+1, k ), 1 )
+                                    
+                          call stdlib_wscal( m-ie, cmplx( scaloc, zero,KIND=qp),f( ie+1, k ), 1 )
+                                    
+                       end do
+                       do k = je + 1, n
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), c( 1, k ),1 )
+                                    
+                          call stdlib_wscal( m, cmplx( scaloc, zero,KIND=qp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i,j) and l(i,j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_wgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=qp), c( is,&
+                        js ), ldc,b( 1, js ), ldb, cmplx( one, zero,KIND=qp),f( is, 1 ), ldf )
+                                  
+                       call stdlib_wgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=qp), f( is,&
+                        js ), ldf,e( 1, js ), lde, cmplx( one, zero,KIND=qp),f( is, 1 ), ldf )
+                                  
+                    end if
+                    if( i<p ) then
+                       call stdlib_wgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=qp), a( &
+                       is, ie+1 ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=qp),c( ie+1, js ), &
+                                 ldc )
+                       call stdlib_wgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=qp), d( &
+                       is, ie+1 ), ldd,f( is, js ), ldf, cmplx( one, zero,KIND=qp),c( ie+1, js ), &
+                                 ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_wtgsyl
+
+     !> ZTPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_wtpcon( norm, uplo, diag, n, ap, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=qp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_wlantp( norm, uplo, diag, n, ap, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_wlatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_wlatps( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, ap, work, &
+                              scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iwamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_wdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_wtpcon
+
+     !> ZTPLQT: computes a blocked LQ factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_wtplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_wtplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_wtprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_wtplqt
+
+     !> ZTPLQT2: computes a LQ a factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_wtplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_wlarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              t(1,i)=conjg(t(1,i))
+              if( i<m ) then
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_wgemv( 'N', m-i, p, cone, b( i+1, 1 ), ldb,b( i, 1 ), ldb, cone, t( &
+                           m, 1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_wgerc( m-i, p, (alpha),  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n)**h * (alpha * c(i,i:n))
+              alpha = -(t( 1, i ))
+              do j = 1, i-1
+                 t( i, j ) = czero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              do j = 1, n-l+p
+                b(i,j)=conjg(b(i,j))
+              end do
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = (alpha*b( i, n-l+j ))
+              end do
+              call stdlib_wtrmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_wgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, czero, &
+                        t( i,mp ), ldt )
+              ! b1
+              call stdlib_wgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,cone, t( i, 1 ), &
+                        ldt )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              call stdlib_wtrmv( 'L', 'C', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              do j = 1, n-l+p
+                 b(i,j)=conjg(b(i,j))
+              end do
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = czero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=(t(j,i))
+                 t(j,i)=czero
+              end do
+           end do
+     end subroutine stdlib_wtplqt2
+
+     !> ZTPMLQT: applies a complex unitary matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_wtpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           complex(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_wtprfb( 'L', 'C', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_wtprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'R', 'C', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_wtpmlqt
+
+     !> ZTPMQRT: applies a complex orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_wtpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           complex(qp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'L', 'C', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_wtprfb( 'R', 'C', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_wtpmqrt
+
+     !> ZTPQRT: computes a blocked QR factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_wtpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_wtpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**h to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_wtprfb( 'L', 'C', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_wtpqrt
+
+     !> ZTPQRT2: computes a QR factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_wtpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(qp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_wlarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)**h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = conjg(a( i, i+j ))
+                 end do
+                 call stdlib_wgemv( 'C', p, n-i, cone, b( 1, i+1 ), ldb,b( 1, i ), 1, cone, t( 1, &
+                           n ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)**h
+                 alpha = -conjg(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*conjg(t( j, n ))
+                 end do
+                 call stdlib_wgerc( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)**h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = czero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_wtrmv( 'U', 'C', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_wgemv( 'C', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, czero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_wgemv( 'C', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,cone, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_wtrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = czero
+           end do
+     end subroutine stdlib_wtpqrt2
+
+     !> ZTPRFB: applies a complex "triangular-pentagonal" block reflector H or its
+     !> conjugate transpose H**H to a complex matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_wtprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(qp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'U', 'C', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_wgemm( 'C', 'N', l, n, m-l, cone, v, ldv, b, ldb,cone, work, ldwork )
+                        
+              call stdlib_wgemm( 'C', 'N', k-l, n, m, cone, v( 1, kp ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_wgemm( 'N', 'N', l, n, k-l, -cone, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ),  ldb )
+              call stdlib_wtrmm( 'L', 'U', 'N', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_wgemm( 'N', 'N', m, l, n-l, cone, b, ldb,v, ldv, cone, work, ldwork )
+                        
+              call stdlib_wgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v( 1, kp ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_wgemm( 'N', 'C', m, l, k-l, -cone, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_wtrmm( 'R', 'U', 'C', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_wgemm( 'C', 'N', l, n, m-l, cone, v( mp, kp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_wgemm( 'C', 'N', k-l, n, m, cone, v, ldv,b, ldb, czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'L', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'N', m-l, n, k, -cone, v( mp, 1 ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_wgemm( 'N', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b,  ldb )
+                        
+              call stdlib_wtrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h**h  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_wgemm( 'N', 'N', m, l, n-l, cone, b( 1, np ), ldb,v( np, kp ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_wgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v, ldv, czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v( np, 1 ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_wgemm( 'N', 'C', m, l, k-l, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_wtrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_wgemm( 'N', 'N', l, n, m-l, cone, v, ldv,b, ldb,cone, work, ldwork )
+                        
+              call stdlib_wgemm( 'N', 'N', k-l, n, m, cone, v( kp, 1 ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'C', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_wgemm( 'C', 'N', l, n, k-l, -cone, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ), ldb )
+              call stdlib_wtrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_wgemm( 'N', 'C', m, l, n-l, cone, b, ldb, v, ldv,cone, work, ldwork )
+                        
+              call stdlib_wgemm( 'N', 'C', m, k-l, n, cone, b, ldb,v( kp, 1 ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_wgemm( 'N', 'N', m, l, k-l, -cone, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_wtrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'U', 'N', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_wgemm( 'N', 'N', l, n, m-l, cone, v( kp, mp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_wgemm( 'N', 'N', k-l, n, m, cone, v, ldv, b, ldb,czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'L', 'L ', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'C', 'N', m-l, n, k, -cone, v( 1, mp ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_wgemm( 'C', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b, ldb )
+                        
+              call stdlib_wtrmm( 'L', 'U', 'C', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'U', 'C', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_wgemm( 'N', 'C', m, l, n-l, cone, b( 1, np ), ldb,v( kp, np ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_wgemm( 'N', 'C', m, k-l, n, cone, b, ldb, v, ldv,czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_wtrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_wgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v( 1, np ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_wgemm( 'N', 'N', m, l, k-l , -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_wtrmm( 'R', 'U', 'N', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_wtprfb
+
+     !> ZTPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTPTRS or some other
+     !> means before entering this routine.  ZTPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_wtprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_wtpmv( uplo, trans, diag, n, ap, work, 1 )
+              call stdlib_waxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wtpsv( uplo, transt, diag, n, ap, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wtpsv( uplo, transn, diag, n, ap, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_wtprfs
+
+     !> ZTPTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_wtptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           complex(qp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==czero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==czero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = cone / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_wtpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_wscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = cone / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_wtpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_wscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_wtptri
+
+     !> ZTPTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_wtptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==czero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==czero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve  a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_wtpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_wtptrs
+
+     !> ZTPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_wtpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(0:*)
+           complex(qp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = conjg( ap( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtpttf
+
+     !> ZTPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_wtpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(qp), intent(out) :: a(lda,*)
+           complex(qp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_wtpttr
+
+     !> ZTRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_wtrcon( norm, uplo, diag, n, a, lda, rcond, work,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(qp), intent(out) :: rcond
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(qp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_qlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=qp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_wlantr( norm, uplo, diag, n, n, a, lda, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_wlatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_wlatrs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, a, lda, work,&
+                               scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_iwamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_wdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_wtrcon
+
+     !> ZTREVC: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by ZHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+
+     pure subroutine stdlib_wtrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           complex(qp), parameter :: cmzero = (0.0e+0_qp,0.0e+0_qp)
+           complex(qp), parameter :: cmone = (1.0e+0_qp,0.0e+0_qp)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki
+           real(qp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_qlabad( unfl, ovfl )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i+n ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_qzasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! compute right eigenvectors.
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( 1 ) = cmone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k ) = -t( k, ki )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(1:ki-1,1:ki-1) - t(ki,ki))*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_wlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 ), scale, rwork,info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    call stdlib_wcopy( ki, work( 1 ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_iwamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_wdscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki>1 )call stdlib_wgemv( 'N', n, ki-1, cmone, vr, ldvr, work( 1 ),1, &
+                              cmplx( scale,KIND=qp), vr( 1, ki ), 1 )
+                    ii = stdlib_iwamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_wdscal( n, remax, vr( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( n ) = cmone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(ki+1:n,ki+1:n) - t(ki,ki))**h * x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_wlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 ), scale, rwork, info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    call stdlib_wcopy( n-ki+1, work( ki ), 1, vl( ki, is ), 1 )
+                    ii = stdlib_iwamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_wdscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki<n )call stdlib_wgemv( 'N', n, n-ki, cmone, vl( 1, ki+1 ), ldvl,work( &
+                              ki+1 ), 1, cmplx( scale,KIND=qp),vl( 1, ki ), 1 )
+                    ii = stdlib_iwamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_wdscal( n, remax, vl( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_wtrevc
+
+     !> ZTREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by ZHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_wtrevc3( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, lwork, rwork, lrwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, lrwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki, iv, maxwrk, nb
+           real(qp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(qp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZTREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           rwork(1) = n
+           lquery = ( lwork==-1 .or. lrwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -14
+           else if ( lrwork<max( 1, n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_wlaset( 'F', n, 1+2*nb, czero, czero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_qlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_qlabad( unfl, ovfl )
+           ulp = stdlib_qlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_qzasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=nb=1;
+              ! blocked     version starts with iv=nb, goes down to 1.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = nb
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex right eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k + iv*n ) = -t( k, ki )
+                 end do
+                 ! solve upper triangular system:
+                 ! [ t(1:ki-1,1:ki-1) - t(ki,ki) ]*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_wlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 + iv*n ), scale,rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vr and normalize.
+                    call stdlib_wcopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_iwamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_wdscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki>1 )call stdlib_wgemv( 'N', n, ki-1, cone, vr, ldvr,work( 1 + iv*n ), 1,&
+                               cmplx( scale,KIND=qp),vr( 1, ki ), 1 )
+                    ii = stdlib_iwamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_wdscal( n, remax, vr( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out below vector
+                    do k = ki + 1, n
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==1) .or. (ki==1) ) then
+                       call stdlib_wgemm( 'N', 'N', n, nb-iv+1, ki+nb-iv, cone,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,czero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          ii = stdlib_iwamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_wdscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_wlacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = 1
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex left eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k + iv*n ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve conjugate-transposed triangular system:
+                 ! [ t(ki+1:n,ki+1:n) - t(ki,ki) ]**h * x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_wlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 + iv*n ), scale, rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vl and normalize.
+                    call stdlib_wcopy( n-ki+1, work( ki + iv*n ), 1, vl(ki,is), 1 )
+                    ii = stdlib_iwamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_wdscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki<n )call stdlib_wgemv( 'N', n, n-ki, cone, vl( 1, ki+1 ), ldvl,work( ki+&
+                              1 + iv*n ), 1, cmplx( scale,KIND=qp),vl( 1, ki ), 1 )
+                    ii = stdlib_iwamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_wdscal( n, remax, vl( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out above vector
+                    ! could go from ki-nv+1 to ki-1
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==nb) .or. (ki==n) ) then
+                       call stdlib_wgemm( 'N', 'N', n, iv, n-ki+iv, cone,vl( 1, ki-iv+1 ), ldvl,&
+                                 work( ki-iv+1 + (1)*n ), n,czero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          ii = stdlib_iwamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_wdscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_wlacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_wtrevc3
+
+     !> ZTREXC: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+     !> is moved to row ILST.
+     !> The Schur form T is reordered by a unitary similarity transformation
+     !> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+     !> postmultplying it with Z.
+
+     pure subroutine stdlib_wtrexc( compq, n, t, ldt, q, ldq, ifst, ilst, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(in) :: ifst, ilst, ldq, ldt, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: q(ldq,*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: k, m1, m2, m3
+           real(qp) :: cs
+           complex(qp) :: sn, t11, t22, temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 .or. ifst==ilst )return
+           if( ifst<ilst ) then
+              ! move the ifst-th diagonal element forward down the diagonal.
+              m1 = 0
+              m2 = -1
+              m3 = 1
+           else
+              ! move the ifst-th diagonal element backward up the diagonal.
+              m1 = -1
+              m2 = 0
+              m3 = -1
+           end if
+           do k = ifst + m1, ilst + m2, m3
+              ! interchange the k-th and (k+1)-th diagonal elements.
+              t11 = t( k, k )
+              t22 = t( k+1, k+1 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_wlartg( t( k, k+1 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( k+2<=n )call stdlib_wrot( n-k-1, t( k, k+2 ), ldt, t( k+1, k+2 ), ldt, cs,sn )
+                        
+              call stdlib_wrot( k-1, t( 1, k ), 1, t( 1, k+1 ), 1, cs,conjg( sn ) )
+              t( k, k ) = t22
+              t( k+1, k+1 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_wrot( n, q( 1, k ), 1, q( 1, k+1 ), 1, cs,conjg( sn ) )
+              end if
+           end do
+           return
+     end subroutine stdlib_wtrexc
+
+     !> ZTRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTRTRS or some other
+     !> means before entering this routine.  ZTRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_wtrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(qp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(qp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(qp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=qp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_qlamch( 'EPSILON' )
+           safmin = stdlib_qlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_wcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_wtrmv( uplo, trans, diag, n, a, lda, work, 1 )
+              call stdlib_waxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_wlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_wlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_wtrsv( uplo, transt, diag, n, a, lda, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_wtrsv( uplo, transn, diag, n, a, lda, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_wtrrfs
+
+     !> ZTRSEN: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that a selected cluster of eigenvalues appears in
+     !> the leading positions on the diagonal of the upper triangular matrix
+     !> T, and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+
+     subroutine stdlib_wtrsen( job, compq, select, n, t, ldt, q, ldq, w, m, s,sep, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, lwork, n
+           real(qp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           complex(qp), intent(inout) :: q(ldq,*), t(ldt,*)
+           complex(qp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, ks, lwmin, n1, n2, nn
+           real(qp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(qp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           ! set m to the number of selected eigenvalues.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           n1 = m
+           n2 = n - m
+           nn = n1*n2
+           info = 0
+           lquery = ( lwork==-1 )
+           if( wantsp ) then
+              lwmin = max( 1, 2*nn )
+           else if( stdlib_lsame( job, 'N' ) ) then
+              lwmin = 1
+           else if( stdlib_lsame( job, 'E' ) ) then
+              lwmin = max( 1, nn )
+           end if
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -14
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_wlange( '1', n, n, t, ldt, rwork )
+              go to 40
+           end if
+           ! collect the selected eigenvalues at the top left corner of t.
+           ks = 0
+           do k = 1, n
+              if( select( k ) ) then
+                 ks = ks + 1
+                 ! swap the k-th eigenvalue to position ks.
+                 if( k/=ks )call stdlib_wtrexc( compq, n, t, ldt, q, ldq, k, ks, ierr )
+              end if
+           end do
+           if( wants ) then
+              ! solve the sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_wlacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_wtrsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_wlange( 'F', n1, n2, work, n1, rwork )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_wlacn2( nn, work( nn+1 ), work, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve t11*r - r*t22 = scale*x.
+                    call stdlib_wtrsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**h*r - r*t22**h = scale*x.
+                    call stdlib_wtrsyl( 'C', 'C', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! copy reordered eigenvalues to w.
+           do k = 1, n
+              w( k ) = t( k, k )
+           end do
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_wtrsen
+
+     !> ZTRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a complex upper triangular
+     !> matrix T (or of any matrix Q*T*Q**H with Q unitary).
+
+     pure subroutine stdlib_wtrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm,&
+                m, work, ldwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(qp), intent(out) :: rwork(*), s(*), sep(*)
+           complex(qp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(qp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: somcon, wantbh, wants, wantsp
+           character :: normin
+           integer(ilp) :: i, ierr, ix, j, k, kase, ks
+           real(qp) :: bignum, eps, est, lnrm, rnrm, scale, smlnum, xnorm
+           complex(qp) :: cdum, prod
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           complex(qp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(qp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=qp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of eigenpairs for which condition numbers are
+           ! to be computed.
+           if( somcon ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -13
+           else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           ks = 1
+           loop_50: do k = 1, n
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_50
+              end if
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 prod = stdlib_wdotc( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                 rnrm = stdlib_qznrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_qznrm2( n, vl( 1, ks ), 1 )
+                 s( ks ) = abs( prod ) / ( rnrm*lnrm )
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the k-th
+                 ! diagonal element to the (1,1) position.
+                 call stdlib_wlacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 call stdlib_wtrexc( 'NO Q', n, work, ldwork, dummy, 1, k, 1, ierr )
+                 ! form  c = t22 - lambda*i in work(2:n,2:n).
+                 do i = 2, n
+                    work( i, i ) = work( i, i ) - work( 1, 1 )
+                 end do
+                 ! estimate a lower bound for the 1-norm of inv(c**h). the 1st
+                 ! and (n+1)th columns of work are used to store work vectors.
+                 sep( ks ) = zero
+                 est = zero
+                 kase = 0
+                 normin = 'N'
+                 30 continue
+                 call stdlib_wlacn2( n-1, work( 1, n+1 ), work, est, kase, isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve c**h*x = scale*b
+                       call stdlib_wlatrs( 'UPPER', 'CONJUGATE TRANSPOSE','NONUNIT', normin, n-1, &
+                                 work( 2, 2 ),ldwork, work, scale, rwork, ierr )
+                    else
+                       ! solve c*x = scale*b
+                       call stdlib_wlatrs( 'UPPER', 'NO TRANSPOSE', 'NONUNIT',normin, n-1, work( &
+                                 2, 2 ), ldwork, work,scale, rwork, ierr )
+                    end if
+                    normin = 'Y'
+                    if( scale/=one ) then
+                       ! multiply by 1/scale if doing so will not cause
+                       ! overflow.
+                       ix = stdlib_iwamax( n-1, work, 1 )
+                       xnorm = cabs1( work( ix, 1 ) )
+                       if( scale<xnorm*smlnum .or. scale==zero )go to 40
+                       call stdlib_wdrscl( n, scale, work, 1 )
+                    end if
+                    go to 30
+                 end if
+                 sep( ks ) = one / max( est, smlnum )
+              end if
+              40 continue
+              ks = ks + 1
+           end do loop_50
+           return
+     end subroutine stdlib_wtrsna
+
+     !> ZTRSYL: solves the complex Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**H, and A and B are both upper triangular. A is
+     !> M-by-M and B is N-by-N; the right hand side C and the solution X are
+     !> M-by-N; and scale is an output scale factor, set <= 1 to avoid
+     !> overflow in X.
+
+     subroutine stdlib_wtrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(qp), intent(out) :: scale
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: j, k, l
+           real(qp) :: bignum, da11, db, eps, scaloc, sgn, smin, smlnum
+           complex(qp) :: a11, suml, sumr, vec, x11
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_qlamch( 'P' )
+           smlnum = stdlib_qlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_qlabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=qp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_wlange( 'M', m, m, a, lda, dum ),eps*stdlib_wlange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                        l-1
+                ! r(k,l) = sum [a(k,i)*x(i,l)] +isgn*sum [x(k,j)*b(j,l)].
+                        ! i=k+1                      j=1
+              loop_30: do l = 1, n
+                 do k = m, 1, -1
+                    suml = stdlib_wdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_wdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=qp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=qp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_wladiv( vec*cmplx( scaloc,KIND=qp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_wdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_30
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**h *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1                           l-1
+                ! r(k,l) = sum [a**h(i,k)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                           j=1
+              loop_60: do l = 1, n
+                 do k = 1, m
+                    suml = stdlib_wdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_wdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = conjg( a( k, k ) ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=qp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=qp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_wladiv( vec*cmplx( scaloc,KIND=qp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_wdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_60
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**h*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-right corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! k-1
+                 ! r(k,l) = sum [a**h(i,k)*x(i,l)] +
+                          ! i=1
+                                 ! n
+                           ! isgn*sum [x(k,j)*b**h(l,j)].
+                                ! j=l+1
+              loop_90: do l = n, 1, -1
+                 do k = 1, m
+                    suml = stdlib_wdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_wdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = conjg( a( k, k )+sgn*b( l, l ) )
+                    da11 = abs( real( a11,KIND=qp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=qp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_wladiv( vec*cmplx( scaloc,KIND=qp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_wdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_90
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                 ! a(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                          n
+                ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b**h(l,j)]
+                        ! i=k+1                      j=l+1
+              loop_120: do l = n, 1, -1
+                 do k = m, 1, -1
+                    suml = stdlib_wdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_wdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*conjg( b( l, l ) )
+                    da11 = abs( real( a11,KIND=qp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=qp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_wladiv( vec*cmplx( scaloc,KIND=qp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_wdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_120
+           end if
+           return
+     end subroutine stdlib_wtrsyl
+
+     !> ZTRTI2: computes the inverse of a complex upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wtrti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           complex(qp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_wtrmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_wscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_wtrmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_wscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wtrti2
+
+     !> ZTRTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_wtrtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZTRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_wtrti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_wtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, cone, a, &
+                              lda, a( 1, j ), lda )
+                    call stdlib_wtrsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -cone, a( &
+                              j, j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_wtrti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_wtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, &
+                                 cone, a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_wtrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 cone, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_wtrti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_wtrtri
+
+     !> ZTRTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_wtrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           call stdlib_wtrsm( 'LEFT', uplo, trans, diag, n, nrhs, cone, a, lda, b,ldb )
+           return
+     end subroutine stdlib_wtrtrs
+
+     !> ZTRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_wtrttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(0:lda-1,0:*)
+           complex(qp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    arf( 0 ) = a( 0, 0 )
+                 else
+                    arf( 0 ) = conjg( a( 0, 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = conjg( a( n2+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = conjg( a( j-n1, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda=n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = conjg( a( n2+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = conjg( a( k+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = conjg( a( j-k, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = conjg( a( k+1+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wtrttf
+
+     !> ZTRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_wtrttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*)
+           complex(qp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_wtrttp
+
+     !> ZTZRZF: reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix A
+     !> to upper triangular form by means of unitary transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N unitary matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_wtzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_wlatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_wlarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_wlarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_wlatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wtzrzf
+
+     !> ZUNBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned unitary matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See ZUNCSD
+     !> for details.)
+     !> The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_wunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           complex(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(qp), parameter :: realone = 1.0_qp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(qp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,min,sin
+           intrinsic :: cmplx,conjg
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_wscal( p-i+1, cmplx( z1, 0.0_qp,KIND=qp), x11(i,i), 1 )
+                 else
+                    call stdlib_wscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_qp,KIND=qp),x11(i,i), &
+                              1 )
+                    call stdlib_waxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_qp,KIND=qp), x12(&
+                              i,i-1), 1, x11(i,i), 1 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_wscal( m-p-i+1, cmplx( z2, 0.0_qp,KIND=qp), x21(i,i), 1 )
+                 else
+                    call stdlib_wscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_qp,KIND=qp),x21(i,i),&
+                               1 )
+                    call stdlib_waxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_qp,KIND=qp), &
+                              x22(i,i-1), 1, x21(i,i), 1 )
+                 end if
+                 theta(i) = atan2( stdlib_qznrm2( m-p-i+1, x21(i,i), 1 ),stdlib_qznrm2( p-i+1, &
+                           x11(i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if ( p == i ) then
+                    call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = cone
+                 if ( m-p > i ) then
+                    call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i,i), 1,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 if ( q > i ) then
+                    call stdlib_wlarf( 'L', p-i+1, q-i, x11(i,i), 1,conjg(taup1(i)), x11(i,i+1), &
+                              ldx11, work )
+                    call stdlib_wlarf( 'L', m-p-i+1, q-i, x21(i,i), 1,conjg(taup2(i)), x21(i,i+1),&
+                               ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_wlarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1,conjg(taup1(i)), x12(i,i),&
+                               ldx12, work )
+                    call stdlib_wlarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1,conjg(taup2(i)), x22(i,&
+                              i), ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_wscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_qp,KIND=qp),x11(i,i+&
+                              1), ldx11 )
+                    call stdlib_waxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_qp,KIND=qp),x21(i,i+1)&
+                              , ldx21, x11(i,i+1), ldx11 )
+                 end if
+                 call stdlib_wscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_qp,KIND=qp),x12(i,i)&
+                           , ldx12 )
+                 call stdlib_waxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_qp,KIND=qp),x22(i,i),&
+                            ldx22, x12(i,i), ldx12 )
+                 if( i < q )phi(i) = atan2( stdlib_qznrm2( q-i, x11(i,i+1), ldx11 ),stdlib_qznrm2(&
+                            m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    call stdlib_wlacgv( q-i, x11(i,i+1), ldx11 )
+                    if ( i == q-1 ) then
+                       call stdlib_wlarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_wlarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = cone
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_wlacgv( m-q-i+1, x12(i,i), ldx12 )
+                    if ( m-q == i ) then
+                       call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_wlarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_wlarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_wlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_wlarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q )call stdlib_wlacgv( q-i, x11(i,i+1), ldx11 )
+                 call stdlib_wlacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_wscal( m-q-i+1, cmplx( -z1*z4, 0.0_qp,KIND=qp), x12(i,i),ldx12 )
+                           
+                 call stdlib_wlacgv( m-q-i+1, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_wlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_wlarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+                 call stdlib_wlacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_wscal( m-p-q-i+1, cmplx( z2*z4, 0.0_qp,KIND=qp),x22(q+i,p+i), ldx22 )
+                           
+                 call stdlib_wlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+                 call stdlib_wlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
+                           
+                 x22(q+i,p+i) = cone
+                 call stdlib_wlarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
+                           q+i+1,p+i), ldx22, work )
+                 call stdlib_wlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_wscal( p-i+1, cmplx( z1, 0.0_qp,KIND=qp), x11(i,i),ldx11 )
+                 else
+                    call stdlib_wscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_qp,KIND=qp),x11(i,i), &
+                              ldx11 )
+                    call stdlib_waxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_qp,KIND=qp), x12(&
+                              i-1,i), ldx12, x11(i,i), ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_wscal( m-p-i+1, cmplx( z2, 0.0_qp,KIND=qp), x21(i,i),ldx21 )
+                              
+                 else
+                    call stdlib_wscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_qp,KIND=qp),x21(i,i),&
+                               ldx21 )
+                    call stdlib_waxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_qp,KIND=qp), &
+                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_qznrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_qznrm2( p-i+1,&
+                            x11(i,i), ldx11 ) )
+                 call stdlib_wlacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_wlacgv( m-p-i+1, x21(i,i), ldx21 )
+                 call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = cone
+                 if ( i == m-p ) then
+                    call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 call stdlib_wlarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
+                           work )
+                 call stdlib_wlarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
+                           ldx12, work )
+                 call stdlib_wlarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                           ldx21, work )
+                 call stdlib_wlarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                           ldx22, work )
+                 call stdlib_wlacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_wlacgv( m-p-i+1, x21(i,i), ldx21 )
+                 if( i < q ) then
+                    call stdlib_wscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_qp,KIND=qp),x11(i+1,&
+                              i), 1 )
+                    call stdlib_waxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_qp,KIND=qp),x21(i+1,i)&
+                              , 1, x11(i+1,i), 1 )
+                 end if
+                 call stdlib_wscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_qp,KIND=qp),x12(i,i)&
+                           , 1 )
+                 call stdlib_waxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_qp,KIND=qp),x22(i,i),&
+                            1, x12(i,i), 1 )
+                 if( i < q )phi(i) = atan2( stdlib_qznrm2( q-i, x11(i+1,i), 1 ),stdlib_qznrm2( m-&
+                           q-i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    call stdlib_wlarfgp( q-i, x11(i+1,i), x11(i+2,i), 1, tauq1(i) )
+                    x11(i+1,i) = cone
+                 end if
+                 call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_wlarf( 'L', q-i, p-i, x11(i+1,i), 1,conjg(tauq1(i)), x11(i+1,i+1),&
+                               ldx11, work )
+                    call stdlib_wlarf( 'L', q-i, m-p-i, x11(i+1,i), 1,conjg(tauq1(i)), x21(i+1,i+&
+                              1), ldx21, work )
+                 end if
+                 call stdlib_wlarf( 'L', m-q-i+1, p-i, x12(i,i), 1,conjg(tauq2(i)), x12(i,i+1), &
+                           ldx12, work )
+                 if ( m-p > i ) then
+                    call stdlib_wlarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1,conjg(tauq2(i)), x22(i,i+&
+                              1), ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_wscal( m-q-i+1, cmplx( -z1*z4, 0.0_qp,KIND=qp), x12(i,i), 1 )
+                 call stdlib_wlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_wlarf( 'L', m-q-i+1, p-i, x12(i,i), 1,conjg(tauq2(i)), x12(i,i+1),&
+                               ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_wlarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1,conjg(tauq2(&
+                           i)), x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_wscal( m-p-q-i+1, cmplx( z2*z4, 0.0_qp,KIND=qp),x22(p+i,q+i), 1 )
+                           
+                 call stdlib_wlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                           
+                 x22(p+i,q+i) = cone
+                 if ( m-p-q /= i ) then
+                    call stdlib_wlarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,conjg(tauq2(p+i)),&
+                               x22(p+i,q+i+1), ldx22,work )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_wunbdb
+
+     !> ZUNBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines ZUNBDB2, ZUNBDB3, and ZUNBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_wunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           complex(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( real( x21(i,i),KIND=qp), real( x11(i,i),KIND=qp) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = cone
+              x21(i,i) = cone
+              call stdlib_wlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+              call stdlib_wlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+              if( i < q ) then
+                 call stdlib_wdrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
+                 call stdlib_wlacgv( q-i, x21(i,i+1), ldx21 )
+                 call stdlib_wlarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = real( x21(i,i+1),KIND=qp)
+                 x21(i,i+1) = cone
+                 call stdlib_wlarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_wlarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 call stdlib_wlacgv( q-i, x21(i,i+1), ldx21 )
+                 c = sqrt( stdlib_qznrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_qznrm2( m-p-i, x21(i+&
+                           1,i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_wunbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_wunbdb1
+
+     !> ZUNBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines ZUNBDB1, ZUNBDB3, and ZUNBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_wunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           complex(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_wdrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
+              end if
+              call stdlib_wlacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_wlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = real( x11(i,i),KIND=qp)
+              x11(i,i) = cone
+              call stdlib_wlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_wlarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_wlacgv( q-i+1, x11(i,i), ldx11 )
+              s = sqrt( stdlib_qznrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_qznrm2( m-p-i+1, x21(i,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_wunbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_wscal( p-i, cnegone, x11(i+1,i), 1 )
+              call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_wlarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( real( x11(i+1,i),KIND=qp), real( x21(i,i),KIND=qp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = cone
+                 call stdlib_wlarf( 'L', p-i, q-i, x11(i+1,i), 1, conjg(taup1(i)),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+              end if
+              x21(i,i) = cone
+              call stdlib_wlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_wlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = cone
+              call stdlib_wlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_wunbdb2
+
+     !> ZUNBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_wunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           complex(qp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_wdrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
+              end if
+              call stdlib_wlacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_wlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = real( x21(i,i),KIND=qp)
+              x21(i,i) = cone
+              call stdlib_wlarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_wlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_wlacgv( q-i+1, x21(i,i), ldx21 )
+              c = sqrt( stdlib_qznrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_qznrm2( m-p-i, x21(i+1,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_wunbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_wlarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( real( x21(i+1,i),KIND=qp), real( x11(i,i),KIND=qp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = cone
+                 call stdlib_wlarf( 'L', m-p-i, q-i, x21(i+1,i), 1,conjg(taup2(i)), x21(i+1,i+1), &
+                           ldx21,work(ilarf) )
+              end if
+              x11(i,i) = cone
+              call stdlib_wlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_wlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = cone
+              call stdlib_wlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_wunbdb3
+
+     !> ZUNBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_wunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(qp), intent(out) :: phi(*), theta(*)
+           complex(qp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(qp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = czero
+                 end do
+                 call stdlib_wunbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_wscal( p, cnegone, phantom(1), 1 )
+                 call stdlib_wlarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_wlarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( real( phantom(1),KIND=qp), real( phantom(p+1),KIND=qp) )
+                           
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = cone
+                 phantom(p+1) = cone
+                 call stdlib_wlarf( 'L', p, q, phantom(1), 1, conjg(taup1(1)), x11,ldx11, work(&
+                           ilarf) )
+                 call stdlib_wlarf( 'L', m-p, q, phantom(p+1), 1, conjg(taup2(1)),x21, ldx21, &
+                           work(ilarf) )
+              else
+                 call stdlib_wunbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_wscal( p-i+1, cnegone, x11(i,i-1), 1 )
+                 call stdlib_wlarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_wlarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( real( x11(i,i-1),KIND=qp), real( x21(i,i-1),KIND=qp) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = cone
+                 x21(i,i-1) = cone
+                 call stdlib_wlarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1,conjg(taup1(i)), x11(i,i), &
+                           ldx11, work(ilarf) )
+                 call stdlib_wlarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1,conjg(taup2(i)), x21(i,i), &
+                           ldx21, work(ilarf) )
+              end if
+              call stdlib_wdrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_wlacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_wlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = real( x21(i,i),KIND=qp)
+              x21(i,i) = cone
+              call stdlib_wlarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_wlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_wlacgv( q-i+1, x21(i,i), ldx21 )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_qznrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_qznrm2( m-p-i, x21(i+1,&
+                           i), 1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_wlacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_wlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = cone
+              call stdlib_wlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_wlarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_wlacgv( q-i+1, x11(i,i), ldx11 )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_wlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+              call stdlib_wlarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = cone
+              call stdlib_wlarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+              call stdlib_wlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+           end do
+           return
+     end subroutine stdlib_wunbdb4
+
+     !> ZUNBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_wunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_wunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_qznrm2(m1,x1,incx1) /= czero.or. stdlib_qznrm2(m2,x2,incx2) /= czero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              x1(i) = cone
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              call stdlib_wunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_qznrm2(m1,x1,incx1) /= czero.or. stdlib_qznrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              x2(i) = cone
+              call stdlib_wunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_qznrm2(m1,x1,incx1) /= czero.or. stdlib_qznrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_wunbdb5
+
+     !> ZUNBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_wunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(qp), intent(out) :: work(*)
+           complex(qp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(qp), parameter :: alphasq = 0.01_qp
+           real(qp), parameter :: realone = 1.0_qp
+           real(qp), parameter :: realzero = 0.0_qp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(qp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_wlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_wlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_wgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_wgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_wgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_wgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_wlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_wlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is czero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == czero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = czero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_wgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_wgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_wgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_wgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_wlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_wlassq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to czero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = czero
+              end do
+              do i = 1, m2
+                 x2(i) = czero
+              end do
+           end if
+           return
+     end subroutine stdlib_wunbdb6
+
+     !> ZUNCSD: computes the CS decomposition of an M-by-M partitioned
+     !> unitary matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_wuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, rwork, lrwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lrwork, lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(qp), intent(out) :: theta(*)
+           real(qp), intent(out) :: rwork(*)
+           complex(qp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           complex(qp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt, p1, q1
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           integer(ilp) :: lrworkmin, lrworkopt
+           logical(lk) :: lrquery
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           lrquery = lrwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_wuncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_wuncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ! real workspace
+              iphi = 2
+              ib11d = iphi + max( 1, q - 1 )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_wbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
+                        theta, theta, rwork, -1, childinfo )
+              lbbcsdworkopt = int( rwork(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lrworkopt = ibbcsd + lbbcsdworkopt - 1
+              lrworkmin = ibbcsd + lbbcsdworkmin - 1
+              rwork(1) = lrworkopt
+              ! complex workspace
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_wungqr( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_wunglq( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_wunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1, childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -22
+              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -24
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lrwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNCSD', -info )
+              return
+           else if( lquery .or. lrquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_wunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_wlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_wungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_wlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_wungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_wlacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_wunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_wlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 if( m-p > q) then
+                    call stdlib_wlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
+                              ldv2t )
+                 end if
+                 if( m > q ) then
+                    call stdlib_wunglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                              lorglqwork, info )
+                 end if
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_wlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_wunglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_wlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_wunglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_wlacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_wungqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 p1 = min( p+1, m )
+                 q1 = min( q+1, m )
+                 call stdlib_wlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 if( m > p+q ) then
+                    call stdlib_wlacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
+                              
+                 end if
+                 call stdlib_wungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_wbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
+           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
+           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
+                     lbbcsdwork, info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_wlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_wlapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_wlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_wlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_wuncsd
+     end subroutine stdlib_wuncsd
+
+     !> ZUNCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_wuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           integer(ilp), intent(in) :: lrwork
+           integer(ilp) :: lrworkmin, lrworkopt
+           ! Array Arguments 
+           real(qp), intent(out) :: rwork(*)
+           real(qp), intent(out) :: theta(*)
+           complex(qp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           complex(qp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(qp) :: dum(1)
+           complex(qp) :: cdum(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = ( lwork==-1 ) .or. ( lrwork==-1 )
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-----------------------------------------|
+           ! | lworkopt (1)                            |
+           ! |-----------------------------------------|
+           ! | taup1 (max(1,p))                        |
+           ! | taup2 (max(1,m-p))                      |
+           ! | tauq1 (max(1,q))                        |
+           ! |-----------------------------------------|
+           ! | stdlib_wunbdb work | stdlib_wungqr work | stdlib_wunglq work |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |-----------------------------------------|
+             ! rwork layout:
+           ! |------------------|
+           ! | lrworkopt (1)    |
+           ! |------------------|
+           ! | phi (max(1,r-1)) |
+           ! |------------------|
+           ! | b11d (r)         |
+           ! | b11e (r-1)       |
+           ! | b12d (r)         |
+           ! | b12e (r-1)       |
+           ! | b21d (r)         |
+           ! | b21e (r-1)       |
+           ! | b22d (r)         |
+           ! | b22e (r-1)       |
+           ! | stdlib_wbbcsd rwork     |
+           ! |------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_wunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work, -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_wungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_wungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_wunglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_wbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum, u1, ldu1,&
+                  u2, ldu2, v1t, ldv1t, cdum, 1,dum, dum, dum, dum, dum, dum, dum, dum,rwork(1), -&
+                            1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_wunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_wungqr( p-1, p-1, p-1, u1(2,2), ldu1, cdum, work(1),-1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_wungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_wunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_wbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
+                 ldv1t, cdum, 1, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_wunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_wungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_wungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2, cdum,work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_wunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_wbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
+                  1, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                            1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else
+                 call stdlib_wunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, cdum, work(1), -1, childinfo)
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_wungqr( p, p, m-q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_wungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_wunglq( q, q, q, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_wbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
+                 ldu2, u1, ldu1, cdum, 1, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              end if
+              lrworkmin = ibbcsd+lbbcsd-1
+              lrworkopt = lrworkmin
+              rwork(1) = lrworkopt
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+              if( lrwork < lrworkmin .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_wunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_wlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_wungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_wlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_wungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_wlacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_wunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_wbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, cdum,1, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
+                        ibbcsd+1, childinfo )
+              ! permute rows and columns to place czero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_wlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_wunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = cone
+                 do j = 2, p
+                    u1(1,j) = czero
+                    u1(j,1) = czero
+                 end do
+                 call stdlib_wlacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_wungqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_wlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_wungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_wlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_wunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_wbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
+               ldv1t, cdum, 1, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
+                         lbbcsd,childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_wlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_wunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_wlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_wungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = cone
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                    u2(j,1) = czero
+                 end do
+                 call stdlib_wlacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_wungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_wlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_wunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_wbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
+              cdum, 1, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_wlapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_wlapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_wunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
+                        childinfo )
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_wcopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_wcopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = czero
+                 end do
+                 call stdlib_wlacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_wungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                 end do
+                 call stdlib_wlacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_wungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_wlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_wlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_wlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_wunglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_wbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
+              u2, ldu2, u1, ldu1, cdum, 1,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_wlapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_wlapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_wuncsd2by1
+
+     !> ZUNG2L: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF.
+
+     pure subroutine stdlib_wung2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( m-n+j, j ) = cone
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = cone
+              call stdlib_wlarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_wscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = cone - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to czero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_wung2l
+
+     !> ZUNG2R: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF.
+
+     pure subroutine stdlib_wung2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = cone
+                 call stdlib_wlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_wscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = cone - tau( i )
+              ! set a(1:i-1,i) to czero
+              do l = 1, i - 1
+                 a( l, i ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_wung2r
+
+     !> ZUNGBR: generates one of the complex unitary matrices Q or P**H
+     !> determined by ZGEBRD when reducing a complex matrix A to bidiagonal
+     !> form: A = Q * B * P**H.  Q and P**H are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and ZUNGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and ZUNGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+     !> is of order N:
+     !> if k < n, P**H = G(k) . . . G(2) G(1) and ZUNGBR returns the first m
+     !> rows of P**H, where n >= m >= k;
+     !> if k >= n, P**H = G(n-1) . . . G(2) G(1) and ZUNGBR returns P**H as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_wungbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_wungqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_wungqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_wunglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_wunglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = real( work( 1 ),KIND=qp)
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_wgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_wungqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = czero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = cone
+                 do i = 2, m
+                    a( i, 1 ) = czero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_wungqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**h, determined by a call to stdlib_wgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_wunglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! row downward, and set the first row and column of p**h to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = cone
+                 do i = 2, n
+                    a( i, 1 ) = czero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = czero
+                 end do
+                 if( n>1 ) then
+                    ! form p**h(2:n,2:n)
+                    call stdlib_wunglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wungbr
+
+     !> ZUNGHR: generates a complex unitary matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> ZGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_wunghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors cone
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = czero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = czero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_wungqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunghr
+
+     !> ZUNGL2: generates an m-by-n complex matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF.
+
+     pure subroutine stdlib_wungl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = czero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = cone
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i)**h to a(i:m,i:n) from the right
+              if( i<n ) then
+                 call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    call stdlib_wlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,conjg( tau( i ) ), a( &
+                              i+1, i ), lda, work )
+                 end if
+                 call stdlib_wscal( n-i, -tau( i ), a( i, i+1 ), lda )
+                 call stdlib_wlacgv( n-i, a( i, i+1 ), lda )
+              end if
+              a( i, i ) = cone - conjg( tau( i ) )
+              ! set a(i,1:i-1) to czero
+              do l = 1, i - 1
+                 a( i, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_wungl2
+
+     !> ZUNGLQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF.
+
+     pure subroutine stdlib_wunglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZUNGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to czero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_wungl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_wlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**h to a(i+ib:m,i:n) from the right
+                    call stdlib_wlarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'FORWARD','ROWWISE', m-i-&
+                    ib+1, n-i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h**h to columns i:n of current block
+                 call stdlib_wungl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to czero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wunglq
+
+     !> ZUNGQL: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF.
+
+     pure subroutine stdlib_wungql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to czero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_wung2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_wlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_wlarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_wung2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to czero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wungql
+
+     !> ZUNGQR: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF.
+
+     pure subroutine stdlib_wungqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to czero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_wung2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_wlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_wlarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_wung2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to czero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wungqr
+
+     !> ZUNGR2: generates an m by n complex matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF.
+
+     pure subroutine stdlib_wungr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = czero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = cone
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i) from the right
+              call stdlib_wlacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone
+              call stdlib_wlarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda,conjg( tau( i ) ), a, lda,&
+                         work )
+              call stdlib_wscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              call stdlib_wlacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone - conjg( tau( i ) )
+              ! set a(m-k+i,n-k+i+1:n) to czero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_wungr2
+
+     !> ZUNGRQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF.
+
+     pure subroutine stdlib_wungrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to czero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_wungr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_wlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_wlarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'BACKWARD','ROWWISE', ii-&
+                    1, n-k+i+ib-1, ib, a( ii, 1 ),lda, work, ldwork, a, lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**h to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_wungr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to czero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_wungrq
+
+     !> ZUNGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> ZHETRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_wungtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 nb = stdlib_ilaenv( 1, 'ZUNGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_whetrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = czero
+              end do
+              a( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_wungql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_whetrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = czero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = cone
+              do i = 2, n
+                 a( i, 1 ) = czero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_wungqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wungtr
+
+     !> ZUNGTSQR: generates an M-by-N complex matrix Q_out with orthonormal
+     !> columns, which are the first N columns of a product of comlpex unitary
+     !> matrices of order M which are returned by ZLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for ZLATSQR.
+
+     pure subroutine stdlib_wungtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: t(ldt,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_wlamtsqr. see the documentation for stdlib_wlamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_wlamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_wlamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_wlamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_wlaset( 'F', m, n, czero, cone, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_wlamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_wcopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_wungtsqr
+
+     !> ZUNGTSQR_ROW: generates an M-by-N complex matrix Q_out with
+     !> orthonormal columns from the output of ZLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by ZLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of ZLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine ZLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which ZLATSQR generates the output blocks.
+
+     pure subroutine stdlib_wungtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(in) :: t(ldt,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           complex(qp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=qp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_wlaset('U', m, n, czero, cone, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_wlarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_dlarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_wlarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_wlarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=qp)
+           return
+     end subroutine stdlib_wungtsqr_row
+
+     !> ZUNHR_COL: takes an M-by-N complex matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as ZGEQRT).
+
+     pure subroutine stdlib_wunhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*)
+           complex(qp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the unitary
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_wlaunhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_wtrsm( 'R', 'U', 'N', 'N', m-n, n, cone, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_wcopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==cone ) then
+                    call stdlib_wscal( j-jbtemp1, -cone, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_wtrsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_wtrsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = czero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_wtrsm( 'R', 'L', 'C', 'U', jnb, jnb, cone,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_wunhr_col
+
+
+     pure subroutine stdlib_wunm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(in) :: q(ldq,*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = cmplx( lwkopt,KIND=qp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_wtrmm.
+           if( n1==0 ) then
+              call stdlib_wtrmm( side, 'UPPER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           else if( n2==0 ) then
+              call stdlib_wtrmm( side, 'LOWER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_wlacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_wtrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, cone, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,cone, q, ldq, &
+                              c( 1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_wlacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_wtrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, cone, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,cone, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,cone, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_wlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**h.
+                    call stdlib_wlacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_wtrmm( 'LEFT', 'UPPER', 'CONJUGATE', 'NON-UNIT',n2, len, cone, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**h.
+                    call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', n2, len, n1,cone, q, ldq, c( &
+                              1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q12**h.
+                    call stdlib_wlacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_wtrmm( 'LEFT', 'LOWER', 'CONJUGATE', 'NON-UNIT',n1, len, cone, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**h.
+                    call stdlib_wgemm( 'CONJUGATE', 'NO TRANSPOSE', n1, len, n2,cone, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,cone, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_wlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_wlacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, cone,&
+                               q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,cone, c( i, 1 )&
+                              , ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_wlacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, cone,&
+                               q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,cone, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_wlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**h.
+                    call stdlib_wlacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_wtrmm( 'RIGHT', 'LOWER', 'CONJUGATE', 'NON-UNIT',len, n1, cone, q(&
+                               1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**h.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n1, n2,cone, c( i, 1 ), &
+                              ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q21**h.
+                    call stdlib_wlacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_wtrmm( 'RIGHT', 'UPPER', 'CONJUGATE', 'NON-UNIT',len, n2, cone, q(&
+                               n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**h.
+                    call stdlib_wgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n2, n1,cone, c( i, n2+1 )&
+                              , ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_wlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=qp)
+           return
+     end subroutine stdlib_wunm22
+
+     !> ZUNM2L: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(qp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = cone
+              call stdlib_wlarf( side, mi, ni, a( 1, i ), 1, taui, c, ldc, work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_wunm2l
+
+     !> ZUNM2R: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(qp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_wlarf( side, mi, ni, a( i, i ), 1, taui, c( ic, jc ), ldc,work )
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_wunm2r
+
+     !> If VECT = 'Q', ZUNMBR: overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> If VECT = 'P', ZUNMBR overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'C':      P**H * C       C * P**H
+     !> Here Q and P**H are the unitary matrices determined by ZGEBRD when
+     !> reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+     !> and P**H are defined as products of elementary reflectors H(i) and
+     !> G(i) respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the unitary matrix Q or P**H that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_wunmbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( m>0 .and. n>0 ) then
+                 if( applyq ) then
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 else
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 end if
+                 lwkopt = nw*nb
+              else
+                 lwkopt = 1
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_wgebrd with nq >= k
+                 call stdlib_wunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_wgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_wunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_wgebrd with nq > k
+                 call stdlib_wunmlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_wgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_wunmlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmbr
+
+     !> ZUNMHR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by ZGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_wunmhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_wunmqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmhr
+
+     !> ZUNML2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(qp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              if( i<nq )call stdlib_wlacgv( nq-i, a( i, i+1 ), lda )
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_wlarf( side, mi, ni, a( i, i ), lda, taui, c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+              if( i<nq )call stdlib_wlacgv( nq-i, a( i, i+1 ), lda )
+           end do
+           return
+     end subroutine stdlib_wunml2
+
+     !> ZUNMLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_wunml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_wlarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_wlarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmlq
+
+     !> ZUNMQL: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_wunm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_wlarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_wlarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmql
+
+     !> ZUNMQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_wunm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_wlarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_wlarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmqr
+
+     !> ZUNMR2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(qp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              call stdlib_wlacgv( nq-k+i-1, a( i, 1 ), lda )
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = cone
+              call stdlib_wlarf( side, mi, ni, a( i, 1 ), lda, taui, c, ldc, work )
+              a( i, nq-k+i ) = aii
+              call stdlib_wlacgv( nq-k+i-1, a( i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_wunmr2
+
+     !> ZUNMR3: overwrites the general complex m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZTZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: a(lda,*), tau(*)
+           complex(qp), intent(inout) :: c(ldc,*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           complex(qp) :: taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              call stdlib_wlarz( side, mi, ni, l, a( i, ja ), lda, taui,c( ic, jc ), ldc, work )
+                        
+           end do
+           return
+     end subroutine stdlib_wunmr3
+
+     !> ZUNMRQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_wunmr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_wlarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_wlarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmrq
+
+     !> ZUNMRZ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZTZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_wunmrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<max( 1, nw ) .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! determine the block size.  nb may be at most nbmax, where nbmax
+           ! is used to define the local array t.
+           nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_wunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_wlarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_wlarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmrz
+
+     !> ZUNMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by ZHETRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_wunmtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_whetrd with uplo = 'u'
+              call stdlib_wunmql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_whetrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_wunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_wunmtr
+
+     !> ZUPGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> ZHPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_wupgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           complex(qp), intent(in) :: ap(*), tau(*)
+           complex(qp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_whptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = czero
+              end do
+              q( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_wung2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_whptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = cone
+              do i = 2, n
+                 q( i, 1 ) = czero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = czero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_wung2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_wupgtr
+
+     !> ZUPMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by ZHPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_wupmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           complex(qp), intent(inout) :: ap(*), c(ldc,*)
+           complex(qp), intent(in) :: tau(*)
+           complex(qp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           complex(qp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_whptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 call stdlib_wlarf( side, mi, ni, ap( ii-i+1 ), 1, taui, c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_whptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              loop_20: do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 call stdlib_wlarf( side, mi, ni, ap( ii ), 1, taui, c( ic, jc ),ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_wupmtr
+
+
+
+end module stdlib_linalg_lapack_w
+#:endif
diff --git a/src/stdlib_linalg_lapack_z.fypp b/src/stdlib_linalg_lapack_z.fypp
new file mode 100644
index 000000000..1d42d3d5c
--- /dev/null
+++ b/src/stdlib_linalg_lapack_z.fypp
@@ -0,0 +1,81218 @@
+#:include "common.fypp" 
+module stdlib_linalg_lapack_z
+     use stdlib_linalg_constants
+     use stdlib_linalg_blas
+     use stdlib_linalg_lapack_aux
+     use stdlib_linalg_lapack_s
+     use stdlib_linalg_lapack_c
+     use stdlib_linalg_lapack_d
+     implicit none(type,external)
+     private
+
+
+     public :: sp,dp,qp,lk,ilp
+#:if WITH_QP
+     public :: stdlib_zlag2w
+#:endif
+     public :: stdlib_zbbcsd
+     public :: stdlib_zbdsqr
+     public :: stdlib_zcgesv
+     public :: stdlib_zcposv
+     public :: stdlib_zdrscl
+     public :: stdlib_zgbbrd
+     public :: stdlib_zgbcon
+     public :: stdlib_zgbequ
+     public :: stdlib_zgbequb
+     public :: stdlib_zgbrfs
+     public :: stdlib_zgbsv
+     public :: stdlib_zgbsvx
+     public :: stdlib_zgbtf2
+     public :: stdlib_zgbtrf
+     public :: stdlib_zgbtrs
+     public :: stdlib_zgebak
+     public :: stdlib_zgebal
+     public :: stdlib_zgebd2
+     public :: stdlib_zgebrd
+     public :: stdlib_zgecon
+     public :: stdlib_zgeequ
+     public :: stdlib_zgeequb
+     public :: stdlib_zgees
+     public :: stdlib_zgeesx
+     public :: stdlib_zgeev
+     public :: stdlib_zgeevx
+     public :: stdlib_zgehd2
+     public :: stdlib_zgehrd
+     public :: stdlib_zgejsv
+     public :: stdlib_zgelq
+     public :: stdlib_zgelq2
+     public :: stdlib_zgelqf
+     public :: stdlib_zgelqt
+     public :: stdlib_zgelqt3
+     public :: stdlib_zgels
+     public :: stdlib_zgelsd
+     public :: stdlib_zgelss
+     public :: stdlib_zgelsy
+     public :: stdlib_zgemlq
+     public :: stdlib_zgemlqt
+     public :: stdlib_zgemqr
+     public :: stdlib_zgemqrt
+     public :: stdlib_zgeql2
+     public :: stdlib_zgeqlf
+     public :: stdlib_zgeqp3
+     public :: stdlib_zgeqr
+     public :: stdlib_zgeqr2
+     public :: stdlib_zgeqr2p
+     public :: stdlib_zgeqrf
+     public :: stdlib_zgeqrfp
+     public :: stdlib_zgeqrt
+     public :: stdlib_zgeqrt2
+     public :: stdlib_zgeqrt3
+     public :: stdlib_zgerfs
+     public :: stdlib_zgerq2
+     public :: stdlib_zgerqf
+     public :: stdlib_zgesc2
+     public :: stdlib_zgesdd
+     public :: stdlib_zgesv
+     public :: stdlib_zgesvd
+     public :: stdlib_zgesvdq
+     public :: stdlib_zgesvj
+     public :: stdlib_zgesvx
+     public :: stdlib_zgetc2
+     public :: stdlib_zgetf2
+     public :: stdlib_zgetrf
+     public :: stdlib_zgetrf2
+     public :: stdlib_zgetri
+     public :: stdlib_zgetrs
+     public :: stdlib_zgetsls
+     public :: stdlib_zgetsqrhrt
+     public :: stdlib_zggbak
+     public :: stdlib_zggbal
+     public :: stdlib_zgges
+     public :: stdlib_zgges3
+     public :: stdlib_zggesx
+     public :: stdlib_zggev
+     public :: stdlib_zggev3
+     public :: stdlib_zggevx
+     public :: stdlib_zggglm
+     public :: stdlib_zgghd3
+     public :: stdlib_zgghrd
+     public :: stdlib_zgglse
+     public :: stdlib_zggqrf
+     public :: stdlib_zggrqf
+     public :: stdlib_zgsvj0
+     public :: stdlib_zgsvj1
+     public :: stdlib_zgtcon
+     public :: stdlib_zgtrfs
+     public :: stdlib_zgtsv
+     public :: stdlib_zgtsvx
+     public :: stdlib_zgttrf
+     public :: stdlib_zgttrs
+     public :: stdlib_zgtts2
+     public :: stdlib_zhb2st_kernels
+     public :: stdlib_zhbev
+     public :: stdlib_zhbevd
+     public :: stdlib_zhbevx
+     public :: stdlib_zhbgst
+     public :: stdlib_zhbgv
+     public :: stdlib_zhbgvd
+     public :: stdlib_zhbgvx
+     public :: stdlib_zhbtrd
+     public :: stdlib_zhecon
+     public :: stdlib_zhecon_rook
+     public :: stdlib_zheequb
+     public :: stdlib_zheev
+     public :: stdlib_zheevd
+     public :: stdlib_zheevr
+     public :: stdlib_zheevx
+     public :: stdlib_zhegs2
+     public :: stdlib_zhegst
+     public :: stdlib_zhegv
+     public :: stdlib_zhegvd
+     public :: stdlib_zhegvx
+     public :: stdlib_zherfs
+     public :: stdlib_zhesv
+     public :: stdlib_zhesv_aa
+     public :: stdlib_zhesv_rk
+     public :: stdlib_zhesv_rook
+     public :: stdlib_zhesvx
+     public :: stdlib_zheswapr
+     public :: stdlib_zhetd2
+     public :: stdlib_zhetf2
+     public :: stdlib_zhetf2_rk
+     public :: stdlib_zhetf2_rook
+     public :: stdlib_zhetrd
+     public :: stdlib_zhetrd_hb2st
+     public :: stdlib_zhetrd_he2hb
+     public :: stdlib_zhetrf
+     public :: stdlib_zhetrf_aa
+     public :: stdlib_zhetrf_rk
+     public :: stdlib_zhetrf_rook
+     public :: stdlib_zhetri
+     public :: stdlib_zhetri_rook
+     public :: stdlib_zhetrs
+     public :: stdlib_zhetrs2
+     public :: stdlib_zhetrs_3
+     public :: stdlib_zhetrs_aa
+     public :: stdlib_zhetrs_rook
+     public :: stdlib_zhfrk
+     public :: stdlib_zhgeqz
+     public :: stdlib_zhpcon
+     public :: stdlib_zhpev
+     public :: stdlib_zhpevd
+     public :: stdlib_zhpevx
+     public :: stdlib_zhpgst
+     public :: stdlib_zhpgv
+     public :: stdlib_zhpgvd
+     public :: stdlib_zhpgvx
+     public :: stdlib_zhprfs
+     public :: stdlib_zhpsv
+     public :: stdlib_zhpsvx
+     public :: stdlib_zhptrd
+     public :: stdlib_zhptrf
+     public :: stdlib_zhptri
+     public :: stdlib_zhptrs
+     public :: stdlib_zhsein
+     public :: stdlib_zhseqr
+     public :: stdlib_zla_gbamv
+     public :: stdlib_zla_gbrcond_c
+     public :: stdlib_zla_gbrpvgrw
+     public :: stdlib_zla_geamv
+     public :: stdlib_zla_gercond_c
+     public :: stdlib_zla_gerpvgrw
+     public :: stdlib_zla_heamv
+     public :: stdlib_zla_hercond_c
+     public :: stdlib_zla_herpvgrw
+     public :: stdlib_zla_lin_berr
+     public :: stdlib_zla_porcond_c
+     public :: stdlib_zla_porpvgrw
+     public :: stdlib_zla_syamv
+     public :: stdlib_zla_syrcond_c
+     public :: stdlib_zla_syrpvgrw
+     public :: stdlib_zla_wwaddw
+     public :: stdlib_zlabrd
+     public :: stdlib_zlacgv
+     public :: stdlib_zlacn2
+     public :: stdlib_zlacon
+     public :: stdlib_zlacp2
+     public :: stdlib_zlacpy
+     public :: stdlib_zlacrm
+     public :: stdlib_zlacrt
+     public :: stdlib_zladiv
+     public :: stdlib_zlaed0
+     public :: stdlib_zlaed7
+     public :: stdlib_zlaed8
+     public :: stdlib_zlaein
+     public :: stdlib_zlaesy
+     public :: stdlib_zlaev2
+     public :: stdlib_zlag2c
+     public :: stdlib_zlags2
+     public :: stdlib_zlagtm
+     public :: stdlib_zlahef
+     public :: stdlib_zlahef_aa
+     public :: stdlib_zlahef_rk
+     public :: stdlib_zlahef_rook
+     public :: stdlib_zlahqr
+     public :: stdlib_zlahr2
+     public :: stdlib_zlaic1
+     public :: stdlib_zlals0
+     public :: stdlib_zlalsa
+     public :: stdlib_zlalsd
+     public :: stdlib_zlamswlq
+     public :: stdlib_zlamtsqr
+     public :: stdlib_zlangb
+     public :: stdlib_zlange
+     public :: stdlib_zlangt
+     public :: stdlib_zlanhb
+     public :: stdlib_zlanhe
+     public :: stdlib_zlanhf
+     public :: stdlib_zlanhp
+     public :: stdlib_zlanhs
+     public :: stdlib_zlanht
+     public :: stdlib_zlansb
+     public :: stdlib_zlansp
+     public :: stdlib_zlansy
+     public :: stdlib_zlantb
+     public :: stdlib_zlantp
+     public :: stdlib_zlantr
+     public :: stdlib_zlapll
+     public :: stdlib_zlapmr
+     public :: stdlib_zlapmt
+     public :: stdlib_zlaqgb
+     public :: stdlib_zlaqge
+     public :: stdlib_zlaqhb
+     public :: stdlib_zlaqhe
+     public :: stdlib_zlaqhp
+     public :: stdlib_zlaqp2
+     public :: stdlib_zlaqps
+     public :: stdlib_zlaqr0
+     public :: stdlib_zlaqr1
+     public :: stdlib_zlaqr2
+     public :: stdlib_zlaqr3
+     public :: stdlib_zlaqr4
+     public :: stdlib_zlaqr5
+     public :: stdlib_zlaqsb
+     public :: stdlib_zlaqsp
+     public :: stdlib_zlaqsy
+     public :: stdlib_zlaqz0
+     public :: stdlib_zlaqz1
+     public :: stdlib_zlaqz2
+     public :: stdlib_zlaqz3
+     public :: stdlib_zlar1v
+     public :: stdlib_zlar2v
+     public :: stdlib_zlarcm
+     public :: stdlib_zlarf
+     public :: stdlib_zlarfb
+     public :: stdlib_zlarfb_gett
+     public :: stdlib_zlarfg
+     public :: stdlib_zlarfgp
+     public :: stdlib_zlarft
+     public :: stdlib_zlarfx
+     public :: stdlib_zlarfy
+     public :: stdlib_zlargv
+     public :: stdlib_zlarnv
+     public :: stdlib_zlarrv
+     public :: stdlib_zlartg
+     public :: stdlib_zlartv
+     public :: stdlib_zlarz
+     public :: stdlib_zlarzb
+     public :: stdlib_zlarzt
+     public :: stdlib_zlascl
+     public :: stdlib_zlaset
+     public :: stdlib_zlasr
+     public :: stdlib_zlassq
+     public :: stdlib_zlaswlq
+     public :: stdlib_zlaswp
+     public :: stdlib_zlasyf
+     public :: stdlib_zlasyf_aa
+     public :: stdlib_zlasyf_rk
+     public :: stdlib_zlasyf_rook
+     public :: stdlib_zlat2c
+     public :: stdlib_zlatbs
+     public :: stdlib_zlatdf
+     public :: stdlib_zlatps
+     public :: stdlib_zlatrd
+     public :: stdlib_zlatrs
+     public :: stdlib_zlatrz
+     public :: stdlib_zlatsqr
+     public :: stdlib_zlaunhr_col_getrfnp
+     public :: stdlib_zlaunhr_col_getrfnp2
+     public :: stdlib_zlauu2
+     public :: stdlib_zlauum
+     public :: stdlib_zpbcon
+     public :: stdlib_zpbequ
+     public :: stdlib_zpbrfs
+     public :: stdlib_zpbstf
+     public :: stdlib_zpbsv
+     public :: stdlib_zpbsvx
+     public :: stdlib_zpbtf2
+     public :: stdlib_zpbtrf
+     public :: stdlib_zpbtrs
+     public :: stdlib_zpftrf
+     public :: stdlib_zpftri
+     public :: stdlib_zpftrs
+     public :: stdlib_zpocon
+     public :: stdlib_zpoequ
+     public :: stdlib_zpoequb
+     public :: stdlib_zporfs
+     public :: stdlib_zposv
+     public :: stdlib_zposvx
+     public :: stdlib_zpotf2
+     public :: stdlib_zpotrf
+     public :: stdlib_zpotrf2
+     public :: stdlib_zpotri
+     public :: stdlib_zpotrs
+     public :: stdlib_zppcon
+     public :: stdlib_zppequ
+     public :: stdlib_zpprfs
+     public :: stdlib_zppsv
+     public :: stdlib_zppsvx
+     public :: stdlib_zpptrf
+     public :: stdlib_zpptri
+     public :: stdlib_zpptrs
+     public :: stdlib_zpstf2
+     public :: stdlib_zpstrf
+     public :: stdlib_zptcon
+     public :: stdlib_zpteqr
+     public :: stdlib_zptrfs
+     public :: stdlib_zptsv
+     public :: stdlib_zptsvx
+     public :: stdlib_zpttrf
+     public :: stdlib_zpttrs
+     public :: stdlib_zptts2
+     public :: stdlib_zrot
+     public :: stdlib_zspcon
+     public :: stdlib_zspmv
+     public :: stdlib_zspr
+     public :: stdlib_zsprfs
+     public :: stdlib_zspsv
+     public :: stdlib_zspsvx
+     public :: stdlib_zsptrf
+     public :: stdlib_zsptri
+     public :: stdlib_zsptrs
+     public :: stdlib_zstedc
+     public :: stdlib_zstegr
+     public :: stdlib_zstein
+     public :: stdlib_zstemr
+     public :: stdlib_zsteqr
+     public :: stdlib_zsycon
+     public :: stdlib_zsycon_rook
+     public :: stdlib_zsyconv
+     public :: stdlib_zsyconvf
+     public :: stdlib_zsyconvf_rook
+     public :: stdlib_zsyequb
+     public :: stdlib_zsymv
+     public :: stdlib_zsyr
+     public :: stdlib_zsyrfs
+     public :: stdlib_zsysv
+     public :: stdlib_zsysv_aa
+     public :: stdlib_zsysv_rk
+     public :: stdlib_zsysv_rook
+     public :: stdlib_zsysvx
+     public :: stdlib_zsyswapr
+     public :: stdlib_zsytf2
+     public :: stdlib_zsytf2_rk
+     public :: stdlib_zsytf2_rook
+     public :: stdlib_zsytrf
+     public :: stdlib_zsytrf_aa
+     public :: stdlib_zsytrf_rk
+     public :: stdlib_zsytrf_rook
+     public :: stdlib_zsytri
+     public :: stdlib_zsytri_rook
+     public :: stdlib_zsytrs
+     public :: stdlib_zsytrs2
+     public :: stdlib_zsytrs_3
+     public :: stdlib_zsytrs_aa
+     public :: stdlib_zsytrs_rook
+     public :: stdlib_ztbcon
+     public :: stdlib_ztbrfs
+     public :: stdlib_ztbtrs
+     public :: stdlib_ztfsm
+     public :: stdlib_ztftri
+     public :: stdlib_ztfttp
+     public :: stdlib_ztfttr
+     public :: stdlib_ztgevc
+     public :: stdlib_ztgex2
+     public :: stdlib_ztgexc
+     public :: stdlib_ztgsen
+     public :: stdlib_ztgsja
+     public :: stdlib_ztgsna
+     public :: stdlib_ztgsy2
+     public :: stdlib_ztgsyl
+     public :: stdlib_ztpcon
+     public :: stdlib_ztplqt
+     public :: stdlib_ztplqt2
+     public :: stdlib_ztpmlqt
+     public :: stdlib_ztpmqrt
+     public :: stdlib_ztpqrt
+     public :: stdlib_ztpqrt2
+     public :: stdlib_ztprfb
+     public :: stdlib_ztprfs
+     public :: stdlib_ztptri
+     public :: stdlib_ztptrs
+     public :: stdlib_ztpttf
+     public :: stdlib_ztpttr
+     public :: stdlib_ztrcon
+     public :: stdlib_ztrevc
+     public :: stdlib_ztrevc3
+     public :: stdlib_ztrexc
+     public :: stdlib_ztrrfs
+     public :: stdlib_ztrsen
+     public :: stdlib_ztrsna
+     public :: stdlib_ztrsyl
+     public :: stdlib_ztrti2
+     public :: stdlib_ztrtri
+     public :: stdlib_ztrtrs
+     public :: stdlib_ztrttf
+     public :: stdlib_ztrttp
+     public :: stdlib_ztzrzf
+     public :: stdlib_zunbdb
+     public :: stdlib_zunbdb1
+     public :: stdlib_zunbdb2
+     public :: stdlib_zunbdb3
+     public :: stdlib_zunbdb4
+     public :: stdlib_zunbdb5
+     public :: stdlib_zunbdb6
+     public :: stdlib_zuncsd
+     public :: stdlib_zuncsd2by1
+     public :: stdlib_zung2l
+     public :: stdlib_zung2r
+     public :: stdlib_zungbr
+     public :: stdlib_zunghr
+     public :: stdlib_zungl2
+     public :: stdlib_zunglq
+     public :: stdlib_zungql
+     public :: stdlib_zungqr
+     public :: stdlib_zungr2
+     public :: stdlib_zungrq
+     public :: stdlib_zungtr
+     public :: stdlib_zungtsqr
+     public :: stdlib_zungtsqr_row
+     public :: stdlib_zunhr_col
+     public :: stdlib_zunm22
+     public :: stdlib_zunm2l
+     public :: stdlib_zunm2r
+     public :: stdlib_zunmbr
+     public :: stdlib_zunmhr
+     public :: stdlib_zunml2
+     public :: stdlib_zunmlq
+     public :: stdlib_zunmql
+     public :: stdlib_zunmqr
+     public :: stdlib_zunmr2
+     public :: stdlib_zunmr3
+     public :: stdlib_zunmrq
+     public :: stdlib_zunmrz
+     public :: stdlib_zunmtr
+     public :: stdlib_zupgtr
+     public :: stdlib_zupmtr
+
+     ! 64-bit real constants 
+     real(dp),    parameter, private ::     negone = -1.00_dp
+     real(dp),    parameter, private ::       zero = 0.00_dp
+     real(dp),    parameter, private ::       half = 0.50_dp
+     real(dp),    parameter, private ::        one = 1.00_dp
+     real(dp),    parameter, private ::        two = 2.00_dp
+     real(dp),    parameter, private ::      three = 3.00_dp
+     real(dp),    parameter, private ::       four = 4.00_dp
+     real(dp),    parameter, private ::      eight = 8.00_dp
+     real(dp),    parameter, private ::        ten = 10.00_dp
+
+     ! 64-bit complex constants 
+     complex(dp), parameter, private :: czero   = ( 0.0_dp,0.0_dp)
+     complex(dp), parameter, private :: chalf   = ( 0.5_dp,0.0_dp)
+     complex(dp), parameter, private :: cone    = ( 1.0_dp,0.0_dp)
+     complex(dp), parameter, private :: cnegone = (-1.0_dp,0.0_dp)
+
+     ! 64-bit scaling constants 
+     integer,     parameter, private :: maxexp = maxexponent(zero) 
+     integer,     parameter, private :: minexp = minexponent(zero) 
+     real(dp),    parameter, private :: rradix = real(radix(zero),dp) 
+     real(dp),    parameter, private :: ulp    = epsilon(zero) 
+     real(dp),    parameter, private :: eps    = ulp*half 
+     real(dp),    parameter, private :: safmin = rradix**max(minexp-1,1-maxexp) 
+     real(dp),    parameter, private :: safmax = one/safmin 
+     real(dp),    parameter, private :: smlnum = safmin/ulp 
+     real(dp),    parameter, private :: bignum = safmax*ulp 
+     real(dp),    parameter, private :: rtmin  = sqrt(smlnum) 
+     real(dp),    parameter, private :: rtmax  = sqrt(bignum) 
+
+     ! 64-bit Blue's scaling constants 
+     ! ssml>=1/s and sbig==1/S with s,S as defined in https://doi.org/10.1145/355769.355771 
+     real(dp),    parameter, private :: tsml   = rradix**ceiling((minexp-1)*half) 
+     real(dp),    parameter, private :: tbig   = rradix**floor((maxexp-digits(zero)+1)*half) 
+     real(dp),    parameter, private :: ssml   = rradix**(-floor((minexp-digits(zero))*half)) 
+     real(dp),    parameter, private :: sbig   = rradix**(-ceiling((maxexp+digits(zero)-1)*half)) 
+
+
+     contains
+
+#:if WITH_QP
+     !> ZLAG2W: converts a COMPLEX matrix, SA, to a COMPLEX*16 matrix, A.
+     !> Note that while it is possible to overflow while converting
+     !> from double to single, it is not possible to overflow when
+     !> converting from single to double.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_zlag2w( m, n, sa, ldsa, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: sa(ldsa,*)
+           complex(qp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           info = 0
+           do j = 1, n
+              do i = 1, m
+                 a( i, j ) = sa( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_zlag2w
+#:endif
+
+     !> ZDRSCL: multiplies an n-element complex vector x by the real scalar
+     !> 1/a.  This is done without overflow or underflow as long as
+     !> the final result x/a does not overflow or underflow.
+
+     pure subroutine stdlib_zdrscl( n, sa, sx, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           real(dp), intent(in) :: sa
+           ! Array Arguments 
+           complex(dp), intent(inout) :: sx(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           real(dp) :: bignum, cden, cden1, cnum, cnum1, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! initialize the denominator to sa and the numerator to 1.
+           cden = sa
+           cnum = one
+           10 continue
+           cden1 = cden*smlnum
+           cnum1 = cnum / bignum
+           if( abs( cden1 )>abs( cnum ) .and. cnum/=zero ) then
+              ! pre-multiply x by smlnum if cden is large compared to cnum.
+              mul = smlnum
+              done = .false.
+              cden = cden1
+           else if( abs( cnum1 )>abs( cden ) ) then
+              ! pre-multiply x by bignum if cden is small compared to cnum.
+              mul = bignum
+              done = .false.
+              cnum = cnum1
+           else
+              ! multiply x by cnum / cden and return.
+              mul = cnum / cden
+              done = .true.
+           end if
+           ! scale the vector x by mul
+           call stdlib_zdscal( n, mul, sx, incx )
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_zdrscl
+
+     !> ZGBEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N band matrix A and reduce its condition number.  R returns the
+     !> row scale factors and C the column scale factors, chosen to try to
+     !> make the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_zgbequ( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*), r(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: bignum, rcmax, rcmin, smlnum
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_zgbequ
+
+     !> ZGBEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from ZGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_zgbequb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*), r(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_dlamch( 'B' )
+           logrdx = log(radix)
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           kd = ku + 1
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 r( i ) = max( r( i ), cabs1( ab( kd+i-j, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log( r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = max( j-ku, 1 ), min( j+kl, m )
+                 c( j ) = max( c( j ), cabs1( ab( kd+i-j, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_zgbequb
+
+     !> ZGBTF2: computes an LU factorization of a complex m-by-n band matrix
+     !> A using partial pivoting with row interchanges.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jp, ju, km, kv
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in.
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! gaussian elimination with partial pivoting
+           ! set fill-in elements in columns ku+2 to kv to czero.
+           do j = ku + 2, min( kv, n )
+              do i = kv - j + 2, kl
+                 ab( i, j ) = czero
+              end do
+           end do
+           ! ju is the index of the last column affected by the current stage
+           ! of the factorization.
+           ju = 1
+           loop_40: do j = 1, min( m, n )
+              ! set fill-in elements in column j+kv to czero.
+              if( j+kv<=n ) then
+                 do i = 1, kl
+                    ab( i, j+kv ) = czero
+                 end do
+              end if
+              ! find pivot and test for singularity. km is the number of
+              ! subdiagonal elements in the current column.
+              km = min( kl, m-j )
+              jp = stdlib_izamax( km+1, ab( kv+1, j ), 1 )
+              ipiv( j ) = jp + j - 1
+              if( ab( kv+jp, j )/=czero ) then
+                 ju = max( ju, min( j+ku+jp-1, n ) )
+                 ! apply interchange to columns j to ju.
+                 if( jp/=1 )call stdlib_zswap( ju-j+1, ab( kv+jp, j ), ldab-1,ab( kv+1, j ), ldab-&
+                           1 )
+                 if( km>0 ) then
+                    ! compute multipliers.
+                    call stdlib_zscal( km, cone / ab( kv+1, j ), ab( kv+2, j ), 1 )
+                    ! update trailing submatrix within the band.
+                    if( ju>j )call stdlib_zgeru( km, ju-j, -cone, ab( kv+2, j ), 1,ab( kv, j+1 ), &
+                              ldab-1, ab( kv+1, j+1 ),ldab-1 )
+                 end if
+              else
+                 ! if pivot is czero, set info to the index of the pivot
+                 ! unless a czero pivot has already been found.
+                 if( info==0 )info = j
+              end if
+           end do loop_40
+           return
+     end subroutine stdlib_zgbtf2
+
+     !> ZGEBAK: forms the right or left eigenvectors of a complex general
+     !> matrix by backward transformation on the computed eigenvectors of the
+     !> balanced matrix output by ZGEBAL.
+
+     pure subroutine stdlib_zgebak( job, side, n, ilo, ihi, scale, m, v, ldv,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: scale(*)
+           complex(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, ii, k
+           real(dp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! decode and test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( m<0 ) then
+              info = -7
+           else if( ldv<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 do i = ilo, ihi
+                    s = scale( i )
+                    call stdlib_zdscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+              if( leftv ) then
+                 do i = ilo, ihi
+                    s = one / scale( i )
+                    call stdlib_zdscal( m, s, v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           ! for  i = ilo-1 step -1 until 1,
+                    ! ihi+1 step 1 until n do --
+                    30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              if( rightv ) then
+                 loop_40: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_40
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_40
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+              end if
+              if( leftv ) then
+                 loop_50: do ii = 1, n
+                    i = ii
+                    if( i>=ilo .and. i<=ihi )cycle loop_50
+                    if( i<ilo )i = ilo - ii
+                    k = scale( i )
+                    if( k==i )cycle loop_50
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_50
+              end if
+           end if
+           return
+     end subroutine stdlib_zgebak
+
+     !> ZGEBAL: balances a general complex matrix A.  This involves, first,
+     !> permuting A by a similarity transformation to isolate eigenvalues
+     !> in the first 1 to ILO-1 and last IHI+1 to N elements on the
+     !> diagonal; and second, applying a diagonal similarity transformation
+     !> to rows and columns ILO to IHI to make the rows and columns as
+     !> close in norm as possible.  Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrix, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors.
+
+     pure subroutine stdlib_zgebal( job, n, a, lda, ilo, ihi, scale, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(out) :: scale(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sclfac = 2.0e+0_dp
+           real(dp), parameter :: factor = 0.95e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: noconv
+           integer(ilp) :: i, ica, iexc, ira, j, k, l, m
+           real(dp) :: c, ca, f, g, r, ra, s, sfmax1, sfmax2, sfmin1, sfmin2
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEBAL', -info )
+              return
+           end if
+           k = 1
+           l = n
+           if( n==0 )go to 210
+           if( stdlib_lsame( job, 'N' ) ) then
+              do i = 1, n
+                 scale( i ) = one
+              end do
+              go to 210
+           end if
+           if( stdlib_lsame( job, 'S' ) )go to 120
+           ! permutation to isolate eigenvalues if possible
+           go to 50
+           ! row and column exchange.
+           20 continue
+           scale( m ) = j
+           if( j==m )go to 30
+           call stdlib_zswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_zswap( n-k+1, a( j, k ), lda, a( m, k ), lda )
+           30 continue
+           go to ( 40, 80 )iexc
+           ! search for rows isolating an eigenvalue and push them down.
+           40 continue
+           if( l==1 )go to 210
+           l = l - 1
+           50 continue
+           loop_70: do j = l, 1, -1
+              loop_60: do i = 1, l
+                 if( i==j )cycle loop_60
+                 if( real( a( j, i ),KIND=dp)/=zero .or. aimag( a( j, i ) )/=zero )cycle &
+                           loop_70
+              end do loop_60
+              m = l
+              iexc = 1
+              go to 20
+           end do loop_70
+           go to 90
+           ! search for columns isolating an eigenvalue and push them left.
+           80 continue
+           k = k + 1
+           90 continue
+           loop_110: do j = k, l
+              loop_100: do i = k, l
+                 if( i==j )cycle loop_100
+                 if( real( a( i, j ),KIND=dp)/=zero .or. aimag( a( i, j ) )/=zero )cycle &
+                           loop_110
+              end do loop_100
+              m = k
+              iexc = 2
+              go to 20
+           end do loop_110
+           120 continue
+           do i = k, l
+              scale( i ) = one
+           end do
+           if( stdlib_lsame( job, 'P' ) )go to 210
+           ! balance the submatrix in rows k to l.
+           ! iterative loop for norm reduction
+           sfmin1 = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           sfmax1 = one / sfmin1
+           sfmin2 = sfmin1*sclfac
+           sfmax2 = one / sfmin2
+           140 continue
+           noconv = .false.
+           loop_200: do i = k, l
+              c = stdlib_dznrm2( l-k+1, a( k, i ), 1 )
+              r = stdlib_dznrm2( l-k+1, a( i, k ), lda )
+              ica = stdlib_izamax( l, a( 1, i ), 1 )
+              ca = abs( a( ica, i ) )
+              ira = stdlib_izamax( n-k+1, a( i, k ), lda )
+              ra = abs( a( i, ira+k-1 ) )
+              ! guard against zero c or r due to underflow.
+              if( c==zero .or. r==zero )cycle loop_200
+              g = r / sclfac
+              f = one
+              s = c + r
+              160 continue
+              if( c>=g .or. max( f, c, ca )>=sfmax2 .or.min( r, g, ra )<=sfmin2 )go to 170
+                 if( stdlib_disnan( c+f+ca+r+g+ra ) ) then
+                 ! exit if nan to avoid infinite loop
+                 info = -3
+                 call stdlib_xerbla( 'ZGEBAL', -info )
+                 return
+              end if
+              f = f*sclfac
+              c = c*sclfac
+              ca = ca*sclfac
+              r = r / sclfac
+              g = g / sclfac
+              ra = ra / sclfac
+              go to 160
+              170 continue
+              g = c / sclfac
+              180 continue
+              if( g<r .or. max( r, ra )>=sfmax2 .or.min( f, c, g, ca )<=sfmin2 )go to 190
+              f = f / sclfac
+              c = c / sclfac
+              g = g / sclfac
+              ca = ca / sclfac
+              r = r*sclfac
+              ra = ra*sclfac
+              go to 180
+              ! now balance.
+              190 continue
+              if( ( c+r )>=factor*s )cycle loop_200
+              if( f<one .and. scale( i )<one ) then
+                 if( f*scale( i )<=sfmin1 )cycle loop_200
+              end if
+              if( f>one .and. scale( i )>one ) then
+                 if( scale( i )>=sfmax1 / f )cycle loop_200
+              end if
+              g = one / f
+              scale( i ) = scale( i )*f
+              noconv = .true.
+              call stdlib_zdscal( n-k+1, g, a( i, k ), lda )
+              call stdlib_zdscal( l, f, a( 1, i ), 1 )
+           end do loop_200
+           if( noconv )go to 140
+           210 continue
+           ilo = k
+           ihi = l
+           return
+     end subroutine stdlib_zgebal
+
+     !> ZGEEQU: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+     !> R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+     !> number and BIGNUM = largest safe number.  Use of these scaling
+     !> factors is not guaranteed to reduce the condition number of A but
+     !> works well in practice.
+
+     pure subroutine stdlib_zgeequ( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*), r(*)
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, rcmax, rcmin, smlnum
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i))
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j))
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_zgeequ
+
+     !> ZGEEQUB: computes row and column scalings intended to equilibrate an
+     !> M-by-N matrix A and reduce its condition number.  R returns the row
+     !> scale factors and C the column scale factors, chosen to try to make
+     !> the largest element in each row and column of the matrix B with
+     !> elements B(i,j)=R(i)*A(i,j)*C(j) have an absolute value of at most
+     !> the radix.
+     !> R(i) and C(j) are restricted to be a power of the radix between
+     !> SMLNUM = smallest safe number and BIGNUM = largest safe number.  Use
+     !> of these scaling factors is not guaranteed to reduce the condition
+     !> number of A but works well in practice.
+     !> This routine differs from ZGEEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled entries' magnitudes are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_zgeequb( m, n, a, lda, r, c, rowcnd, colcnd, amax,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(out) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*), r(*)
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, rcmax, rcmin, smlnum, radix, logrdx
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,log,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rowcnd = one
+              colcnd = one
+              amax = zero
+              return
+           end if
+           ! get machine constants.  assume smlnum is a power of the radix.
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           radix = stdlib_dlamch( 'B' )
+           logrdx = log( radix )
+           ! compute row scale factors.
+           do i = 1, m
+              r( i ) = zero
+           end do
+           ! find the maximum element in each row.
+           do j = 1, n
+              do i = 1, m
+                 r( i ) = max( r( i ), cabs1( a( i, j ) ) )
+              end do
+           end do
+           do i = 1, m
+              if( r( i )>zero ) then
+                 r( i ) = radix**int( log(r( i ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do i = 1, m
+              rcmax = max( rcmax, r( i ) )
+              rcmin = min( rcmin, r( i ) )
+           end do
+           amax = rcmax
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do i = 1, m
+                 if( r( i )==zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do i = 1, m
+                 r( i ) = one / min( max( r( i ), smlnum ), bignum )
+              end do
+              ! compute rowcnd = min(r(i)) / max(r(i)).
+              rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           ! compute column scale factors.
+           do j = 1, n
+              c( j ) = zero
+           end do
+           ! find the maximum element in each column,
+           ! assuming the row scaling computed above.
+           do j = 1, n
+              do i = 1, m
+                 c( j ) = max( c( j ), cabs1( a( i, j ) )*r( i ) )
+              end do
+              if( c( j )>zero ) then
+                 c( j ) = radix**int( log( c( j ) ) / logrdx,KIND=ilp)
+              end if
+           end do
+           ! find the maximum and minimum scale factors.
+           rcmin = bignum
+           rcmax = zero
+           do j = 1, n
+              rcmin = min( rcmin, c( j ) )
+              rcmax = max( rcmax, c( j ) )
+           end do
+           if( rcmin==zero ) then
+              ! find the first zero scale factor and return an error code.
+              do j = 1, n
+                 if( c( j )==zero ) then
+                    info = m + j
+                    return
+                 end if
+              end do
+           else
+              ! invert the scale factors.
+              do j = 1, n
+                 c( j ) = one / min( max( c( j ), smlnum ), bignum )
+              end do
+              ! compute colcnd = min(c(j)) / max(c(j)).
+              colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+           end if
+           return
+     end subroutine stdlib_zgeequb
+
+     !> ZGETC2: computes an LU factorization, using complete pivoting, of the
+     !> n-by-n matrix A. The factorization has the form A = P * L * U * Q,
+     !> where P and Q are permutation matrices, L is lower triangular with
+     !> unit diagonal elements and U is upper triangular.
+     !> This is a level 1 BLAS version of the algorithm.
+
+     pure subroutine stdlib_zgetc2( n, a, lda, ipiv, jpiv, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*), jpiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ip, ipv, j, jp, jpv
+           real(dp) :: bignum, eps, smin, smlnum, xmax
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! handle the case n=1 by itself
+           if( n==1 ) then
+              ipiv( 1 ) = 1
+              jpiv( 1 ) = 1
+              if( abs( a( 1, 1 ) )<smlnum ) then
+                 info = 1
+                 a( 1, 1 ) = cmplx( smlnum, zero,KIND=dp)
+              end if
+              return
+           end if
+           ! factorize a using complete pivoting.
+           ! set pivots less than smin to smin
+           loop_40: do i = 1, n - 1
+              ! find max element in matrix a
+              xmax = zero
+              do ip = i, n
+                 do jp = i, n
+                    if( abs( a( ip, jp ) )>=xmax ) then
+                       xmax = abs( a( ip, jp ) )
+                       ipv = ip
+                       jpv = jp
+                    end if
+                 end do
+              end do
+              if( i==1 )smin = max( eps*xmax, smlnum )
+              ! swap rows
+              if( ipv/=i )call stdlib_zswap( n, a( ipv, 1 ), lda, a( i, 1 ), lda )
+              ipiv( i ) = ipv
+              ! swap columns
+              if( jpv/=i )call stdlib_zswap( n, a( 1, jpv ), 1, a( 1, i ), 1 )
+              jpiv( i ) = jpv
+              ! check for singularity
+              if( abs( a( i, i ) )<smin ) then
+                 info = i
+                 a( i, i ) = cmplx( smin, zero,KIND=dp)
+              end if
+              do j = i + 1, n
+                 a( j, i ) = a( j, i ) / a( i, i )
+              end do
+              call stdlib_zgeru( n-i, n-i, -cmplx( one,KIND=dp), a( i+1, i ), 1,a( i, i+1 ), lda, &
+                        a( i+1, i+1 ), lda )
+           end do loop_40
+           if( abs( a( n, n ) )<smin ) then
+              info = n
+              a( n, n ) = cmplx( smin, zero,KIND=dp)
+           end if
+           ! set last pivots to n
+           ipiv( n ) = n
+           jpiv( n ) = n
+           return
+     end subroutine stdlib_zgetc2
+
+     !> ZGETF2: computes an LU factorization of a general m-by-n matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_zgetf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: sfmin
+           integer(ilp) :: i, j, jp
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch('S')
+           do j = 1, min( m, n )
+              ! find pivot and test for singularity.
+              jp = j - 1 + stdlib_izamax( m-j+1, a( j, j ), 1 )
+              ipiv( j ) = jp
+              if( a( jp, j )/=czero ) then
+                 ! apply the interchange to columns 1:n.
+                 if( jp/=j )call stdlib_zswap( n, a( j, 1 ), lda, a( jp, 1 ), lda )
+                 ! compute elements j+1:m of j-th column.
+                 if( j<m ) then
+                    if( abs(a( j, j )) >= sfmin ) then
+                       call stdlib_zscal( m-j, cone / a( j, j ), a( j+1, j ), 1 )
+                    else
+                       do i = 1, m-j
+                          a( j+i, j ) = a( j+i, j ) / a( j, j )
+                       end do
+                    end if
+                 end if
+              else if( info==0 ) then
+                 info = j
+              end if
+              if( j<min( m, n ) ) then
+                 ! update trailing submatrix.
+                 call stdlib_zgeru( m-j, n-j, -cone, a( j+1, j ), 1, a( j, j+1 ),lda, a( j+1, j+1 &
+                           ), lda )
+              end if
+           end do
+           return
+     end subroutine stdlib_zgetf2
+
+     !> ZGGBAK: forms the right or left eigenvectors of a complex generalized
+     !> eigenvalue problem A*x = lambda*B*x, by backward transformation on
+     !> the computed eigenvectors of the balanced pair of matrices output by
+     !> ZGGBAL.
+
+     pure subroutine stdlib_zggbak( job, side, n, ilo, ihi, lscale, rscale, m, v,ldv, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job, side
+           integer(ilp), intent(in) :: ihi, ilo, ldv, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(in) :: lscale(*), rscale(*)
+           complex(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: leftv, rightv
+           integer(ilp) :: i, k
+           ! Intrinsic Functions 
+           intrinsic :: max,int
+           ! Executable Statements 
+           ! test the input parameters
+           rightv = stdlib_lsame( side, 'R' )
+           leftv = stdlib_lsame( side, 'L' )
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( .not.rightv .and. .not.leftv ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( n==0 .and. ihi==0 .and. ilo/=1 ) then
+              info = -4
+           else if( n>0 .and. ( ihi<ilo .or. ihi>max( 1, n ) ) )then
+              info = -5
+           else if( n==0 .and. ilo==1 .and. ihi/=0 ) then
+              info = -5
+           else if( m<0 ) then
+              info = -8
+           else if( ldv<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGBAK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( m==0 )return
+           if( stdlib_lsame( job, 'N' ) )return
+           if( ilo==ihi )go to 30
+           ! backward balance
+           if( stdlib_lsame( job, 'S' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward transformation on right eigenvectors
+              if( rightv ) then
+                 do i = ilo, ihi
+                    call stdlib_zdscal( m, rscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+              ! backward transformation on left eigenvectors
+              if( leftv ) then
+                 do i = ilo, ihi
+                    call stdlib_zdscal( m, lscale( i ), v( i, 1 ), ldv )
+                 end do
+              end if
+           end if
+           ! backward permutation
+           30 continue
+           if( stdlib_lsame( job, 'P' ) .or. stdlib_lsame( job, 'B' ) ) then
+              ! backward permutation on right eigenvectors
+              if( rightv ) then
+                 if( ilo==1 )go to 50
+                 loop_40: do i = ilo - 1, 1, -1
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_40
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_40
+                 50 continue
+                 if( ihi==n )go to 70
+                 loop_60: do i = ihi + 1, n
+                    k = int(rscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_60
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_60
+              end if
+              ! backward permutation on left eigenvectors
+              70 continue
+              if( leftv ) then
+                 if( ilo==1 )go to 90
+                 loop_80: do i = ilo - 1, 1, -1
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_80
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_80
+                 90 continue
+                 if( ihi==n )go to 110
+                 loop_100: do i = ihi + 1, n
+                    k = int(lscale( i ),KIND=ilp)
+                    if( k==i )cycle loop_100
+                    call stdlib_zswap( m, v( i, 1 ), ldv, v( k, 1 ), ldv )
+                 end do loop_100
+              end if
+           end if
+           110 continue
+           return
+     end subroutine stdlib_zggbak
+
+     !> ZGGBAL: balances a pair of general complex matrices (A,B).  This
+     !> involves, first, permuting A and B by similarity transformations to
+     !> isolate eigenvalues in the first 1 to ILO$-$1 and last IHI+1 to N
+     !> elements on the diagonal; and second, applying a diagonal similarity
+     !> transformation to rows and columns ILO to IHI to make the rows
+     !> and columns as close in norm as possible. Both steps are optional.
+     !> Balancing may reduce the 1-norm of the matrices, and improve the
+     !> accuracy of the computed eigenvalues and/or eigenvectors in the
+     !> generalized eigenvalue problem A*x = lambda*B*x.
+
+     pure subroutine stdlib_zggbal( job, n, a, lda, b, ldb, ilo, ihi, lscale,rscale, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: job
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, n
+           ! Array Arguments 
+           real(dp), intent(out) :: lscale(*), rscale(*), work(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sclfac = 1.0e+1_dp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icab, iflow, ip1, ir, irab, it, j, jc, jp1, k, kount, l, lcab, lm1, &
+                     lrab, lsfmax, lsfmin, m, nr, nrp2
+           real(dp) :: alpha, basl, beta, cab, cmax, coef, coef2, coef5, cor, ew, ewc, gamma, &
+                     pgamma, rab, sfmax, sfmin, sum, t, ta, tb, tc
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log10,max,min,sign
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.stdlib_lsame( job, 'P' ) &
+                     .and..not.stdlib_lsame( job, 'S' ) .and. .not.stdlib_lsame( job, 'B' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGBAL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              ilo = 1
+              ihi = n
+              return
+           end if
+           if( n==1 ) then
+              ilo = 1
+              ihi = n
+              lscale( 1 ) = one
+              rscale( 1 ) = one
+              return
+           end if
+           if( stdlib_lsame( job, 'N' ) ) then
+              ilo = 1
+              ihi = n
+              do i = 1, n
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           k = 1
+           l = n
+           if( stdlib_lsame( job, 'S' ) )go to 190
+           go to 30
+           ! permute the matrices a and b to isolate the eigenvalues.
+           ! find row with one nonzero in columns 1 through l
+           20 continue
+           l = lm1
+           if( l/=1 )go to 30
+           rscale( 1 ) = 1
+           lscale( 1 ) = 1
+           go to 190
+           30 continue
+           lm1 = l - 1
+           loop_80: do i = l, 1, -1
+              do j = 1, lm1
+                 jp1 = j + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 50
+              end do
+              j = l
+              go to 70
+              50 continue
+              do j = jp1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_80
+              end do
+              j = jp1 - 1
+              70 continue
+              m = l
+              iflow = 1
+              go to 160
+           end do loop_80
+           go to 100
+           ! find column with one nonzero in rows k through n
+           90 continue
+           k = k + 1
+           100 continue
+           loop_150: do j = k, l
+              do i = k, lm1
+                 ip1 = i + 1
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )go to 120
+              end do
+              i = l
+              go to 140
+              120 continue
+              do i = ip1, l
+                 if( a( i, j )/=czero .or. b( i, j )/=czero )cycle loop_150
+              end do
+              i = ip1 - 1
+              140 continue
+              m = k
+              iflow = 2
+              go to 160
+           end do loop_150
+           go to 190
+           ! permute rows m and i
+           160 continue
+           lscale( m ) = i
+           if( i==m )go to 170
+           call stdlib_zswap( n-k+1, a( i, k ), lda, a( m, k ), lda )
+           call stdlib_zswap( n-k+1, b( i, k ), ldb, b( m, k ), ldb )
+           ! permute columns m and j
+           170 continue
+           rscale( m ) = j
+           if( j==m )go to 180
+           call stdlib_zswap( l, a( 1, j ), 1, a( 1, m ), 1 )
+           call stdlib_zswap( l, b( 1, j ), 1, b( 1, m ), 1 )
+           180 continue
+           go to ( 20, 90 )iflow
+           190 continue
+           ilo = k
+           ihi = l
+           if( stdlib_lsame( job, 'P' ) ) then
+              do i = ilo, ihi
+                 lscale( i ) = one
+                 rscale( i ) = one
+              end do
+              return
+           end if
+           if( ilo==ihi )return
+           ! balance the submatrix in rows ilo to ihi.
+           nr = ihi - ilo + 1
+           do i = ilo, ihi
+              rscale( i ) = zero
+              lscale( i ) = zero
+              work( i ) = zero
+              work( i+n ) = zero
+              work( i+2*n ) = zero
+              work( i+3*n ) = zero
+              work( i+4*n ) = zero
+              work( i+5*n ) = zero
+           end do
+           ! compute right side vector in resulting linear equations
+           basl = log10( sclfac )
+           do i = ilo, ihi
+              do j = ilo, ihi
+                 if( a( i, j )==czero ) then
+                    ta = zero
+                    go to 210
+                 end if
+                 ta = log10( cabs1( a( i, j ) ) ) / basl
+                 210 continue
+                 if( b( i, j )==czero ) then
+                    tb = zero
+                    go to 220
+                 end if
+                 tb = log10( cabs1( b( i, j ) ) ) / basl
+                 220 continue
+                 work( i+4*n ) = work( i+4*n ) - ta - tb
+                 work( j+5*n ) = work( j+5*n ) - ta - tb
+              end do
+           end do
+           coef = one / real( 2*nr,KIND=dp)
+           coef2 = coef*coef
+           coef5 = half*coef2
+           nrp2 = nr + 2
+           beta = zero
+           it = 1
+           ! start generalized conjugate gradient iteration
+           250 continue
+           gamma = stdlib_ddot( nr, work( ilo+4*n ), 1, work( ilo+4*n ), 1 ) +stdlib_ddot( nr, &
+                     work( ilo+5*n ), 1, work( ilo+5*n ), 1 )
+           ew = zero
+           ewc = zero
+           do i = ilo, ihi
+              ew = ew + work( i+4*n )
+              ewc = ewc + work( i+5*n )
+           end do
+           gamma = coef*gamma - coef2*( ew**2+ewc**2 ) - coef5*( ew-ewc )**2
+           if( gamma==zero )go to 350
+           if( it/=1 )beta = gamma / pgamma
+           t = coef5*( ewc-three*ew )
+           tc = coef5*( ew-three*ewc )
+           call stdlib_dscal( nr, beta, work( ilo ), 1 )
+           call stdlib_dscal( nr, beta, work( ilo+n ), 1 )
+           call stdlib_daxpy( nr, coef, work( ilo+4*n ), 1, work( ilo+n ), 1 )
+           call stdlib_daxpy( nr, coef, work( ilo+5*n ), 1, work( ilo ), 1 )
+           do i = ilo, ihi
+              work( i ) = work( i ) + tc
+              work( i+n ) = work( i+n ) + t
+           end do
+           ! apply matrix to vector
+           do i = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_290: do j = ilo, ihi
+                 if( a( i, j )==czero )go to 280
+                 kount = kount + 1
+                 sum = sum + work( j )
+                 280 continue
+                 if( b( i, j )==czero )cycle loop_290
+                 kount = kount + 1
+                 sum = sum + work( j )
+              end do loop_290
+              work( i+2*n ) = real( kount,KIND=dp)*work( i+n ) + sum
+           end do
+           do j = ilo, ihi
+              kount = 0
+              sum = zero
+              loop_320: do i = ilo, ihi
+                 if( a( i, j )==czero )go to 310
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+                 310 continue
+                 if( b( i, j )==czero )cycle loop_320
+                 kount = kount + 1
+                 sum = sum + work( i+n )
+              end do loop_320
+              work( j+3*n ) = real( kount,KIND=dp)*work( j ) + sum
+           end do
+           sum = stdlib_ddot( nr, work( ilo+n ), 1, work( ilo+2*n ), 1 ) +stdlib_ddot( nr, work( &
+                     ilo ), 1, work( ilo+3*n ), 1 )
+           alpha = gamma / sum
+           ! determine correction to current iteration
+           cmax = zero
+           do i = ilo, ihi
+              cor = alpha*work( i+n )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              lscale( i ) = lscale( i ) + cor
+              cor = alpha*work( i )
+              if( abs( cor )>cmax )cmax = abs( cor )
+              rscale( i ) = rscale( i ) + cor
+           end do
+           if( cmax<half )go to 350
+           call stdlib_daxpy( nr, -alpha, work( ilo+2*n ), 1, work( ilo+4*n ), 1 )
+           call stdlib_daxpy( nr, -alpha, work( ilo+3*n ), 1, work( ilo+5*n ), 1 )
+           pgamma = gamma
+           it = it + 1
+           if( it<=nrp2 )go to 250
+           ! end generalized conjugate gradient iteration
+           350 continue
+           sfmin = stdlib_dlamch( 'S' )
+           sfmax = one / sfmin
+           lsfmin = int( log10( sfmin ) / basl+one,KIND=ilp)
+           lsfmax = int( log10( sfmax ) / basl,KIND=ilp)
+           do i = ilo, ihi
+              irab = stdlib_izamax( n-ilo+1, a( i, ilo ), lda )
+              rab = abs( a( i, irab+ilo-1 ) )
+              irab = stdlib_izamax( n-ilo+1, b( i, ilo ), ldb )
+              rab = max( rab, abs( b( i, irab+ilo-1 ) ) )
+              lrab = int( log10( rab+sfmin ) / basl+one,KIND=ilp)
+              ir = int(lscale( i ) + sign( half, lscale( i ) ),KIND=ilp)
+              ir = min( max( ir, lsfmin ), lsfmax, lsfmax-lrab )
+              lscale( i ) = sclfac**ir
+              icab = stdlib_izamax( ihi, a( 1, i ), 1 )
+              cab = abs( a( icab, i ) )
+              icab = stdlib_izamax( ihi, b( 1, i ), 1 )
+              cab = max( cab, abs( b( icab, i ) ) )
+              lcab = int( log10( cab+sfmin ) / basl+one,KIND=ilp)
+              jc = int(rscale( i ) + sign( half, rscale( i ) ),KIND=ilp)
+              jc = min( max( jc, lsfmin ), lsfmax, lsfmax-lcab )
+              rscale( i ) = sclfac**jc
+           end do
+           ! row scaling of matrices a and b
+           do i = ilo, ihi
+              call stdlib_zdscal( n-ilo+1, lscale( i ), a( i, ilo ), lda )
+              call stdlib_zdscal( n-ilo+1, lscale( i ), b( i, ilo ), ldb )
+           end do
+           ! column scaling of matrices a and b
+           do j = ilo, ihi
+              call stdlib_zdscal( ihi, rscale( j ), a( 1, j ), 1 )
+              call stdlib_zdscal( ihi, rscale( j ), b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_zggbal
+
+     !> ZGTSV:  solves the equation
+     !> A*X = B,
+     !> where A is an N-by-N tridiagonal matrix, by Gaussian elimination with
+     !> partial pivoting.
+     !> Note that the equation  A**T *X = B  may be solved by interchanging the
+     !> order of the arguments DU and DL.
+
+     pure subroutine stdlib_zgtsv( n, nrhs, dl, d, du, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: b(ldb,*), d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k
+           complex(dp) :: mult, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTSV ', -info )
+              return
+           end if
+           if( n==0 )return
+           loop_30: do k = 1, n - 1
+              if( dl( k )==czero ) then
+                 ! subdiagonal is czero, no elimination is required.
+                 if( d( k )==czero ) then
+                    ! diagonal is czero: set info = k and return; a unique
+                    ! solution can not be found.
+                    info = k
+                    return
+                 end if
+              else if( cabs1( d( k ) )>=cabs1( dl( k ) ) ) then
+                 ! no row interchange required
+                 mult = dl( k ) / d( k )
+                 d( k+1 ) = d( k+1 ) - mult*du( k )
+                 do j = 1, nrhs
+                    b( k+1, j ) = b( k+1, j ) - mult*b( k, j )
+                 end do
+                 if( k<( n-1 ) )dl( k ) = czero
+              else
+                 ! interchange rows k and k+1
+                 mult = d( k ) / dl( k )
+                 d( k ) = dl( k )
+                 temp = d( k+1 )
+                 d( k+1 ) = du( k ) - mult*temp
+                 if( k<( n-1 ) ) then
+                    dl( k ) = du( k+1 )
+                    du( k+1 ) = -mult*dl( k )
+                 end if
+                 du( k ) = temp
+                 do j = 1, nrhs
+                    temp = b( k, j )
+                    b( k, j ) = b( k+1, j )
+                    b( k+1, j ) = temp - mult*b( k+1, j )
+                 end do
+              end if
+           end do loop_30
+           if( d( n )==czero ) then
+              info = n
+              return
+           end if
+           ! back solve with the matrix u from the factorization.
+           do j = 1, nrhs
+              b( n, j ) = b( n, j ) / d( n )
+              if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) / d( n-1 )
+              do k = n - 2, 1, -1
+                 b( k, j ) = ( b( k, j )-du( k )*b( k+1, j )-dl( k )*b( k+2, j ) ) / d( k )
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_zgtsv
+
+     !> ZGTTRF: computes an LU factorization of a complex tridiagonal matrix A
+     !> using elimination with partial pivoting and row interchanges.
+     !> The factorization has the form
+     !> A = L * U
+     !> where L is a product of permutation and unit lower bidiagonal
+     !> matrices and U is upper triangular with nonzeros in only the main
+     !> diagonal and first two superdiagonals.
+
+     pure subroutine stdlib_zgttrf( n, dl, d, du, du2, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: d(*), dl(*), du(*)
+           complex(dp), intent(out) :: du2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: fact, temp, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'ZGTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize ipiv(i) = i and du2(i) = 0
+           do i = 1, n
+              ipiv( i ) = i
+           end do
+           do i = 1, n - 2
+              du2( i ) = zero
+           end do
+           do i = 1, n - 2
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 ! no row interchange required, eliminate dl(i)
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 ! interchange rows i and i+1, eliminate dl(i)
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 du2( i ) = du( i+1 )
+                 du( i+1 ) = -fact*du( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end do
+           if( n>1 ) then
+              i = n - 1
+              if( cabs1( d( i ) )>=cabs1( dl( i ) ) ) then
+                 if( cabs1( d( i ) )/=zero ) then
+                    fact = dl( i ) / d( i )
+                    dl( i ) = fact
+                    d( i+1 ) = d( i+1 ) - fact*du( i )
+                 end if
+              else
+                 fact = d( i ) / dl( i )
+                 d( i ) = dl( i )
+                 dl( i ) = fact
+                 temp = du( i )
+                 du( i ) = d( i+1 )
+                 d( i+1 ) = temp - fact*d( i+1 )
+                 ipiv( i ) = i + 1
+              end if
+           end if
+           ! check for a zero on the diagonal of u.
+           do i = 1, n
+              if( cabs1( d( i ) )==zero ) then
+                 info = i
+                 go to 50
+              end if
+           end do
+           50 continue
+           return
+     end subroutine stdlib_zgttrf
+
+     !> ZGTTS2: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by ZGTTRF.
+
+     pure subroutine stdlib_zgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: itrans, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           complex(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( itrans==0 ) then
+              ! solve a*x = b using the lu factorization of a,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=1 ) then
+                 j = 1
+                 10 continue
+                 ! solve l*x = b.
+                 do i = 1, n - 1
+                    if( ipiv( i )==i ) then
+                       b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                    else
+                       temp = b( i, j )
+                       b( i, j ) = b( i+1, j )
+                       b( i+1, j ) = temp - dl( i )*b( i, j )
+                    end if
+                 end do
+                 ! solve u*x = b.
+                 b( n, j ) = b( n, j ) / d( n )
+                 if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                 do i = n - 2, 1, -1
+                    b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                              
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve l*x = b.
+                    do i = 1, n - 1
+                       if( ipiv( i )==i ) then
+                          b( i+1, j ) = b( i+1, j ) - dl( i )*b( i, j )
+                       else
+                          temp = b( i, j )
+                          b( i, j ) = b( i+1, j )
+                          b( i+1, j ) = temp - dl( i )*b( i, j )
+                       end if
+                    end do
+                 ! solve u*x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    if( n>1 )b( n-1, j ) = ( b( n-1, j )-du( n-1 )*b( n, j ) ) /d( n-1 )
+                    do i = n - 2, 1, -1
+                       b( i, j ) = ( b( i, j )-du( i )*b( i+1, j )-du2( i )*b( i+2, j ) ) / d( i )
+                                 
+                    end do
+                 end do
+              end if
+           else if( itrans==1 ) then
+              ! solve a**t * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 70 continue
+                 ! solve u**t * x = b.
+                 b( 1, j ) = b( 1, j ) / d( 1 )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d( i &
+                              )
+                 end do
+                 ! solve l**t * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - dl( i )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 70
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**t * x = b.
+                    b( 1, j ) = b( 1, j ) / d( 1 )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-du( 1 )*b( 1, j ) ) / d( 2 )
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-du( i-1 )*b( i-1, j )-du2( i-2 )*b( i-2, j ) ) / d(&
+                                  i )
+                    end do
+                 ! solve l**t * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - dl( i )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - dl( i )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              if( nrhs<=1 ) then
+                 j = 1
+                 130 continue
+                 ! solve u**h * x = b.
+                 b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                 if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) ) /conjg( d( 2 ) )
+                           
+                 do i = 3, n
+                    b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )*b( &
+                              i-2, j ) ) /conjg( d( i ) )
+                 end do
+                 ! solve l**h * x = b.
+                 do i = n - 1, 1, -1
+                    if( ipiv( i )==i ) then
+                       b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                    else
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                       b( i, j ) = temp
+                    end if
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 130
+                 end if
+              else
+                 do j = 1, nrhs
+                 ! solve u**h * x = b.
+                    b( 1, j ) = b( 1, j ) / conjg( d( 1 ) )
+                    if( n>1 )b( 2, j ) = ( b( 2, j )-conjg( du( 1 ) )*b( 1, j ) )/ conjg( d( 2 ) )
+                              
+                    do i = 3, n
+                       b( i, j ) = ( b( i, j )-conjg( du( i-1 ) )*b( i-1, j )-conjg( du2( i-2 ) )&
+                                 *b( i-2, j ) ) / conjg( d( i ) )
+                    end do
+                 ! solve l**h * x = b.
+                    do i = n - 1, 1, -1
+                       if( ipiv( i )==i ) then
+                          b( i, j ) = b( i, j ) - conjg( dl( i ) )*b( i+1, j )
+                       else
+                          temp = b( i+1, j )
+                          b( i+1, j ) = b( i, j ) - conjg( dl( i ) )*temp
+                          b( i, j ) = temp
+                       end if
+                    end do
+                 end do
+              end if
+           end if
+     end subroutine stdlib_zgtts2
+
+     !> ZHESWAPR: applies an elementary permutation on the rows and the columns of
+     !> a hermitian matrix.
+
+     pure subroutine stdlib_zheswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(dp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_zswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=conjg(a(i1+i,i2))
+                 a(i1+i,i2)=conjg(tmp)
+              end do
+               a(i1,i2)=conjg(a(i1,i2))
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from 1 to i1-1
+              call stdlib_zswap ( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+                ! - swap a(i2,i1) and a(i1,i2)
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=conjg(a(i2,i1+i))
+                  a(i2,i1+i)=conjg(tmp)
+               end do
+               a(i2,i1)=conjg(a(i2,i1))
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_zheswapr
+
+     !> ZHETF2: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zhetf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(dp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(dp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=dp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=dp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( a( k, k ),KIND=dp)
+                    call stdlib_zher( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_zdscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_dlapy2( real( a( k-1, k ),KIND=dp),aimag( a( k-1, k ) ) )
+                                 
+                       d22 = real( a( k-1, k-1 ),KIND=dp) / d
+                       d11 = real( a( k, k ),KIND=dp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = a( k-1, k ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = d*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k-1 )*conjg( &
+                                       wkm1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              50 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 90
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( a( imax, imax ),KIND=dp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=dp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( a( k, k ),KIND=dp)
+                       call stdlib_zher( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_zdscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_dlapy2( real( a( k+1, k ),KIND=dp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=dp) / d
+                       d22 = real( a( k, k ),KIND=dp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = a( k+1, k ) / d
+                       d = tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = d*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*conjg( wk ) -a( i, k+1 )*conjg( &
+                                       wkp1 )
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 50
+           end if
+           90 continue
+           return
+     end subroutine stdlib_zhetf2
+
+     !> ZHETF2_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_zhetf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*)
+        ! ======================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(dp) :: absakk, alpha, colmax, d, d11, d22, r1, dtemp, rowmax, tt, sfmin
+           complex(dp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_zswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=dp)
+                    a( k, k ) = real( a( p, p ),KIND=dp)
+                    a( p, p ) = r1
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_zswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=dp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=dp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=dp)
+                          call stdlib_zher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_zdscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=dp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_dlapy2( real( a( k-1, k ),KIND=dp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=dp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=dp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_zswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=dp)
+                    a( k, k ) = real( a( p, p ),KIND=dp)
+                    a( p, p ) = r1
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_zswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=dp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=dp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=dp)
+                          call stdlib_zher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_zdscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=dp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_dlapy2( real( a( k+1, k ),KIND=dp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=dp) / d
+                       d22 = real( a( k, k ),KIND=dp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_zhetf2_rk
+
+     !> ZHETF2_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**H is the conjugate transpose of U, and D is
+     !> Hermitian and block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zhetf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! ======================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done, upper
+           integer(ilp) :: i, ii, imax, itemp, j, jmax, k, kk, kp, kstep, p
+           real(dp) :: absakk, alpha, colmax, d, d11, d22, r1, dtemp, rowmax, tt, sfmin
+           complex(dp) :: d12, d21, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the leading submatrix a(1:k,1:k)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p>1 )call stdlib_zswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = p + 1, k - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=dp)
+                    a( k, k ) = real( a( p, p ),KIND=dp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the leading submatrix a(1:k,1:k)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = kp + 1, kk - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       ! (5) swap row elements
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k-1, k-1 ) = real( a( k-1, k-1 ),KIND=dp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( abs( real( a( k, k ),KIND=dp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=dp)
+                          call stdlib_zher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_zdscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=dp)
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zher( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       ! d = |a12|
+                       d = stdlib_dlapy2( real( a( k-1, k ),KIND=dp),aimag( a( k-1, k ) ) )
+                                 
+                       d11 = real( a( k, k ) / d,KIND=dp)
+                       d22 = real( a( k-1, k-1 ) / d,KIND=dp)
+                       d12 = a( k-1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k - 2, 1, -1
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wkm1 = tt*( d11*a( j, k-1 )-conjg( d12 )*a( j, k ) )
+                          wk = tt*( d22*a( j, k )-d12*a( j, k-1 ) )
+                          ! perform a rank-2 update of a(1:k-2,1:k-2)
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k-1 ) &
+                                       / d )*conjg( wkm1 )
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k-1 ) = wkm1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( a( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( a( imax, imax ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 42
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! for only a 2x2 pivot, interchange rows and columns k and p
+                 ! in the trailing submatrix a(k:n,k:n)
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! (1) swap columnar parts
+                    if( p<n )call stdlib_zswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! (2) swap and conjugate middle parts
+                    do j = k + 1, p - 1
+                       t = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( p, j ) )
+                       a( p, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( p, k ) = conjg( a( p, k ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( k, k ),KIND=dp)
+                    a( k, k ) = real( a( p, p ),KIND=dp)
+                    a( p, p ) = r1
+                 end if
+                 ! for both 1x1 and 2x2 pivots, interchange rows and
+                 ! columns kk and kp in the trailing submatrix a(k:n,k:n)
+                 if( kp/=kk ) then
+                    ! (1) swap columnar parts
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! (2) swap and conjugate middle parts
+                    do j = kk + 1, kp - 1
+                       t = conjg( a( j, kk ) )
+                       a( j, kk ) = conjg( a( kp, j ) )
+                       a( kp, j ) = t
+                    end do
+                    ! (3) swap and conjugate corner elements at row-col interserction
+                    a( kp, kk ) = conjg( a( kp, kk ) )
+                    ! (4) swap diagonal elements at row-col intersection
+                    r1 = real( a( kk, kk ),KIND=dp)
+                    a( kk, kk ) = real( a( kp, kp ),KIND=dp)
+                    a( kp, kp ) = r1
+                    if( kstep==2 ) then
+                       ! (*) make sure that diagonal element of pivot is real
+                       a( k, k ) = real( a( k, k ),KIND=dp)
+                       ! (5) swap row elements
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 else
+                    ! (*) make sure that diagonal element of pivot is real
+                    a( k, k ) = real( a( k, k ),KIND=dp)
+                    if( kstep==2 )a( k+1, k+1 ) = real( a( k+1, k+1 ),KIND=dp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of a now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                       ! store l(k) in column k
+                       ! handle division by a small number
+                       if( abs( real( a( k, k ),KIND=dp) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = one / real( a( k, k ),KIND=dp)
+                          call stdlib_zher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_zdscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = real( a( k, k ),KIND=dp)
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zher( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       ! d = |a21|
+                       d = stdlib_dlapy2( real( a( k+1, k ),KIND=dp),aimag( a( k+1, k ) ) )
+                                 
+                       d11 = real( a( k+1, k+1 ),KIND=dp) / d
+                       d22 = real( a( k, k ),KIND=dp) / d
+                       d21 = a( k+1, k ) / d
+                       tt = one / ( d11*d22-one )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = tt*( d11*a( j, k )-d21*a( j, k+1 ) )
+                          wkp1 = tt*( d22*a( j, k+1 )-conjg( d21 )*a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) -( a( i, k ) / d )*conjg( wk ) -( a( i, k+1 ) &
+                                       / d )*conjg( wkp1 )
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d
+                          a( j, k+1 ) = wkp1 / d
+                          ! (*) make sure that diagonal element of pivot is real
+                          a( j, j ) = cmplx( real( a( j, j ),KIND=dp), zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_zhetf2_rook
+
+     !> ZHETRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> ZHETRF.
+
+     pure subroutine stdlib_zhetri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(dp) :: ak, akp1, d, t
+           complex(dp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=dp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=dp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=dp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_zdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_zcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_zdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=dp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 do j = kp + 1, k - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=dp) / t
+                 akp1 = real( a( k, k ),KIND=dp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=dp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_zdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_zcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_zdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=dp)
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 do j = k + 1, kp - 1
+                    temp = conjg( a( j, k ) )
+                    a( j, k ) = conjg( a( kp, j ) )
+                    a( kp, j ) = temp
+                 end do
+                 a( kp, k ) = conjg( a( kp, k ) )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_zhetri
+
+     !> ZHETRI_ROOK: computes the inverse of a complex Hermitian indefinite matrix
+     !> A using the factorization A = U*D*U**H or A = L*D*L**H computed by
+     !> ZHETRF_ROOK.
+
+     pure subroutine stdlib_zhetri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp, kstep
+           real(dp) :: ak, akp1, d, t
+           complex(dp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 70
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k+1 ) )
+                 ak = real( a( k, k ),KIND=dp) / t
+                 akp1 = real( a( k+1, k+1 ),KIND=dp) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( k-1, work, 1, a( 1,k ), 1 ),&
+                              KIND=dp)
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_zdotc( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_zcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -real( stdlib_zdotc( k-1, work, 1, a( 1, k+1 ),&
+                              1 ),KIND=dp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k,1:k)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1) in the leading submatrix a(k+1:n,k+1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k+1 and -ipiv(k+1)
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    do j = kp + 1, k - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              70 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              80 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 120
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = one / real( a( k, k ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( a( k, k-1 ) )
+                 ak = real( a( k-1, k-1 ),KIND=dp) / t
+                 akp1 = real( a( k, k ),KIND=dp) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - real( stdlib_zdotc( n-k, work, 1,a( k+1, k ), 1 ),&
+                              KIND=dp)
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_zdotc( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_zcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_zhemv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work,1, czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -real( stdlib_zdotc( n-k, work, 1, a( k+1, k-1 )&
+                              ,1 ),KIND=dp)
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k:n,k:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 ! (1) interchange rows and columns k and -ipiv(k)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 ! (2) interchange rows and columns k-1 and -ipiv(k-1)
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    do j = k + 1, kp - 1
+                       temp = conjg( a( j, k ) )
+                       a( j, k ) = conjg( a( kp, j ) )
+                       a( kp, j ) = temp
+                    end do
+                    a( kp, k ) = conjg( a( kp, k ) )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 80
+              120 continue
+           end if
+           return
+     end subroutine stdlib_zhetri_rook
+
+     !> ZHETRS_3: solves a system of linear equations A * X = B with a complex
+     !> Hermitian matrix A using the factorization computed
+     !> by ZHETRF_RK or ZHETRF_BK:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_zhetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), e(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           real(dp) :: s
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_ztrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=dp) / real( a( i, i ),KIND=dp)
+                    call stdlib_zdscal( nrhs, s, b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / conjg( akm1k )
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / conjg( akm1k )
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_ztrsm( 'L', 'U', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**h.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_ztrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    s = real( cone,KIND=dp) / real( a( i, i ),KIND=dp)
+                    call stdlib_zdscal( nrhs, s, b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / conjg( akm1k )
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / conjg( akm1k )
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+              call stdlib_ztrsm('L', 'L', 'C', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_zhetrs_3
+
+     !> Level 3 BLAS like routine for C in RFP Format.
+     !> ZHFRK: performs one of the Hermitian rank--k operations
+     !> C := alpha*A*A**H + beta*C,
+     !> or
+     !> C := alpha*A**H*A + beta*C,
+     !> where alpha and beta are real scalars, C is an n--by--n Hermitian
+     !> matrix and A is an n--by--k matrix in the first case and a k--by--n
+     !> matrix in the second case.
+
+     pure subroutine stdlib_zhfrk( transr, uplo, trans, n, k, alpha, a, lda, beta,c )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: k, lda, n
+           character, intent(in) :: trans, transr, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: c(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr, nisodd, notrans
+           integer(ilp) :: info, nrowa, j, nk, n1, n2
+           complex(dp) :: calpha, cbeta
+           ! Intrinsic Functions 
+           intrinsic :: max,cmplx
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( notrans ) then
+              nrowa = n
+           else
+              nrowa = k
+           end if
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( lda<max( 1, nrowa ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHFRK ', -info )
+              return
+           end if
+           ! quick return if possible.
+           ! the quick return case: ((alpha==0).and.(beta/=zero)) is not
+           ! done (it is in stdlib_zherk for example) and left in the general case.
+           if( ( n==0 ) .or. ( ( ( alpha==zero ) .or. ( k==0 ) ) .and.( beta==one ) ) )&
+                     return
+           if( ( alpha==zero ) .and. ( beta==zero ) ) then
+              do j = 1, ( ( n*( n+1 ) ) / 2 )
+                 c( j ) = czero
+              end do
+              return
+           end if
+           calpha = cmplx( alpha, zero,KIND=dp)
+           cbeta = cmplx( beta, zero,KIND=dp)
+           ! c is n-by-n.
+           ! if n is odd, set nisodd = .true., and n1 and n2.
+           ! if n is even, nisodd = .false., and nk.
+           if( mod( n, 2 )==0 ) then
+              nisodd = .false.
+              nk = n / 2
+           else
+              nisodd = .true.
+              if( lower ) then
+                 n2 = n / 2
+                 n1 = n - n2
+              else
+                 n1 = n / 2
+                 n2 = n - n1
+              end if
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! n is odd, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_zherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_zherk( 'U', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_zgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_zherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n )
+                                 
+                       call stdlib_zherk( 'U', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( n+1 )&
+                                 , n )
+                       call stdlib_zgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( n1+1 ), n )
+                    end if
+                 else
+                    ! n is odd, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_zherk( 'L', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_zherk( 'U', 'N', n2, k, alpha, a( n2, 1 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_zgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n2, 1 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    else
+                       ! n is odd, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_zherk( 'L', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2+1 ), &
+                                 n )
+                       call stdlib_zherk( 'U', 'C', n2, k, alpha, a( 1, n2 ), lda,beta, c( n1+1 ),&
+                                  n )
+                       call stdlib_zgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n2 ), &
+                                 lda, cbeta, c( 1 ), n )
+                    end if
+                 end if
+              else
+                 ! n is odd, and transr = 'c'
+                 if( lower ) then
+                    ! n is odd, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_zherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_zherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_zgemm( 'N', 'C', n1, n2, k, calpha, a( 1, 1 ),lda, a( n1+1, 1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_zherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( 1 ), n1 &
+                                 )
+                       call stdlib_zherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( 2 ), &
+                                 n1 )
+                       call stdlib_zgemm( 'C', 'N', n1, n2, k, calpha, a( 1, 1 ),lda, a( 1, n1+1 )&
+                                 , lda, cbeta,c( n1*n1+1 ), n1 )
+                    end if
+                 else
+                    ! n is odd, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_zherk( 'U', 'N', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_zherk( 'L', 'N', n2, k, alpha, a( n1+1, 1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_zgemm( 'N', 'C', n2, n1, k, calpha, a( n1+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    else
+                       ! n is odd, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_zherk( 'U', 'C', n1, k, alpha, a( 1, 1 ), lda,beta, c( n2*n2+1 &
+                                 ), n2 )
+                       call stdlib_zherk( 'L', 'C', n2, k, alpha, a( 1, n1+1 ), lda,beta, c( &
+                                 n1*n2+1 ), n2 )
+                       call stdlib_zgemm( 'C', 'N', n2, n1, k, calpha, a( 1, n1+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), n2 )
+                    end if
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! n is even, transr = 'n', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'n'
+                       call stdlib_zherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_zherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_zgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'l', and trans = 'c'
+                       call stdlib_zherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( 2 ), n+&
+                                 1 )
+                       call stdlib_zherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 n+1 )
+                       call stdlib_zgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( nk+2 ),n+1 )
+                    end if
+                 else
+                    ! n is even, transr = 'n', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'n'
+                       call stdlib_zherk( 'L', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_zherk( 'U', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_zgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    else
+                       ! n is even, transr = 'n', uplo = 'u', and trans = 'c'
+                       call stdlib_zherk( 'L', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+2 ), &
+                                 n+1 )
+                       call stdlib_zherk( 'U', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( nk+1 &
+                                 ), n+1 )
+                       call stdlib_zgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta, c( 1 ),n+1 )
+                    end if
+                 end if
+              else
+                 ! n is even, and transr = 'c'
+                 if( lower ) then
+                    ! n is even, transr = 'c', and uplo = 'l'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'n'
+                       call stdlib_zherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_zherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_zgemm( 'N', 'C', nk, nk, k, calpha, a( 1, 1 ),lda, a( nk+1, 1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'l', and trans = 'c'
+                       call stdlib_zherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk+1 ), &
+                                 nk )
+                       call stdlib_zherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( 1 ), &
+                                 nk )
+                       call stdlib_zgemm( 'C', 'N', nk, nk, k, calpha, a( 1, 1 ),lda, a( 1, nk+1 )&
+                                 , lda, cbeta,c( ( ( nk+1 )*nk )+1 ), nk )
+                    end if
+                 else
+                    ! n is even, transr = 'c', and uplo = 'u'
+                    if( notrans ) then
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'n'
+                       call stdlib_zherk( 'U', 'N', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_zherk( 'L', 'N', nk, k, alpha, a( nk+1, 1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_zgemm( 'N', 'C', nk, nk, k, calpha, a( nk+1, 1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    else
+                       ! n is even, transr = 'c', uplo = 'u', and trans = 'c'
+                       call stdlib_zherk( 'U', 'C', nk, k, alpha, a( 1, 1 ), lda,beta, c( nk*( nk+&
+                                 1 )+1 ), nk )
+                       call stdlib_zherk( 'L', 'C', nk, k, alpha, a( 1, nk+1 ), lda,beta, c( &
+                                 nk*nk+1 ), nk )
+                       call stdlib_zgemm( 'C', 'N', nk, nk, k, calpha, a( 1, nk+1 ),lda, a( 1, 1 )&
+                                 , lda, cbeta, c( 1 ), nk )
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zhfrk
+
+     !> ZHPGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form, using packed storage.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by ZPPTRF.
+
+     pure subroutine stdlib_zhpgst( itype, uplo, n, ap, bp, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(in) :: bp(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, j1, j1j1, jj, k, k1, k1k1, kk
+           real(dp) :: ajj, akk, bjj, bkk
+           complex(dp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGST', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 ! j1 and jj are the indices of a(1,j) and a(j,j)
+                 jj = 0
+                 do j = 1, n
+                    j1 = jj + 1
+                    jj = jj + j
+                    ! compute the j-th column of the upper triangle of a
+                    ap( jj ) = real( ap( jj ),KIND=dp)
+                    bjj = real( bp( jj ),KIND=dp)
+                    call stdlib_ztpsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', j,bp, ap( j1 ), 1 &
+                              )
+                    call stdlib_zhpmv( uplo, j-1, -cone, ap, bp( j1 ), 1, cone,ap( j1 ), 1 )
+                              
+                    call stdlib_zdscal( j-1, one / bjj, ap( j1 ), 1 )
+                    ap( jj ) = ( ap( jj )-stdlib_zdotc( j-1, ap( j1 ), 1, bp( j1 ),1 ) ) / &
+                              bjj
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 ! kk and k1k1 are the indices of a(k,k) and a(k+1,k+1)
+                 kk = 1
+                 do k = 1, n
+                    k1k1 = kk + n - k + 1
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( ap( kk ),KIND=dp)
+                    bkk = real( bp( kk ),KIND=dp)
+                    akk = akk / bkk**2
+                    ap( kk ) = akk
+                    if( k<n ) then
+                       call stdlib_zdscal( n-k, one / bkk, ap( kk+1 ), 1 )
+                       ct = -half*akk
+                       call stdlib_zaxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_zhpr2( uplo, n-k, -cone, ap( kk+1 ), 1,bp( kk+1 ), 1, ap( k1k1 &
+                                 ) )
+                       call stdlib_zaxpy( n-k, ct, bp( kk+1 ), 1, ap( kk+1 ), 1 )
+                       call stdlib_ztpsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,bp( k1k1 ), ap( &
+                                 kk+1 ), 1 )
+                    end if
+                    kk = k1k1
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 ! k1 and kk are the indices of a(1,k) and a(k,k)
+                 kk = 0
+                 do k = 1, n
+                    k1 = kk + 1
+                    kk = kk + k
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( ap( kk ),KIND=dp)
+                    bkk = real( bp( kk ),KIND=dp)
+                    call stdlib_ztpmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, bp,ap( k1 ), 1 )
+                              
+                    ct = half*akk
+                    call stdlib_zaxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_zhpr2( uplo, k-1, cone, ap( k1 ), 1, bp( k1 ), 1,ap )
+                    call stdlib_zaxpy( k-1, ct, bp( k1 ), 1, ap( k1 ), 1 )
+                    call stdlib_zdscal( k-1, bkk, ap( k1 ), 1 )
+                    ap( kk ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 ! jj and j1j1 are the indices of a(j,j) and a(j+1,j+1)
+                 jj = 1
+                 do j = 1, n
+                    j1j1 = jj + n - j + 1
+                    ! compute the j-th column of the lower triangle of a
+                    ajj = real( ap( jj ),KIND=dp)
+                    bjj = real( bp( jj ),KIND=dp)
+                    ap( jj ) = ajj*bjj + stdlib_zdotc( n-j, ap( jj+1 ), 1,bp( jj+1 ), 1 )
+                    call stdlib_zdscal( n-j, bjj, ap( jj+1 ), 1 )
+                    call stdlib_zhpmv( uplo, n-j, cone, ap( j1j1 ), bp( jj+1 ), 1,cone, ap( jj+1 )&
+                              , 1 )
+                    call stdlib_ztpmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j+1, bp( jj ), &
+                              ap( jj ), 1 )
+                    jj = j1j1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zhpgst
+
+     !> ZHPTRF: computes the factorization of a complex Hermitian packed
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_zhptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(dp) :: absakk, alpha, colmax, d, d11, d22, r1, rowmax, tt
+           complex(dp) :: d12, d21, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc+k-1 ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=dp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc+imax-1 ),KIND=dp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_zswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = conjg( ap( knc+j-1 ) )
+                       ap( knc+j-1 ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( kx+kk-1 ) = conjg( ap( kx+kk-1 ) )
+                    r1 = real( ap( knc+kk-1 ),KIND=dp)
+                    ap( knc+kk-1 ) = real( ap( kpc+kp-1 ),KIND=dp)
+                    ap( kpc+kp-1 ) = r1
+                    if( kstep==2 ) then
+                       ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=dp)
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 else
+                    ap( kc+k-1 ) = real( ap( kc+k-1 ),KIND=dp)
+                    if( kstep==2 )ap( kc-1 ) = real( ap( kc-1 ),KIND=dp)
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**h = a - w(k)*1/d(k)*w(k)**h
+                    r1 = one / real( ap( kc+k-1 ),KIND=dp)
+                    call stdlib_zhpr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_zdscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**h
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**h
+                    if( k>2 ) then
+                       d = stdlib_dlapy2( real( ap( k-1+( k-1 )*k / 2 ),KIND=dp),aimag( ap( k-1+( &
+                                 k-1 )*k / 2 ) ) )
+                       d22 = real( ap( k-1+( k-2 )*( k-1 ) / 2 ),KIND=dp) / d
+                       d11 = real( ap( k+( k-1 )*k / 2 ),KIND=dp) / d
+                       tt = one / ( d11*d22-one )
+                       d12 = ap( k-1+( k-1 )*k / 2 ) / d
+                       d = tt / d
+                       do j = k - 2, 1, -1
+                          wkm1 = d*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-conjg( d12 )*ap( j+( k-1 )*k &
+                                    / 2 ) )
+                          wk = d*( d22*ap( j+( k-1 )*k / 2 )-d12*ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *conjg( wk ) -ap( i+( k-2 )*( k-1 ) / 2 )*conjg( wkm1 )
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                          ap( j+( j-1 )*j / 2 ) = cmplx( real( ap( j+( j-1 )*j / 2 ),KIND=dp), &
+                                    zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( ap( kc ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ap( kc ) = real( ap( kc ),KIND=dp)
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( abs( real( ap( kpc ),KIND=dp) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = conjg( ap( knc+j-kk ) )
+                       ap( knc+j-kk ) = conjg( ap( kx ) )
+                       ap( kx ) = t
+                    end do
+                    ap( knc+kp-kk ) = conjg( ap( knc+kp-kk ) )
+                    r1 = real( ap( knc ),KIND=dp)
+                    ap( knc ) = real( ap( kpc ),KIND=dp)
+                    ap( kpc ) = r1
+                    if( kstep==2 ) then
+                       ap( kc ) = real( ap( kc ),KIND=dp)
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 else
+                    ap( kc ) = real( ap( kc ),KIND=dp)
+                    if( kstep==2 )ap( knc ) = real( ap( knc ),KIND=dp)
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**h = a - w(k)*(1/d(k))*w(k)**h
+                       r1 = one / real( ap( kc ),KIND=dp)
+                       call stdlib_zhpr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_zdscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**h
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**h
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d = stdlib_dlapy2( real( ap( k+1+( k-1 )*( 2*n-k ) / 2 ),KIND=dp),aimag( &
+                                 ap( k+1+( k-1 )*( 2*n-k ) / 2 ) ) )
+                       d11 = real( ap( k+1+k*( 2*n-k-1 ) / 2 ),KIND=dp) / d
+                       d22 = real( ap( k+( k-1 )*( 2*n-k ) / 2 ),KIND=dp) / d
+                       tt = one / ( d11*d22-one )
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 ) / d
+                       d = tt / d
+                       do j = k + 2, n
+                          wk = d*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-d21*ap( j+k*( 2*n-k-1 ) / 2 )&
+                                     )
+                          wkp1 = d*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-conjg( d21 )*ap( j+( k-1 )*( &
+                                    2*n-k ) /2 ) )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                             i+( k-1 )*( 2*n-k ) /2 )*conjg( wk ) - ap( i+k*( 2*n-k-1 ) / 2 )&
+                                       *conjg( wkp1 )
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                          ap( j+( j-1 )*( 2*n-j ) / 2 )= cmplx( real( ap( j+( j-1 )*( 2*n-j ) / 2 &
+                                    ),KIND=dp),zero,KIND=dp)
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_zhptrf
+
+     !> ZHPTRI: computes the inverse of a complex Hermitian indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHPTRF.
+
+     pure subroutine stdlib_zhptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           real(dp) :: ak, akp1, d, t
+           complex(dp) :: akkp1, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = one / real( ap( kc+k-1 ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_zdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+k-1 ) )
+                 ak = real( ap( kc+k-1 ),KIND=dp) / t
+                 akp1 = real( ap( kcnext+k ),KIND=dp) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kc ), 1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -real( stdlib_zdotc( k-1, work, 1, ap( kc ), 1 ),&
+                              KIND=dp)
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_zdotc( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_zcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -real( stdlib_zdotc( k-1, work, 1, ap( kcnext &
+                              ),1 ),KIND=dp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_zswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = conjg( ap( kc+j-1 ) )
+                    ap( kc+j-1 ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-1 ) = conjg( ap( kc+kp-1 ) )
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**h.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = one / real( ap( kc ),KIND=dp)
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_zdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=dp)
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = abs( ap( kcnext+1 ) )
+                 ak = real( ap( kcnext ),KIND=dp) / t
+                 akp1 = real( ap( kc ),KIND=dp) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-one )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - real( stdlib_zdotc( n-k, work, 1,ap( kc+1 ), 1 ),&
+                              KIND=dp)
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_zdotc( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_zcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_zhpmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work,1, czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -real( stdlib_zdotc( n-k, work, 1, ap( kcnext+2 ),&
+                              1 ),KIND=dp)
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_zswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = conjg( ap( kc+j-k ) )
+                    ap( kc+j-k ) = conjg( ap( kx ) )
+                    ap( kx ) = temp
+                 end do
+                 ap( kc+kp-k ) = conjg( ap( kc+kp-k ) )
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_zhptri
+
+     !> ZLA_GBAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_zla_gbamv( trans, m, n, kl, ku, alpha, ab, ldab, x,incx, beta, y, incy )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, ldab, m, n, kl, ku, trans
+           ! Array Arguments 
+           complex(dp), intent(in) :: ab(ldab,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny, kd, ke
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( kl<0 .or. kl>m-1 ) then
+              info = 4
+           else if( ku<0 .or. ku>n-1 ) then
+              info = 5
+           else if( ldab<kl+ku+1 )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_GBAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           kd = ku + 1
+           ke = kl + 1
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( kd+i-j, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = max( i-kl, 1 ), min( i+ku, lenx )
+                          temp = cabs1( ab( ke-i+j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zla_gbamv
+
+     !> ZLA_GBRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(dp) function stdlib_zla_gbrpvgrw( n, kl, ku, ncols, ab,ldab, afb, ldafb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, kl, ku, ncols, ldab, ldafb
+           ! Array Arguments 
+           complex(dp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j, kd
+           real(dp) :: amax, umax, rpvgrw
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           kd = ku + 1
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = max( j-ku, 1 ), min( j+kl, n )
+                 amax = max( cabs1( ab( kd+i-j, j ) ), amax )
+              end do
+              do i = max( j-ku, 1 ), j
+                 umax = max( cabs1( afb( kd+i-j, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_zla_gbrpvgrw = rpvgrw
+     end function stdlib_zla_gbrpvgrw
+
+     !> ZLA_GEAMV:  performs one of the matrix-vector operations
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> or   y := alpha*abs(A)**T*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> m by n matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_zla_geamv( trans, m, n, alpha, a, lda, x, incx, beta,y, incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, m, n
+           integer(ilp), intent(in) :: trans
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky, lenx, leny
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,real,aimag,sign
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( .not.( ( trans==stdlib_ilatrans( 'N' ) ).or. ( trans==stdlib_ilatrans( 'T' ) )&
+                     .or. ( trans==stdlib_ilatrans( 'C' ) ) ) ) then
+              info = 1
+           else if( m<0 )then
+              info = 2
+           else if( n<0 )then
+              info = 3
+           else if( lda<max( 1, m ) )then
+              info = 6
+           else if( incx==0 )then
+              info = 8
+           else if( incy==0 )then
+              info = 11
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_GEAMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( m==0 ).or.( n==0 ).or.( ( alpha==czero ).and.( beta==cone ) ) )return
+           ! set  lenx  and  leny, the lengths of the vectors x and y, and set
+           ! up the start points in  x  and  y.
+           if( trans==stdlib_ilatrans( 'N' ) )then
+              lenx = n
+              leny = m
+           else
+              lenx = m
+              leny = n
+           end if
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( lenx - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( leny - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(m*n) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if( trans==stdlib_ilatrans( 'N' ) )then
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, leny
+                    if ( beta == czero ) then
+                       symb_zero = .true.
+                       y( iy ) = czero
+                    else if ( y( iy ) == czero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= czero ) then
+                       jx = kx
+                       do j = 1, lenx
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( jx ) == czero .or. temp == czero )
+                                    
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero ) y( iy ) =y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zla_geamv
+
+     !> ZLA_GERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     pure real(dp) function stdlib_zla_gerpvgrw( n, ncols, a, lda, af,ldaf )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, ncols, lda, ldaf
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: amax, umax, rpvgrw
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,min,abs,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           rpvgrw = one
+           do j = 1, ncols
+              amax = zero
+              umax = zero
+              do i = 1, n
+                 amax = max( cabs1( a( i, j ) ), amax )
+              end do
+              do i = 1, j
+                 umax = max( cabs1( af( i, j ) ), umax )
+              end do
+              if ( umax /= zero ) then
+                 rpvgrw = min( amax / umax, rpvgrw )
+              end if
+           end do
+           stdlib_zla_gerpvgrw = rpvgrw
+     end function stdlib_zla_gerpvgrw
+
+     !> ZLA_SYAMV  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_zla_heamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n, uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZHEMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if (.not.symb_zero)y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zla_heamv
+
+     !> ZLA_LIN_BERR: computes componentwise relative backward error from
+     !> the formula
+     !> max(i) ( abs(R(i)) / ( abs(op(A_s))*abs(Y) + abs(B_s) )(i) )
+     !> where abs(Z) is the componentwise absolute value of the matrix
+     !> or vector Z.
+
+     pure subroutine stdlib_zla_lin_berr( n, nz, nrhs, res, ayb, berr )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n, nz, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: ayb(n,nrhs)
+           real(dp), intent(out) :: berr(nrhs)
+           complex(dp), intent(in) :: res(n,nrhs)
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: tmp,safe1
+           integer(ilp) :: i, j
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           complex(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! adding safe1 to the numerator guards against spuriously zero
+           ! residuals.  a similar safeguard is in the cla_yyamv routine used
+           ! to compute ayb.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (nz+1)*safe1
+           do j = 1, nrhs
+              berr(j) = zero
+              do i = 1, n
+                 if (ayb(i,j) /= zero) then
+                    tmp = (safe1 + cabs1(res(i,j)))/ayb(i,j)
+                    berr(j) = max( berr(j), tmp )
+                 end if
+           ! if ayb is exactly 0.0_dp (and if computed by cla_yyamv), then we know
+           ! the true residual also must be exactly zero.
+              end do
+           end do
+     end subroutine stdlib_zla_lin_berr
+
+     !> ZLA_PORPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(dp) function stdlib_zla_porpvgrw( uplo, ncols, a, lda, af,ldaf, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: ncols, lda, ldaf
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: amax, umax, rpvgrw
+           logical(lk) :: upper
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           ! stdlib_dpotrf will have factored only the ncolsxncols leading minor, so
+           ! we restrict the growth search to that minor and use only the first
+           ! 2*ncols workspace entries.
+           rpvgrw = one
+           do i = 1, 2*ncols
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column.
+           if ( upper ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( ncols+j ) =max( cabs1( a( i, j ) ), work( ncols+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of the factor in
+           ! af.  no pivoting, so no permutations.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do j = 1, ncols
+                 do i = 1, j
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           else
+              do j = 1, ncols
+                 do i = j, ncols
+                    work( j ) = max( cabs1( af( i, j ) ), work( j ) )
+                 end do
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( stdlib_lsame( 'UPPER', uplo ) ) then
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( ncols+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_zla_porpvgrw = rpvgrw
+     end function stdlib_zla_porpvgrw
+
+     !> ZLA_SYAMV:  performs the matrix-vector operation
+     !> y := alpha*abs(A)*abs(x) + beta*abs(y),
+     !> where alpha and beta are scalars, x and y are vectors and A is an
+     !> n by n symmetric matrix.
+     !> This function is primarily used in calculating error bounds.
+     !> To protect against underflow during evaluation, components in
+     !> the resulting vector are perturbed away from zero by (N+1)
+     !> times the underflow threshold.  To prevent unnecessarily large
+     !> errors for block-structure embedded in general matrices,
+     !> "symbolically" zero components are not perturbed.  A zero
+     !> entry is considered "symbolic" if all multiplications involved
+     !> in computing that entry have at least one zero multiplicand.
+
+     subroutine stdlib_zla_syamv( uplo, n, alpha, a, lda, x, incx, beta, y,incy )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: alpha, beta
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           integer(ilp), intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           real(dp), intent(inout) :: y(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: symb_zero
+           real(dp) :: temp, safe1
+           integer(ilp) :: i, info, iy, j, jx, kx, ky
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: max,abs,sign,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if     ( uplo/=stdlib_ilauplo( 'U' ) .and.uplo/=stdlib_ilauplo( 'L' ) )then
+              info = 1
+           else if( n<0 )then
+              info = 2
+           else if( lda<max( 1, n ) )then
+              info = 5
+           else if( incx==0 )then
+              info = 7
+           else if( incy==0 )then
+              info = 10
+           end if
+           if( info/=0 )then
+              call stdlib_xerbla( 'ZLA_SYAMV', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ).or.( ( alpha==zero ).and.( beta==one ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 )then
+              kx = 1
+           else
+              kx = 1 - ( n - 1 )*incx
+           end if
+           if( incy>0 )then
+              ky = 1
+           else
+              ky = 1 - ( n - 1 )*incy
+           end if
+           ! set safe1 essentially to be the underflow threshold times the
+           ! number of additions in each row.
+           safe1 = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = (n+1)*safe1
+           ! form  y := alpha*abs(a)*abs(x) + beta*abs(y).
+           ! the o(n^2) symb_zero tests could be replaced by o(n) queries to
+           ! the inexact flag.  still doesn't help change the iteration order
+           ! to per-column.
+           iy = ky
+           if ( incx==1 ) then
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( j ) )*temp
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           else
+              if ( uplo == stdlib_ilauplo( 'U' ) ) then
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              else
+                 do i = 1, n
+                    if ( beta == zero ) then
+                       symb_zero = .true.
+                       y( iy ) = zero
+                    else if ( y( iy ) == zero ) then
+                       symb_zero = .true.
+                    else
+                       symb_zero = .false.
+                       y( iy ) = beta * abs( y( iy ) )
+                    end if
+                    jx = kx
+                    if ( alpha /= zero ) then
+                       do j = 1, i
+                          temp = cabs1( a( i, j ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                       do j = i+1, n
+                          temp = cabs1( a( j, i ) )
+                          symb_zero = symb_zero .and.( x( j ) == zero .or. temp == zero )
+                          y( iy ) = y( iy ) + alpha*cabs1( x( jx ) )*temp
+                          jx = jx + incx
+                       end do
+                    end if
+                    if ( .not.symb_zero )y( iy ) = y( iy ) + sign( safe1, y( iy ) )
+                    iy = iy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zla_syamv
+
+     !> ZLA_WWADDW: adds a vector W into a doubled-single vector (X, Y).
+     !> This works for all extant IBM's hex and binary floating point
+     !> arithmetic, but not for decimal.
+
+     pure subroutine stdlib_zla_wwaddw( n, x, y, w )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: x(*), y(*)
+           complex(dp), intent(in) :: w(*)
+        ! =====================================================================
+           ! Local Scalars 
+           complex(dp) :: s
+           integer(ilp) :: i
+           ! Executable Statements 
+           do 10 i = 1, n
+             s = x(i) + w(i)
+             s = (s + s) - s
+             y(i) = ((x(i) - s) + w(i)) + y(i)
+             x(i) = s
+             10 continue
+           return
+     end subroutine stdlib_zla_wwaddw
+
+     !> ZLACGV: conjugates a complex vector of length N.
+
+     pure subroutine stdlib_zlacgv( n, x, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: x(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ioff
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( incx==1 ) then
+              do i = 1, n
+                 x( i ) = conjg( x( i ) )
+              end do
+           else
+              ioff = 1
+              if( incx<0 )ioff = 1 - ( n-1 )*incx
+              do i = 1, n
+                 x( ioff ) = conjg( x( ioff ) )
+                 ioff = ioff + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_zlacgv
+
+     !> ZLACN2: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     pure subroutine stdlib_zlacn2( n, v, x, est, kase, isave )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(dp), intent(inout) :: est
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: isave(3)
+           complex(dp), intent(out) :: v(*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, jlast
+           real(dp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag
+           ! Executable Statements 
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=dp),KIND=dp)
+              end do
+              kase = 1
+              isave( 1 ) = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )isave( 1 )
+           ! ................ entry   (isave( 1 ) = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_dzsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=dp) / absxi,aimag( x( i ) ) / absxi,KIND=dp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 2
+           return
+           ! ................ entry   (isave( 1 ) = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           isave( 2 ) = stdlib_izmax1( n, x, 1 )
+           isave( 3 ) = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( isave( 2 ) ) = cone
+           kase = 1
+           isave( 1 ) = 3
+           return
+           ! ................ entry   (isave( 1 ) = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_zcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_dzsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=dp) / absxi,aimag( x( i ) ) / absxi,KIND=dp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           isave( 1 ) = 4
+           return
+           ! ................ entry   (isave( 1 ) = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = isave( 2 )
+           isave( 2 ) = stdlib_izmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( isave( 2 ) ) ) ) .and.( isave( 3 )<itmax ) ) &
+                     then
+              isave( 3 ) = isave( 3 ) + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one+real( i-1,KIND=dp) / real( n-1,KIND=dp) ),KIND=dp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           isave( 1 ) = 5
+           return
+           ! ................ entry   (isave( 1 ) = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_dzsum1( n, x, 1 ) / real( 3*n,KIND=dp) )
+           if( temp>est ) then
+              call stdlib_zcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_zlacn2
+
+     !> ZLACON: estimates the 1-norm of a square, complex matrix A.
+     !> Reverse communication is used for evaluating matrix-vector products.
+
+     subroutine stdlib_zlacon( n, v, x, est, kase )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(inout) :: kase
+           integer(ilp), intent(in) :: n
+           real(dp), intent(inout) :: est
+           ! Array Arguments 
+           complex(dp), intent(out) :: v(n)
+           complex(dp), intent(inout) :: x(n)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iter, j, jlast, jump
+           real(dp) :: absxi, altsgn, estold, safmin, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag
+           ! Save Statement 
+           save
+           ! Executable Statements 
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           if( kase==0 ) then
+              do i = 1, n
+                 x( i ) = cmplx( one / real( n,KIND=dp),KIND=dp)
+              end do
+              kase = 1
+              jump = 1
+              return
+           end if
+           go to ( 20, 40, 70, 90, 120 )jump
+           ! ................ entry   (jump = 1)
+           ! first iteration.  x has been overwritten by a*x.
+           20 continue
+           if( n==1 ) then
+              v( 1 ) = x( 1 )
+              est = abs( v( 1 ) )
+              ! ... quit
+              go to 130
+           end if
+           est = stdlib_dzsum1( n, x, 1 )
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=dp) / absxi,aimag( x( i ) ) / absxi,KIND=dp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 2
+           return
+           ! ................ entry   (jump = 2)
+           ! first iteration.  x has been overwritten by ctrans(a)*x.
+           40 continue
+           j = stdlib_izmax1( n, x, 1 )
+           iter = 2
+           ! main loop - iterations 2,3,...,itmax.
+           50 continue
+           do i = 1, n
+              x( i ) = czero
+           end do
+           x( j ) = cone
+           kase = 1
+           jump = 3
+           return
+           ! ................ entry   (jump = 3)
+           ! x has been overwritten by a*x.
+           70 continue
+           call stdlib_zcopy( n, x, 1, v, 1 )
+           estold = est
+           est = stdlib_dzsum1( n, v, 1 )
+           ! test for cycling.
+           if( est<=estold )go to 100
+           do i = 1, n
+              absxi = abs( x( i ) )
+              if( absxi>safmin ) then
+                 x( i ) = cmplx( real( x( i ),KIND=dp) / absxi,aimag( x( i ) ) / absxi,KIND=dp)
+                           
+              else
+                 x( i ) = cone
+              end if
+           end do
+           kase = 2
+           jump = 4
+           return
+           ! ................ entry   (jump = 4)
+           ! x has been overwritten by ctrans(a)*x.
+           90 continue
+           jlast = j
+           j = stdlib_izmax1( n, x, 1 )
+           if( ( abs( x( jlast ) )/=abs( x( j ) ) ) .and.( iter<itmax ) ) then
+              iter = iter + 1
+              go to 50
+           end if
+           ! iteration complete.  final stage.
+           100 continue
+           altsgn = one
+           do i = 1, n
+              x( i ) = cmplx( altsgn*( one+real( i-1,KIND=dp) / real( n-1,KIND=dp) ),KIND=dp)
+                        
+              altsgn = -altsgn
+           end do
+           kase = 1
+           jump = 5
+           return
+           ! ................ entry   (jump = 5)
+           ! x has been overwritten by a*x.
+           120 continue
+           temp = two*( stdlib_dzsum1( n, x, 1 ) / real( 3*n,KIND=dp) )
+           if( temp>est ) then
+              call stdlib_zcopy( n, x, 1, v, 1 )
+              est = temp
+           end if
+           130 continue
+           kase = 0
+           return
+     end subroutine stdlib_zlacon
+
+     !> ZLACP2: copies all or part of a real two-dimensional matrix A to a
+     !> complex matrix B.
+
+     pure subroutine stdlib_zlacp2( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_zlacp2
+
+     !> ZLACPY: copies all or part of a two-dimensional matrix A to another
+     !> matrix B.
+
+     pure subroutine stdlib_zlacpy( uplo, m, n, a, lda, b, ldb )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldb, m, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              do j = 1, n
+                 do i = j, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = 1, m
+                    b( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_zlacpy
+
+     !> ZLACRM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by N and complex; B is N by N and real;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_zlacrm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: b(ldb,*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( a( i, j ),KIND=dp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_dgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( a( i, j ) )
+              end do
+           end do
+           call stdlib_dgemm( 'N', 'N', m, n, n, one, rwork, m, b, ldb, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=dp),rwork( l+( j-1 )*m+i-1 ),KIND=dp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_zlacrm
+
+     !> ZLACRT: performs the operation
+     !> (  c  s )( x )  ==> ( x )
+     !> ( -s  c )( y )      ( y )
+     !> where c and s are complex and the vectors x and y are complex.
+
+     pure subroutine stdlib_zlacrt( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(dp), intent(in) :: c, s
+           ! Array Arguments 
+           complex(dp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(dp) :: ctemp
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              ctemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - s*cx( ix )
+              cx( ix ) = ctemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              ctemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - s*cx( i )
+              cx( i ) = ctemp
+           end do
+           return
+     end subroutine stdlib_zlacrt
+
+     !> ZLADIV: := X / Y, where X and Y are complex.  The computation of X / Y
+     !> will not overflow on an intermediary step unless the results
+     !> overflows.
+
+     pure complex(dp)     function stdlib_zladiv( x, y )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: x, y
+        ! =====================================================================
+           ! Local Scalars 
+           real(dp) :: zi, zr
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           call stdlib_dladiv( real( x,KIND=dp), aimag( x ), real( y,KIND=dp), aimag( y ), zr,zi )
+                     
+           stdlib_zladiv = cmplx( zr, zi,KIND=dp)
+           return
+     end function stdlib_zladiv
+
+     !> ZLAED8: merges the two sets of eigenvalues together into a single
+     !> sorted set.  Then it tries to deflate the size of the problem.
+     !> There are two ways in which deflation can occur:  when two or more
+     !> eigenvalues are close together or if there is a tiny element in the
+     !> Z vector.  For each such occurrence the order of the related secular
+     !> equation problem is reduced by one.
+
+     pure subroutine stdlib_zlaed8( k, n, qsiz, q, ldq, d, rho, cutpnt, z, dlamda,q2, ldq2, w, &
+               indxp, indx, indxq, perm, givptr,givcol, givnum, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: cutpnt, ldq, ldq2, n, qsiz
+           integer(ilp), intent(out) :: givptr, info, k
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(out) :: givcol(2,*), indx(*), indxp(*), perm(*)
+           integer(ilp), intent(inout) :: indxq(*)
+           real(dp), intent(inout) :: d(*), z(*)
+           real(dp), intent(out) :: dlamda(*), givnum(2,*), w(*)
+           complex(dp), intent(inout) :: q(ldq,*)
+           complex(dp), intent(out) :: q2(ldq2,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: mone = -1.0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, imax, j, jlam, jmax, jp, k2, n1, n1p1, n2
+           real(dp) :: c, eps, s, t, tau, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -5
+           else if( cutpnt<min( 1, n ) .or. cutpnt>n ) then
+              info = -8
+           else if( ldq2<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED8', -info )
+              return
+           end if
+           ! need to initialize givptr to o here in case of quick exit
+           ! to prevent an unspecified code behavior (usually sigfault)
+           ! when iwork array on entry to *stedc is not zeroed
+           ! (or at least some iwork entries which used in *laed7 for givptr).
+           givptr = 0
+           ! quick return if possible
+           if( n==0 )return
+           n1 = cutpnt
+           n2 = n - n1
+           n1p1 = n1 + 1
+           if( rho<zero ) then
+              call stdlib_dscal( n2, mone, z( n1p1 ), 1 )
+           end if
+           ! normalize z so that norm(z) = 1
+           t = one / sqrt( two )
+           do j = 1, n
+              indx( j ) = j
+           end do
+           call stdlib_dscal( n, t, z, 1 )
+           rho = abs( two*rho )
+           ! sort the eigenvalues into increasing order
+           do i = cutpnt + 1, n
+              indxq( i ) = indxq( i ) + cutpnt
+           end do
+           do i = 1, n
+              dlamda( i ) = d( indxq( i ) )
+              w( i ) = z( indxq( i ) )
+           end do
+           i = 1
+           j = cutpnt + 1
+           call stdlib_dlamrg( n1, n2, dlamda, 1, 1, indx )
+           do i = 1, n
+              d( i ) = dlamda( indx( i ) )
+              z( i ) = w( indx( i ) )
+           end do
+           ! calculate the allowable deflation tolerance
+           imax = stdlib_idamax( n, z, 1 )
+           jmax = stdlib_idamax( n, d, 1 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           tol = eight*eps*abs( d( jmax ) )
+           ! if the rank-1 modifier is small enough, no more needs to be done
+           ! -- except to reorganize q so that its columns correspond with the
+           ! elements in d.
+           if( rho*abs( z( imax ) )<=tol ) then
+              k = 0
+              do j = 1, n
+                 perm( j ) = indxq( indx( j ) )
+                 call stdlib_zcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+              end do
+              call stdlib_zlacpy( 'A', qsiz, n, q2( 1, 1 ), ldq2, q( 1, 1 ), ldq )
+              return
+           end if
+           ! if there are multiple eigenvalues then the problem deflates.  here
+           ! the number of equal eigenvalues are found.  as each equal
+           ! eigenvalue is found, an elementary reflector is computed to rotate
+           ! the corresponding eigensubspace so that the corresponding
+           ! components of z are zero in this new basis.
+           k = 0
+           k2 = n + 1
+           do j = 1, n
+              if( rho*abs( z( j ) )<=tol ) then
+                 ! deflate due to small z component.
+                 k2 = k2 - 1
+                 indxp( k2 ) = j
+                 if( j==n )go to 100
+              else
+                 jlam = j
+                 go to 70
+              end if
+           end do
+           70 continue
+           j = j + 1
+           if( j>n )go to 90
+           if( rho*abs( z( j ) )<=tol ) then
+              ! deflate due to small z component.
+              k2 = k2 - 1
+              indxp( k2 ) = j
+           else
+              ! check if eigenvalues are close enough to allow deflation.
+              s = z( jlam )
+              c = z( j )
+              ! find sqrt(a**2+b**2) without overflow or
+              ! destructive underflow.
+              tau = stdlib_dlapy2( c, s )
+              t = d( j ) - d( jlam )
+              c = c / tau
+              s = -s / tau
+              if( abs( t*c*s )<=tol ) then
+                 ! deflation is possible.
+                 z( j ) = tau
+                 z( jlam ) = zero
+                 ! record the appropriate givens rotation
+                 givptr = givptr + 1
+                 givcol( 1, givptr ) = indxq( indx( jlam ) )
+                 givcol( 2, givptr ) = indxq( indx( j ) )
+                 givnum( 1, givptr ) = c
+                 givnum( 2, givptr ) = s
+                 call stdlib_zdrot( qsiz, q( 1, indxq( indx( jlam ) ) ), 1,q( 1, indxq( indx( j ) &
+                           ) ), 1, c, s )
+                 t = d( jlam )*c*c + d( j )*s*s
+                 d( j ) = d( jlam )*s*s + d( j )*c*c
+                 d( jlam ) = t
+                 k2 = k2 - 1
+                 i = 1
+                 80 continue
+                 if( k2+i<=n ) then
+                    if( d( jlam )<d( indxp( k2+i ) ) ) then
+                       indxp( k2+i-1 ) = indxp( k2+i )
+                       indxp( k2+i ) = jlam
+                       i = i + 1
+                       go to 80
+                    else
+                       indxp( k2+i-1 ) = jlam
+                    end if
+                 else
+                    indxp( k2+i-1 ) = jlam
+                 end if
+                 jlam = j
+              else
+                 k = k + 1
+                 w( k ) = z( jlam )
+                 dlamda( k ) = d( jlam )
+                 indxp( k ) = jlam
+                 jlam = j
+              end if
+           end if
+           go to 70
+           90 continue
+           ! record the last eigenvalue.
+           k = k + 1
+           w( k ) = z( jlam )
+           dlamda( k ) = d( jlam )
+           indxp( k ) = jlam
+           100 continue
+           ! sort the eigenvalues and corresponding eigenvectors into dlamda
+           ! and q2 respectively.  the eigenvalues/vectors which were not
+           ! deflated go into the first k slots of dlamda and q2 respectively,
+           ! while those which were deflated go into the last n - k slots.
+           do j = 1, n
+              jp = indxp( j )
+              dlamda( j ) = d( jp )
+              perm( j ) = indxq( indx( jp ) )
+              call stdlib_zcopy( qsiz, q( 1, perm( j ) ), 1, q2( 1, j ), 1 )
+           end do
+           ! the deflated eigenvalues and their corresponding vectors go back
+           ! into the last n - k slots of d and q respectively.
+           if( k<n ) then
+              call stdlib_dcopy( n-k, dlamda( k+1 ), 1, d( k+1 ), 1 )
+              call stdlib_zlacpy( 'A', qsiz, n-k, q2( 1, k+1 ), ldq2, q( 1, k+1 ),ldq )
+           end if
+           return
+     end subroutine stdlib_zlaed8
+
+     !> ZLAESY: computes the eigendecomposition of a 2-by-2 symmetric matrix
+     !> ( ( A, B );( B, C ) )
+     !> provided the norm of the matrix of eigenvectors is larger than
+     !> some threshold value.
+     !> RT1 is the eigenvalue of larger absolute value, and RT2 of
+     !> smaller absolute value.  If the eigenvectors are computed, then
+     !> on return ( CS1, SN1 ) is the unit eigenvector for RT1, hence
+     !> [  CS1     SN1   ] . [ A  B ] . [ CS1    -SN1   ] = [ RT1  0  ]
+     !> [ -SN1     CS1   ]   [ B  C ]   [ SN1     CS1   ]   [  0  RT2 ]
+
+     pure subroutine stdlib_zlaesy( a, b, c, rt1, rt2, evscal, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: a, b, c
+           complex(dp), intent(out) :: cs1, evscal, rt1, rt2, sn1
+       ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1_dp
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           real(dp) :: babs, evnorm, tabs, z
+           complex(dp) :: s, t, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           ! special case:  the matrix is actually diagonal.
+           ! to avoid divide by zero later, we treat this case separately.
+           if( abs( b )==zero ) then
+              rt1 = a
+              rt2 = c
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+                 cs1 = zero
+                 sn1 = one
+              else
+                 cs1 = one
+                 sn1 = zero
+              end if
+           else
+              ! compute the eigenvalues and eigenvectors.
+              ! the characteristic equation is
+                 ! lambda **2 - (a+c) lambda + (a*c - b*b)
+              ! and we solve it using the quadratic formula.
+              s = ( a+c )*half
+              t = ( a-c )*half
+              ! take the square root carefully to avoid over/under flow.
+              babs = abs( b )
+              tabs = abs( t )
+              z = max( babs, tabs )
+              if( z>zero )t = z*sqrt( ( t / z )**2+( b / z )**2 )
+              ! compute the two eigenvalues.  rt1 and rt2 are exchanged
+              ! if necessary so that rt1 will have the greater magnitude.
+              rt1 = s + t
+              rt2 = s - t
+              if( abs( rt1 )<abs( rt2 ) ) then
+                 tmp = rt1
+                 rt1 = rt2
+                 rt2 = tmp
+              end if
+              ! choose cs1 = 1 and sn1 to satisfy the first equation, then
+              ! scale the components of this eigenvector so that the matrix
+              ! of eigenvectors x satisfies  x * x**t = i .  (no scaling is
+              ! done if the norm of the eigenvalue matrix is less than thresh.)
+              sn1 = ( rt1-a ) / b
+              tabs = abs( sn1 )
+              if( tabs>one ) then
+                 t = tabs*sqrt( ( one / tabs )**2+( sn1 / tabs )**2 )
+              else
+                 t = sqrt( cone+sn1*sn1 )
+              end if
+              evnorm = abs( t )
+              if( evnorm>=thresh ) then
+                 evscal = cone / t
+                 cs1 = evscal
+                 sn1 = sn1*evscal
+              else
+                 evscal = zero
+              end if
+           end if
+           return
+     end subroutine stdlib_zlaesy
+
+     !> ZLAEV2: computes the eigendecomposition of a 2-by-2 Hermitian matrix
+     !> [  A         B  ]
+     !> [  CONJG(B)  C  ].
+     !> On return, RT1 is the eigenvalue of larger absolute value, RT2 is the
+     !> eigenvalue of smaller absolute value, and (CS1,SN1) is the unit right
+     !> eigenvector for RT1, giving the decomposition
+     !> [ CS1  CONJG(SN1) ] [    A     B ] [ CS1 -CONJG(SN1) ] = [ RT1  0  ]
+     !> [-SN1     CS1     ] [ CONJG(B) C ] [ SN1     CS1     ]   [  0  RT2 ].
+
+     pure subroutine stdlib_zlaev2( a, b, c, rt1, rt2, cs1, sn1 )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(out) :: cs1, rt1, rt2
+           complex(dp), intent(in) :: a, b, c
+           complex(dp), intent(out) :: sn1
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: t
+           complex(dp) :: w
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg
+           ! Executable Statements 
+           if( abs( b )==zero ) then
+              w = one
+           else
+              w = conjg( b ) / abs( b )
+           end if
+           call stdlib_dlaev2( real( a,KIND=dp), abs( b ), real( c,KIND=dp), rt1, rt2, cs1, t )
+                     
+           sn1 = w*t
+           return
+     end subroutine stdlib_zlaev2
+
+     !> ZLAG2C: converts a COMPLEX*16 matrix, SA, to a COMPLEX matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> ZLAG2C checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_zlag2c( m, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, m, n
+           ! Array Arguments 
+           complex(sp), intent(out) :: sa(ldsa,*)
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: rmax
+           ! Intrinsic Functions 
+           intrinsic :: real,aimag
+           ! Executable Statements 
+           rmax = stdlib_slamch( 'O' )
+           do j = 1, n
+              do i = 1, m
+                 if( ( real( a( i, j ),KIND=dp)<-rmax ) .or.( real( a( i, j ),KIND=dp)>rmax ) &
+                           .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) then
+                    info = 1
+                    go to 30
+                 end if
+                 sa( i, j ) = a( i, j )
+              end do
+           end do
+           info = 0
+           30 continue
+           return
+     end subroutine stdlib_zlag2c
+
+     !> ZLAGTM: performs a matrix-vector product of the form
+     !> B := alpha * A * X + beta * B
+     !> where A is a tridiagonal matrix of order N, B and X are N by NRHS
+     !> matrices, and alpha and beta are real scalars, each of which may be
+     !> 0., 1., or -1.
+
+     pure subroutine stdlib_zlagtm( trans, n, nrhs, alpha, dl, d, du, x, ldx, beta,b, ldb )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(in) :: d(*), dl(*), du(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n==0 )return
+           ! multiply b by beta if beta/=1.
+           if( beta==zero ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = zero
+                 end do
+              end do
+           else if( beta==-one ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = -b( i, j )
+                 end do
+              end do
+           end if
+           if( alpha==one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b + a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + dl( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + dl( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b + a**t * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + d( 1 )*x( 1, j ) +dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) + du( n-1 )*x( n-1, j ) +d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + du( i-1 )*x( i-1, j ) +d( i )*x( i, j ) + dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b + a**h * x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) + conjg( d( 1 ) )*x( 1, j ) +conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) + conjg( du( n-1 ) )*x( n-1, j ) + conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) + conjg( du( i-1 ) )*x( i-1, j ) + conjg( d( i ) )&
+                                    *x( i, j ) + conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           else if( alpha==-one ) then
+              if( stdlib_lsame( trans, 'N' ) ) then
+                 ! compute b := b - a*x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -du( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - dl( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - dl( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - du( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! compute b := b - a**t *x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - d( 1 )*x( 1, j ) -dl( 1 )*x( 2, j )
+                       b( n, j ) = b( n, j ) - du( n-1 )*x( n-1, j ) -d( n )*x( n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - du( i-1 )*x( i-1, j ) -d( i )*x( i, j ) - dl( i &
+                                    )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              else if( stdlib_lsame( trans, 'C' ) ) then
+                 ! compute b := b - a**h *x
+                 do j = 1, nrhs
+                    if( n==1 ) then
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j )
+                    else
+                       b( 1, j ) = b( 1, j ) - conjg( d( 1 ) )*x( 1, j ) -conjg( dl( 1 ) )*x( 2, &
+                                 j )
+                       b( n, j ) = b( n, j ) - conjg( du( n-1 ) )*x( n-1, j ) - conjg( d( n ) )*x(&
+                                  n, j )
+                       do i = 2, n - 1
+                          b( i, j ) = b( i, j ) - conjg( du( i-1 ) )*x( i-1, j ) - conjg( d( i ) )&
+                                    *x( i, j ) - conjg( dl( i ) )*x( i+1, j )
+                       end do
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zlagtm
+
+     !> ZLAHEF: computes a partial factorization of a complex Hermitian
+     !> matrix A using the Bunch-Kaufman diagonal pivoting method. The
+     !> partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> ZLAHEF is an auxiliary routine called by ZHETRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlahef( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(dp) :: absakk, alpha, colmax, r1, rowmax, t
+           complex(dp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              ! copy column k of a to column kw of w and update it
+              call stdlib_zcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=dp)
+              if( k<n ) then
+                 call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=dp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_zcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    w( imax, kw-1 ) = real( a( imax, imax ),KIND=dp)
+                    call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    call stdlib_zlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                    if( k<n ) then
+                       call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( imax,&
+                                  kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=dp)
+                    end if
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, kw-1 ),KIND=dp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_zlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_zcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=dp)
+                       call stdlib_zdscal( k-1, r1, a( 1, k ), 1 )
+                       ! (2) conjugate column w(kw)
+                       call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = conjg( d21 )*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_zlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_zswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              ! copy column k of a to column k of w and update it
+              w( k, k ) = real( a( k, k ),KIND=dp)
+              if( k<n )call stdlib_zcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              w( k, k ) = real( w( k, k ),KIND=dp)
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( a( k, k ),KIND=dp)
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! begin pivot search along imax row
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_zlacgv( imax-k, w( k, k+1 ), 1 )
+                    w( imax, k+1 ) = real( a( imax, imax ),KIND=dp)
+                    if( imax<n )call stdlib_zcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 ), &
+                              1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    w( imax, k+1 ) = real( w( imax, k+1 ),KIND=dp)
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value.
+                    ! determine only rowmax.
+                    jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    ! case(2)
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    ! case(3)
+                    else if( abs( real( w( imax, k+1 ),KIND=dp) )>=alpha*rowmax )then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    ! case(4)
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                    ! end pivot search along imax row
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_zlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_zcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(4))
+                       r1 = one / real( a( k, k ),KIND=dp)
+                       call stdlib_zdscal( n-k, r1, a( k+1, k ), 1 )
+                       ! (2) conjugate column w(k)
+                       call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! = ( conj(d21)*( d11 ) d21*(  -1 ) )
+                         ! (           (  -1 )     ( d22 ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = t/d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0, since in 2x2 pivot case(4)
+                            ! |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = conjg( d21 )*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_zlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_zswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlahef
+
+     !> ZLAHEF_RK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLAHEF_RK is an auxiliary routine called by ZHETRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlahef_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: w(ldw,*), e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, k, kk, kkw, kp, kstep, kw, p
+           real(dp) :: absakk, alpha, colmax, dtemp, r1, rowmax, t, sfmin
+           complex(dp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_zcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=dp)
+              if( k<n ) then
+                 call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=dp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=dp)
+                 if( k>1 )call stdlib_zcopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_zcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=dp)
+                       call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_zlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=dp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=dp)
+                    call stdlib_zcopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_zlacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_zcopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_zlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_zcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=dp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_zdscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_zlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=dp)
+              if( k<n )call stdlib_zcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=dp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=dp)
+                 if( k<n )call stdlib_zcopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_zlacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=dp)
+                       if( imax<n )call stdlib_zcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=dp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=dp)
+                    call stdlib_zcopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_zlacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_zcopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_zswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_zlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_zcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=dp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_zdscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_zlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlahef_rk
+
+     !> ZLAHEF_ROOK: computes a partial factorization of a complex Hermitian
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting
+     !> method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I      0     )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**H U22**H )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**H L21**H )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0      I     )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**H denotes the conjugate transpose of U.
+     !> ZLAHEF_ROOK is an auxiliary routine called by ZHETRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlahef_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, ii, j, jb, jj, jmax, jp1, jp2, k, kk, kkw, kp, kstep, kw, &
+                     p
+           real(dp) :: absakk, alpha, colmax, dtemp, r1, rowmax, t, sfmin
+           complex(dp) :: d11, d21, d22, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11 (note that conjg(w) is actually stored)
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              if( k>1 )call stdlib_zcopy( k-1, a( 1, k ), 1, w( 1, kw ), 1 )
+              w( k, kw ) = real( a( k, k ),KIND=dp)
+              if( k<n ) then
+                 call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, kw+1 ), &
+                           ldw, cone, w( 1, kw ), 1 )
+                 w( k, kw ) = real( w( k, kw ),KIND=dp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, kw ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, kw ),KIND=dp)
+                 if( k>1 )call stdlib_zcopy( k-1, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! lop until pivot found
+                    done = .false.
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       if( imax>1 )call stdlib_zcopy( imax-1, a( 1, imax ), 1, w( 1, kw-1 ),1 )
+                                 
+                       w( imax, kw-1 ) = real( a( imax, imax ),KIND=dp)
+                       call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       call stdlib_zlacgv( k-imax, w( imax+1, kw-1 ), 1 )
+                       if( k<n ) then
+                          call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w( &
+                                    imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                          w( imax, kw-1 ) = real( w( imax, kw-1 ),KIND=dp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,kw-1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,kw-1 ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 12
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns p and k.
+                 ! updated column p is already stored in column kw of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k-1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=dp)
+                    call stdlib_zcopy( k-1-p, a( p+1, k ), 1, a( p, p+1 ),lda )
+                    call stdlib_zlacgv( k-1-p, a( p, p+1 ), lda )
+                    if( p>1 )call stdlib_zcopy( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in the last k+1 to n columns of a
+                    ! (columns k and k-1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( k, k+1 ), lda, a( p, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ),ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_zlacgv( kk-1-kp, a( kp, kp+1 ), lda )
+                    if( kp>1 )call stdlib_zcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! (1) store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element u(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=dp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_zdscal( k-1, r1, a( 1, k ), 1 )
+                       else
+                          do ii = 1, k-1
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(kw)
+                       call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! (1) store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! (note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored)
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / conjg( d21 )
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( ( d11*w( j, kw-1 )-w( j, kw ) ) /d21 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /conjg( d21 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                    ! (2) conjugate columns w(kw) and w(kw-1)
+                    call stdlib_zlacgv( k-1, w( 1, kw ), 1 )
+                    call stdlib_zlacgv( k-2, w( 1, kw-1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**h = a11 - u12*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in of rows in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j+1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_zswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = jj + 1
+                 if( kstep==2 .and. jp1/=jj .and. j<=n )call stdlib_zswap( n-j+1, a( jp1, j ), &
+                           lda, a( jj, j ), lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22 (note that conjg(w) is actually stored)
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update column k of w
+              w( k, k ) = real( a( k, k ),KIND=dp)
+              if( k<n )call stdlib_zcopy( n-k, a( k+1, k ), 1, w( k+1, k ), 1 )
+              if( k>1 ) then
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, 1 ), &
+                           ldw, cone, w( k, k ), 1 )
+                 w( k, k ) = real( w( k, k ),KIND=dp)
+              end if
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = abs( real( w( k, k ),KIND=dp) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 a( k, k ) = real( w( k, k ),KIND=dp)
+                 if( k<n )call stdlib_zcopy( n-k, w( k+1, k ), 1, a( k+1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! begin pivot search
+                 ! case(1)
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_zlacgv( imax-k, w( k, k+1 ), 1 )
+                       w( imax, k+1 ) = real( a( imax, imax ),KIND=dp)
+                       if( imax<n )call stdlib_zcopy( n-imax, a( imax+1, imax ), 1,w( imax+1, k+1 &
+                                 ), 1 )
+                       if( k>1 ) then
+                          call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), lda, w( &
+                                    imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                          w( imax, k+1 ) = real( w( imax, k+1 ),KIND=dp)
+                       end if
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! case(2)
+                       ! equivalent to testing for
+                       ! abs( real( w( imax,k+1 ),KIND=dp) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( abs( real( w( imax,k+1 ),KIND=dp) )<alpha*rowmax ) ) &
+                                 then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! case(3)
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       ! case(4)
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not.done ) goto 72
+                 end if
+                 ! end pivot search
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns p and k (only for 2-by-2 pivot).
+                 ! updated column p is already stored in column k of w.
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column kk-1 to column p of submatrix a
+                    ! at step k. no need to copy element into columns
+                    ! k and k+1 of a for 2-by-2 pivot, since these columns
+                    ! will be later overwritten.
+                    a( p, p ) = real( a( k, k ),KIND=dp)
+                    call stdlib_zcopy( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                    call stdlib_zlacgv( p-k-1, a( p, k+1 ), lda )
+                    if( p<n )call stdlib_zcopy( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    ! interchange rows k and p in first k-1 columns of a
+                    ! (columns k and k+1 of a for 2-by-2 pivot will be
+                    ! later overwritten). interchange rows k and p
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_zswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = real( a( kk, kk ),KIND=dp)
+                    call stdlib_zcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    call stdlib_zlacgv( kp-kk-1, a( kp, kk+1 ), lda )
+                    if( kp<n )call stdlib_zcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (column k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! (1) store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    ! (note: no need to use for hermitian matrix
+                    ! a( k, k ) = real( w( k, k),KIND=dp) to separately copy diagonal
+                    ! element d(k,k) from w (potentially saves only one load))
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       ! (note: no need to check if a(k,k) is not zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! case a(k,k) = 0 falls into 2x2 pivot case(3))
+                       ! handle division by a small number
+                       t = real( a( k, k ),KIND=dp)
+                       if( abs( t )>=sfmin ) then
+                          r1 = one / t
+                          call stdlib_zdscal( n-k, r1, a( k+1, k ), 1 )
+                       else
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / t
+                          end do
+                       end if
+                       ! (2) conjugate column w(k)
+                       call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! (1) store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored.
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! factor out the columns of the inverse of 2-by-2 pivot
+                       ! block d, so that each column contains 1, to reduce the
+                       ! number of flops when we multiply panel
+                       ! ( w(kw-1) w(kw) ) by this inverse, i.e. by d**(-1).
+                       ! d**(-1) = ( d11 cj(d21) )**(-1) =
+                                 ! ( d21    d22 )
+                       ! = 1/(d11*d22-|d21|**2) * ( ( d22) (-cj(d21) ) ) =
+                                                ! ( (-d21) (     d11 ) )
+                       ! = 1/(|d21|**2) * 1/((d11/cj(d21))*(d22/d21)-1) *
+                         ! * ( d21*( d22/d21 ) conj(d21)*(           - 1 ) ) =
+                           ! (     (      -1 )           ( d11/conj(d21) ) )
+                       ! = 1/(|d21|**2) * 1/(d22*d11-1) *
+                         ! * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                           ! (     (  -1 )           ( d22 ) )
+                       ! = (1/|d21|**2) * t * ( d21*( d11 ) conj(d21)*(  -1 ) ) =
+                                            ! (     (  -1 )           ( d22 ) )
+                       ! = ( (t/conj(d21))*( d11 ) (t/d21)*(  -1 ) ) =
+                         ! (               (  -1 )         ( d22 ) )
+                       ! handle division by a small number. (note: order of
+                       ! operations is important)
+                       ! = ( t*(( d11 )/conj(d21)) t*((  -1 )/d21 ) )
+                         ! (   ((  -1 )          )   (( d22 )     ) ),
+                       ! where d11 = d22/d21,
+                             ! d22 = d11/conj(d21),
+                             ! d21 = d21,
+                             ! t = 1/(d22*d11-1).
+                       ! (note: no need to check for division by zero,
+                        ! since that was ensured earlier in pivot search:
+                        ! (a) d21 != 0 in 2x2 pivot case(4),
+                            ! since |d21| should be larger than |d11| and |d22|;
+                        ! (b) (d22*d11 - 1) != 0, since from (a),
+                            ! both |d11| < 1, |d22| < 1, hence |d22*d11| << 1.)
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / conjg( d21 )
+                       t = one / ( real( d11*d22,KIND=dp)-one )
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /conjg( d21 ) )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    ! (2) conjugate columns w(k) and w(k+1)
+                    call stdlib_zlacgv( n-k, w( k+1, k ), 1 )
+                    call stdlib_zlacgv( n-k-1, w( k+2, k+1 ), 1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**h = a22 - l21*w**h
+              ! computing blocks of nb columns at a time (note that conjg(w) is
+              ! actually stored)
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                    a( jj, jj ) = real( a( jj, jj ),KIND=dp)
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows j and jp2
+                 ! (or j and jp2, and j-1 and jp1) at each step j
+                 kstep = 1
+                 jp1 = 1
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_zswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = jj -1
+                 if( kstep==2 .and. jp1/=jj .and. j>=1 )call stdlib_zswap( j, a( jp1, 1 ), lda, a(&
+                            jj, 1 ), lda )
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlahef_rook
+
+     !> ZLAIC1: applies one step of incremental condition estimation in
+     !> its simplest version:
+     !> Let x, twonorm(x) = 1, be an approximate singular vector of an j-by-j
+     !> lower triangular matrix L, such that
+     !> twonorm(L*x) = sest
+     !> Then ZLAIC1 computes sestpr, s, c such that
+     !> the vector
+     !> [ s*x ]
+     !> xhat = [  c  ]
+     !> is an approximate singular vector of
+     !> [ L       0  ]
+     !> Lhat = [ w**H gamma ]
+     !> in the sense that
+     !> twonorm(Lhat*xhat) = sestpr.
+     !> Depending on JOB, an estimate for the largest or smallest singular
+     !> value is computed.
+     !> Note that [s c]**H and sestpr**2 is an eigenpair of the system
+     !> diag(sest*sest, 0) + [alpha  gamma] * [ conjg(alpha) ]
+     !> [ conjg(gamma) ]
+     !> where  alpha =  x**H * w.
+
+     pure subroutine stdlib_zlaic1( job, j, x, sest, w, gamma, sestpr, s, c )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: j, job
+           real(dp), intent(in) :: sest
+           real(dp), intent(out) :: sestpr
+           complex(dp), intent(out) :: c, s
+           complex(dp), intent(in) :: gamma
+           ! Array Arguments 
+           complex(dp), intent(in) :: w(j), x(j)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: absalp, absest, absgam, b, eps, norma, s1, s2, scl, t, test, tmp, zeta1, &
+                     zeta2
+           complex(dp) :: alpha, cosine, sine
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,sqrt
+           ! Executable Statements 
+           eps = stdlib_dlamch( 'EPSILON' )
+           alpha = stdlib_zdotc( j, x, 1, w, 1 )
+           absalp = abs( alpha )
+           absgam = abs( gamma )
+           absest = abs( sest )
+           if( job==1 ) then
+              ! estimating largest singular value
+              ! special cases
+              if( sest==zero ) then
+                 s1 = max( absgam, absalp )
+                 if( s1==zero ) then
+                    s = zero
+                    c = one
+                    sestpr = zero
+                 else
+                    s = alpha / s1
+                    c = gamma / s1
+                    tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=dp)
+                    s = s / tmp
+                    c = c / tmp
+                    sestpr = s1*tmp
+                 end if
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = one
+                 c = zero
+                 tmp = max( absest, absalp )
+                 s1 = absest / tmp
+                 s2 = absalp / tmp
+                 sestpr = tmp*sqrt( s1*s1+s2*s2 )
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 else
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s2*scl
+                    s = ( alpha / s2 ) / scl
+                    c = ( gamma / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = s1*scl
+                    s = ( alpha / s1 ) / scl
+                    c = ( gamma / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 b = ( one-zeta1*zeta1-zeta2*zeta2 )*half
+                 c = zeta1*zeta1
+                 if( b>zero ) then
+                    t = real( c / ( b+sqrt( b*b+c ) ),KIND=dp)
+                 else
+                    t = real( sqrt( b*b+c ) - b,KIND=dp)
+                 end if
+                 sine = -( alpha / absest ) / t
+                 cosine = -( gamma / absest ) / ( one+t )
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=dp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 sestpr = sqrt( t+one )*absest
+                 return
+              end if
+           else if( job==2 ) then
+              ! estimating smallest singular value
+              ! special cases
+              if( sest==zero ) then
+                 sestpr = zero
+                 if( max( absgam, absalp )==zero ) then
+                    sine = one
+                    cosine = zero
+                 else
+                    sine = -conjg( gamma )
+                    cosine = conjg( alpha )
+                 end if
+                 s1 = max( abs( sine ), abs( cosine ) )
+                 s = sine / s1
+                 c = cosine / s1
+                 tmp = real( sqrt( s*conjg( s )+c*conjg( c ) ),KIND=dp)
+                 s = s / tmp
+                 c = c / tmp
+                 return
+              else if( absgam<=eps*absest ) then
+                 s = zero
+                 c = one
+                 sestpr = absgam
+                 return
+              else if( absalp<=eps*absest ) then
+                 s1 = absgam
+                 s2 = absest
+                 if( s1<=s2 ) then
+                    s = zero
+                    c = one
+                    sestpr = s1
+                 else
+                    s = one
+                    c = zero
+                    sestpr = s2
+                 end if
+                 return
+              else if( absest<=eps*absalp .or. absest<=eps*absgam ) then
+                 s1 = absgam
+                 s2 = absalp
+                 if( s1<=s2 ) then
+                    tmp = s1 / s2
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest*( tmp / scl )
+                    s = -( conjg( gamma ) / s2 ) / scl
+                    c = ( conjg( alpha ) / s2 ) / scl
+                 else
+                    tmp = s2 / s1
+                    scl = sqrt( one+tmp*tmp )
+                    sestpr = absest / scl
+                    s = -( conjg( gamma ) / s1 ) / scl
+                    c = ( conjg( alpha ) / s1 ) / scl
+                 end if
+                 return
+              else
+                 ! normal case
+                 zeta1 = absalp / absest
+                 zeta2 = absgam / absest
+                 norma = max( one+zeta1*zeta1+zeta1*zeta2,zeta1*zeta2+zeta2*zeta2 )
+                 ! see if root is closer to zero or to one
+                 test = one + two*( zeta1-zeta2 )*( zeta1+zeta2 )
+                 if( test>=zero ) then
+                    ! root is close to zero, compute directly
+                    b = ( zeta1*zeta1+zeta2*zeta2+one )*half
+                    c = zeta2*zeta2
+                    t = real( c / ( b+sqrt( abs( b*b-c ) ) ),KIND=dp)
+                    sine = ( alpha / absest ) / ( one-t )
+                    cosine = -( gamma / absest ) / t
+                    sestpr = sqrt( t+four*eps*eps*norma )*absest
+                 else
+                    ! root is closer to one, shift by that amount
+                    b = ( zeta2*zeta2+zeta1*zeta1-one )*half
+                    c = zeta1*zeta1
+                    if( b>=zero ) then
+                       t = -c / ( b+sqrt( b*b+c ) )
+                    else
+                       t = b - sqrt( b*b+c )
+                    end if
+                    sine = -( alpha / absest ) / t
+                    cosine = -( gamma / absest ) / ( one+t )
+                    sestpr = sqrt( one+t+four*eps*eps*norma )*absest
+                 end if
+                 tmp = real( sqrt( sine * conjg( sine )+ cosine * conjg( cosine ) ),KIND=dp)
+                           
+                 s = sine / tmp
+                 c = cosine / tmp
+                 return
+              end if
+           end if
+           return
+     end subroutine stdlib_zlaic1
+
+     !> ZLAPMR: rearranges the rows of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(M) of the integers 1,...,M.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(K(I),*) is moved X(I,*) for I = 1,2,...,M.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(I,*) is moved to X(K(I),*) for I = 1,2,...,M.
+
+     pure subroutine stdlib_zlapmr( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, in, j, jj
+           complex(dp) :: temp
+           ! Executable Statements 
+           if( m<=1 )return
+           do i = 1, m
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do jj = 1, n
+                    temp = x( j, jj )
+                    x( j, jj ) = x( in, jj )
+                    x( in, jj ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, m
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do jj = 1, n
+                    temp = x( i, jj )
+                    x( i, jj ) = x( j, jj )
+                    x( j, jj ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_zlapmr
+
+     !> ZLAPMT: rearranges the columns of the M by N matrix X as specified
+     !> by the permutation K(1),K(2),...,K(N) of the integers 1,...,N.
+     !> If FORWRD = .TRUE.,  forward permutation:
+     !> X(*,K(J)) is moved X(*,J) for J = 1,2,...,N.
+     !> If FORWRD = .FALSE., backward permutation:
+     !> X(*,J) is moved to X(*,K(J)) for J = 1,2,...,N.
+
+     pure subroutine stdlib_zlapmt( forwrd, m, n, x, ldx, k )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: forwrd
+           integer(ilp), intent(in) :: ldx, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: k(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ii, in, j
+           complex(dp) :: temp
+           ! Executable Statements 
+           if( n<=1 )return
+           do i = 1, n
+              k( i ) = -k( i )
+           end do
+           if( forwrd ) then
+              ! forward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 40
+                 j = i
+                 k( j ) = -k( j )
+                 in = k( j )
+                 20 continue
+                 if( k( in )>0 )go to 40
+                 do ii = 1, m
+                    temp = x( ii, j )
+                    x( ii, j ) = x( ii, in )
+                    x( ii, in ) = temp
+                 end do
+                 k( in ) = -k( in )
+                 j = in
+                 in = k( in )
+                 go to 20
+                 40 continue
+              end do
+           else
+              ! backward permutation
+              do i = 1, n
+                 if( k( i )>0 )go to 80
+                 k( i ) = -k( i )
+                 j = k( i )
+                 60 continue
+                 if( j==i )go to 80
+                 do ii = 1, m
+                    temp = x( ii, i )
+                    x( ii, i ) = x( ii, j )
+                    x( ii, j ) = temp
+                 end do
+                 k( j ) = -k( j )
+                 j = k( j )
+                 go to 60
+                 80 continue
+              end do
+           end if
+           return
+     end subroutine stdlib_zlapmt
+
+     !> ZLAQGB: equilibrates a general M by N band matrix A with KL
+     !> subdiagonals and KU superdiagonals using the row and scaling factors
+     !> in the vectors R and C.
+
+     pure subroutine stdlib_zlaqgb( m, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           real(dp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*), r(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = max( 1, j-ku ), min( m, j+kl )
+                       ab( ku+1+i-j, j ) = cj*ab( ku+1+i-j, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = max( 1, j-ku ), min( m, j+kl )
+                    ab( ku+1+i-j, j ) = cj*r( i )*ab( ku+1+i-j, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_zlaqgb
+
+     !> ZLAQGE: equilibrates a general M by N matrix A using the row and
+     !> column scaling factors in the vectors R and C.
+
+     pure subroutine stdlib_zlaqge( m, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           integer(ilp), intent(in) :: lda, m, n
+           real(dp), intent(in) :: amax, colcnd, rowcnd
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*), r(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( rowcnd>=thresh .and. amax>=small .and. amax<=large )then
+              ! no row scaling
+              if( colcnd>=thresh ) then
+                 ! no column scaling
+                 equed = 'N'
+              else
+                 ! column scaling
+                 do j = 1, n
+                    cj = c( j )
+                    do i = 1, m
+                       a( i, j ) = cj*a( i, j )
+                    end do
+                 end do
+                 equed = 'C'
+              end if
+           else if( colcnd>=thresh ) then
+              ! row scaling, no column scaling
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'R'
+           else
+              ! row and column scaling
+              do j = 1, n
+                 cj = c( j )
+                 do i = 1, m
+                    a( i, j ) = cj*r( i )*a( i, j )
+                 end do
+              end do
+              equed = 'B'
+           end if
+           return
+     end subroutine stdlib_zlaqge
+
+     !> ZLAQHB: equilibrates a Hermitian band matrix A
+     !> using the scaling factors in the vector S.
+
+     pure subroutine stdlib_zlaqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(out) :: s(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j - 1
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                    ab( kd+1, j ) = cj*cj*real( ab( kd+1, j ),KIND=dp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    ab( 1, j ) = cj*cj*real( ab( 1, j ),KIND=dp)
+                    do i = j + 1, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqhb
+
+     !> ZLAQHE: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_zlaqhe( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=dp)
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    a( j, j ) = cj*cj*real( a( j, j ),KIND=dp)
+                    do i = j + 1, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqhe
+
+     !> ZLAQHP: equilibrates a Hermitian matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_zlaqhp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j - 1
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    ap( jc+j-1 ) = cj*cj*real( ap( jc+j-1 ),KIND=dp)
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    ap( jc ) = cj*cj*real( ap( jc ),KIND=dp)
+                    do i = j + 1, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqhp
+
+     !> Given a 2-by-2 or 3-by-3 matrix H, ZLAQR1: sets v to a
+     !> scalar multiple of the first column of the product
+     !> (*)  K = (H - s1*I)*(H - s2*I)
+     !> scaling to avoid overflows and most underflows.
+     !> This is useful for starting double implicit shift bulges
+     !> in the QR algorithm.
+
+     pure subroutine stdlib_zlaqr1( n, h, ldh, s1, s2, v )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           complex(dp), intent(in) :: s1, s2
+           integer(ilp), intent(in) :: ldh, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: h(ldh,*)
+           complex(dp), intent(out) :: v(*)
+        ! ================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0_dp
+           
+           
+           ! Local Scalars 
+           complex(dp) :: cdum, h21s, h31s
+           real(dp) :: s
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! quick return if possible
+           if( n/=2 .and. n/=3 ) then
+              return
+           end if
+           if( n==2 ) then
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) )
+              if( s==rzero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 v( 1 ) = h21s*h( 1, 2 ) + ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s )
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 )
+              end if
+           else
+              s = cabs1( h( 1, 1 )-s2 ) + cabs1( h( 2, 1 ) ) +cabs1( h( 3, 1 ) )
+              if( s==czero ) then
+                 v( 1 ) = czero
+                 v( 2 ) = czero
+                 v( 3 ) = czero
+              else
+                 h21s = h( 2, 1 ) / s
+                 h31s = h( 3, 1 ) / s
+                 v( 1 ) = ( h( 1, 1 )-s1 )*( ( h( 1, 1 )-s2 ) / s ) +h( 1, 2 )*h21s + h( 1, 3 )&
+                           *h31s
+                 v( 2 ) = h21s*( h( 1, 1 )+h( 2, 2 )-s1-s2 ) + h( 2, 3 )*h31s
+                 v( 3 ) = h31s*( h( 1, 1 )+h( 3, 3 )-s1-s2 ) + h21s*h( 3, 2 )
+              end if
+           end if
+     end subroutine stdlib_zlaqr1
+
+     !> ZLAQSB: equilibrates a symmetric band matrix A using the scaling
+     !> factors in the vector S.
+
+     pure subroutine stdlib_zlaqsb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored in band format.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = max( 1, j-kd ), j
+                       ab( kd+1+i-j, j ) = cj*s( i )*ab( kd+1+i-j, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, min( n, j+kd )
+                       ab( 1+i-j, j ) = cj*s( i )*ab( 1+i-j, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqsb
+
+     !> ZLAQSP: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_zlaqsp( uplo, n, ap, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jc
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       ap( jc+i-1 ) = cj*s( i )*ap( jc+i-1 )
+                    end do
+                    jc = jc + j
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 jc = 1
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       ap( jc+i-j ) = cj*s( i )*ap( jc+i-j )
+                    end do
+                    jc = jc + n - j + 1
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqsp
+
+     !> ZLAQSY: equilibrates a symmetric matrix A using the scaling factors
+     !> in the vector S.
+
+     pure subroutine stdlib_zlaqsy( uplo, n, a, lda, s, scond, amax, equed )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(out) :: equed
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: thresh = 0.1e+0_dp
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: cj, large, small
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 ) then
+              equed = 'N'
+              return
+           end if
+           ! initialize large and small.
+           small = stdlib_dlamch( 'SAFE MINIMUM' ) / stdlib_dlamch( 'PRECISION' )
+           large = one / small
+           if( scond>=thresh .and. amax>=small .and. amax<=large ) then
+              ! no equilibration
+              equed = 'N'
+           else
+              ! replace a by diag(s) * a * diag(s).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! upper triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = 1, j
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              else
+                 ! lower triangle of a is stored.
+                 do j = 1, n
+                    cj = s( j )
+                    do i = j, n
+                       a( i, j ) = cj*s( i )*a( i, j )
+                    end do
+                 end do
+              end if
+              equed = 'Y'
+           end if
+           return
+     end subroutine stdlib_zlaqsy
+
+     !> ZLAR1V: computes the (scaled) r-th column of the inverse of
+     !> the sumbmatrix in rows B1 through BN of the tridiagonal matrix
+     !> L D L**T - sigma I. When sigma is close to an eigenvalue, the
+     !> computed vector is an accurate eigenvector. Usually, r corresponds
+     !> to the index where the eigenvector is largest in magnitude.
+     !> The following steps accomplish this computation :
+     !> (a) Stationary qd transform,  L D L**T - sigma I = L(+) D(+) L(+)**T,
+     !> (b) Progressive qd transform, L D L**T - sigma I = U(-) D(-) U(-)**T,
+     !> (c) Computation of the diagonal elements of the inverse of
+     !> L D L**T - sigma I by combining the above transforms, and choosing
+     !> r as the index where the diagonal of the inverse is (one of the)
+     !> largest in magnitude.
+     !> (d) Computation of the (scaled) r-th column of the inverse using the
+     !> twisted factorization obtained by combining the top part of the
+     !> the stationary and the bottom part of the progressive transform.
+
+     pure subroutine stdlib_zlar1v( n, b1, bn, lambda, d, l, ld, lld,pivmin, gaptol, z, wantnc, &
+               negcnt, ztz, mingma,r, isuppz, nrminv, resid, rqcorr, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantnc
+           integer(ilp), intent(in) :: b1, bn, n
+           integer(ilp), intent(out) :: negcnt
+           integer(ilp), intent(inout) :: r
+           real(dp), intent(in) :: gaptol, lambda, pivmin
+           real(dp), intent(out) :: mingma, nrminv, resid, rqcorr, ztz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*)
+           real(dp), intent(in) :: d(*), l(*), ld(*), lld(*)
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: z(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: sawnan1, sawnan2
+           integer(ilp) :: i, indlpl, indp, inds, indumn, neg1, neg2, r1, r2
+           real(dp) :: dminus, dplus, eps, s, tmp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real
+           ! Executable Statements 
+           eps = stdlib_dlamch( 'PRECISION' )
+           if( r==0 ) then
+              r1 = b1
+              r2 = bn
+           else
+              r1 = r
+              r2 = r
+           end if
+           ! storage for lplus
+           indlpl = 0
+           ! storage for uminus
+           indumn = n
+           inds = 2*n + 1
+           indp = 3*n + 1
+           if( b1==1 ) then
+              work( inds ) = zero
+           else
+              work( inds+b1-1 ) = lld( b1-1 )
+           end if
+           ! compute the stationary transform (using the differential form)
+           ! until the index r2.
+           sawnan1 = .false.
+           neg1 = 0
+           s = work( inds+b1-1 ) - lambda
+           do i = b1, r1 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              if(dplus<zero) neg1 = neg1 + 1
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_disnan( s )
+           if( sawnan1 ) goto 60
+           do i = r1, r2 - 1
+              dplus = d( i ) + s
+              work( indlpl+i ) = ld( i ) / dplus
+              work( inds+i ) = s*work( indlpl+i )*l( i )
+              s = work( inds+i ) - lambda
+           end do
+           sawnan1 = stdlib_disnan( s )
+           60 continue
+           if( sawnan1 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg1 = 0
+              s = work( inds+b1-1 ) - lambda
+              do i = b1, r1 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 if(dplus<zero) neg1 = neg1 + 1
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+              do i = r1, r2 - 1
+                 dplus = d( i ) + s
+                 if(abs(dplus)<pivmin) dplus = -pivmin
+                 work( indlpl+i ) = ld( i ) / dplus
+                 work( inds+i ) = s*work( indlpl+i )*l( i )
+                 if( work( indlpl+i )==zero )work( inds+i ) = lld( i )
+                 s = work( inds+i ) - lambda
+              end do
+           end if
+           ! compute the progressive transform (using the differential form)
+           ! until the index r1
+           sawnan2 = .false.
+           neg2 = 0
+           work( indp+bn-1 ) = d( bn ) - lambda
+           do i = bn - 1, r1, -1
+              dminus = lld( i ) + work( indp+i )
+              tmp = d( i ) / dminus
+              if(dminus<zero) neg2 = neg2 + 1
+              work( indumn+i ) = l( i )*tmp
+              work( indp+i-1 ) = work( indp+i )*tmp - lambda
+           end do
+           tmp = work( indp+r1-1 )
+           sawnan2 = stdlib_disnan( tmp )
+           if( sawnan2 ) then
+              ! runs a slower version of the above loop if a nan is detected
+              neg2 = 0
+              do i = bn-1, r1, -1
+                 dminus = lld( i ) + work( indp+i )
+                 if(abs(dminus)<pivmin) dminus = -pivmin
+                 tmp = d( i ) / dminus
+                 if(dminus<zero) neg2 = neg2 + 1
+                 work( indumn+i ) = l( i )*tmp
+                 work( indp+i-1 ) = work( indp+i )*tmp - lambda
+                 if( tmp==zero )work( indp+i-1 ) = d( i ) - lambda
+              end do
+           end if
+           ! find the index (from r1 to r2) of the largest (in magnitude)
+           ! diagonal element of the inverse
+           mingma = work( inds+r1-1 ) + work( indp+r1-1 )
+           if( mingma<zero ) neg1 = neg1 + 1
+           if( wantnc ) then
+              negcnt = neg1 + neg2
+           else
+              negcnt = -1
+           endif
+           if( abs(mingma)==zero )mingma = eps*work( inds+r1-1 )
+           r = r1
+           do i = r1, r2 - 1
+              tmp = work( inds+i ) + work( indp+i )
+              if( tmp==zero )tmp = eps*work( inds+i )
+              if( abs( tmp )<=abs( mingma ) ) then
+                 mingma = tmp
+                 r = i + 1
+              end if
+           end do
+           ! compute the fp vector: solve n^t v = e_r
+           isuppz( 1 ) = b1
+           isuppz( 2 ) = bn
+           z( r ) = cone
+           ztz = one
+           ! compute the fp vector upwards from r
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r-1, b1, -1
+                 z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    goto 220
+                 endif
+                 ztz = ztz + real( z( i )*z( i ),KIND=dp)
+              end do
+              220 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r - 1, b1, -1
+                 if( z( i+1 )==zero ) then
+                    z( i ) = -( ld( i+1 ) / ld( i ) )*z( i+2 )
+                 else
+                    z( i ) = -( work( indlpl+i )*z( i+1 ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i ) = zero
+                    isuppz( 1 ) = i + 1
+                    go to 240
+                 end if
+                 ztz = ztz + real( z( i )*z( i ),KIND=dp)
+              end do
+              240 continue
+           endif
+           ! compute the fp vector downwards from r in blocks of size blksiz
+           if( .not.sawnan1 .and. .not.sawnan2 ) then
+              do i = r, bn-1
+                 z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 260
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=dp)
+              end do
+              260 continue
+           else
+              ! run slower loop if nan occurred.
+              do i = r, bn - 1
+                 if( z( i )==zero ) then
+                    z( i+1 ) = -( ld( i-1 ) / ld( i ) )*z( i-1 )
+                 else
+                    z( i+1 ) = -( work( indumn+i )*z( i ) )
+                 end if
+                 if( (abs(z(i))+abs(z(i+1)))* abs(ld(i))<gaptol )then
+                    z( i+1 ) = zero
+                    isuppz( 2 ) = i
+                    go to 280
+                 end if
+                 ztz = ztz + real( z( i+1 )*z( i+1 ),KIND=dp)
+              end do
+              280 continue
+           end if
+           ! compute quantities for convergence test
+           tmp = one / ztz
+           nrminv = sqrt( tmp )
+           resid = abs( mingma )*nrminv
+           rqcorr = mingma*tmp
+           return
+     end subroutine stdlib_zlar1v
+
+     !> ZLAR2V: applies a vector of complex plane rotations with real cosines
+     !> from both sides to a sequence of 2-by-2 complex Hermitian matrices,
+     !> defined by the elements of the vectors x, y and z. For i = 1,2,...,n
+     !> (       x(i)  z(i) ) :=
+     !> ( conjg(z(i)) y(i) )
+     !> (  c(i) conjg(s(i)) ) (       x(i)  z(i) ) ( c(i) -conjg(s(i)) )
+     !> ( -s(i)       c(i)  ) ( conjg(z(i)) y(i) ) ( s(i)        c(i)  )
+
+     pure subroutine stdlib_zlar2v( n, x, y, z, incx, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, n
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*)
+           complex(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: x(*), y(*), z(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix
+           real(dp) :: ci, sii, sir, t1i, t1r, t5, t6, xi, yi, zii, zir
+           complex(dp) :: si, t2, t3, t4, zi
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag
+           ! Executable Statements 
+           ix = 1
+           ic = 1
+           do i = 1, n
+              xi = real( x( ix ),KIND=dp)
+              yi = real( y( ix ),KIND=dp)
+              zi = z( ix )
+              zir = real( zi,KIND=dp)
+              zii = aimag( zi )
+              ci = c( ic )
+              si = s( ic )
+              sir = real( si,KIND=dp)
+              sii = aimag( si )
+              t1r = sir*zir - sii*zii
+              t1i = sir*zii + sii*zir
+              t2 = ci*zi
+              t3 = t2 - conjg( si )*xi
+              t4 = conjg( t2 ) + si*yi
+              t5 = ci*xi + t1r
+              t6 = ci*yi - t1r
+              x( ix ) = ci*t5 + ( sir*real( t4,KIND=dp)+sii*aimag( t4 ) )
+              y( ix ) = ci*t6 - ( sir*real( t3,KIND=dp)-sii*aimag( t3 ) )
+              z( ix ) = ci*t3 + conjg( si )*cmplx( t6, t1i,KIND=dp)
+              ix = ix + incx
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_zlar2v
+
+     !> ZLARCM: performs a very simple matrix-matrix multiplication:
+     !> C := A * B,
+     !> where A is M by M and real; B is M by N and complex;
+     !> C is M by N and complex.
+
+     pure subroutine stdlib_zlarcm( m, n, a, lda, b, ldb, c, ldc, rwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldb, ldc, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: b(ldb,*)
+           complex(dp), intent(out) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! quick return if possible.
+           if( ( m==0 ) .or. ( n==0 ) )return
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = real( b( i, j ),KIND=dp)
+              end do
+           end do
+           l = m*n + 1
+           call stdlib_dgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = rwork( l+( j-1 )*m+i-1 )
+              end do
+           end do
+           do j = 1, n
+              do i = 1, m
+                 rwork( ( j-1 )*m+i ) = aimag( b( i, j ) )
+              end do
+           end do
+           call stdlib_dgemm( 'N', 'N', m, n, m, one, a, lda, rwork, m, zero,rwork( l ), m )
+                     
+           do j = 1, n
+              do i = 1, m
+                 c( i, j ) = cmplx( real( c( i, j ),KIND=dp),rwork( l+( j-1 )*m+i-1 ),KIND=dp)
+                           
+              end do
+           end do
+           return
+     end subroutine stdlib_zlarcm
+
+     !> ZLARF: applies a complex elementary reflector H to a complex M-by-N
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H, supply conjg(tau) instead
+     !> tau.
+
+     pure subroutine stdlib_zlarf( side, m, n, v, incv, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, ldc, m, n
+           complex(dp), intent(in) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(in) :: v(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: applyleft
+           integer(ilp) :: i, lastv, lastc
+           ! Executable Statements 
+           applyleft = stdlib_lsame( side, 'L' )
+           lastv = 0
+           lastc = 0
+           if( tau/=czero ) then
+           ! set up variables for scanning v.  lastv begins pointing to the end
+           ! of v.
+              if( applyleft ) then
+                 lastv = m
+              else
+                 lastv = n
+              end if
+              if( incv>0 ) then
+                 i = 1 + (lastv-1) * incv
+              else
+                 i = 1
+              end if
+           ! look for the last non-czero row in v.
+              do while( lastv>0 .and. v( i )==czero )
+                 lastv = lastv - 1
+                 i = i - incv
+              end do
+              if( applyleft ) then
+           ! scan for the last non-czero column in c(1:lastv,:).
+                 lastc = stdlib_ilazlc(lastv, n, c, ldc)
+              else
+           ! scan for the last non-czero row in c(:,1:lastv).
+                 lastc = stdlib_ilazlr(m, lastv, c, ldc)
+              end if
+           end if
+           ! note that lastc.eq.0_dp renders the blas operations null; no special
+           ! case is needed at this level.
+           if( applyleft ) then
+              ! form  h * c
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastv,1:lastc)**h * v(1:lastv,1)
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', lastv, lastc, cone,c, ldc, v, incv, &
+                           czero, work, 1 )
+                 ! c(1:lastv,1:lastc) := c(...) - v(1:lastv,1) * w(1:lastc,1)**h
+                 call stdlib_zgerc( lastv, lastc, -tau, v, incv, work, 1, c, ldc )
+              end if
+           else
+              ! form  c * h
+              if( lastv>0 ) then
+                 ! w(1:lastc,1) := c(1:lastc,1:lastv) * v(1:lastv,1)
+                 call stdlib_zgemv( 'NO TRANSPOSE', lastc, lastv, cone, c, ldc,v, incv, czero, &
+                           work, 1 )
+                 ! c(1:lastc,1:lastv) := c(...) - w(1:lastc,1) * v(1:lastv,1)**h
+                 call stdlib_zgerc( lastc, lastv, -tau, work, 1, v, incv, c, ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_zlarf
+
+     !> ZLARFB: applies a complex block reflector H or its transpose H**H to a
+     !> complex M-by-N matrix C, from either the left or the right.
+
+     pure subroutine stdlib_zlarfb( side, trans, direct, storev, m, n, k, v, ldv,t, ldt, c, ldc, &
+               work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( storev, 'C' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1 )    (first k rows)
+                           ! ( v2 )
+                 ! where  v1  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_zcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_zlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h * v2
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c( k+1, 1 ), ldc,v( k+1, 1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2 * w**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v( k+1, 1 ), ldv, work,ldwork, cone, c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_zcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c( 1, k+&
+                                 1 ), ldc, v( k+1, 1 ), ldv,cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v( k+1, 1 ),ldv, cone, c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1 )
+                           ! ( v2 )    (last k rows)
+                 ! where  v2  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v  =  (c1**h * v1 + c2**h * v2)  (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_zcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_zlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( m-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', n,k, m-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1 * w**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',m-k, n, k, -cone, &
+                                 v, ldv, work, ldwork,cone, c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( m-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v  =  (c1*v1 + c2*v2)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_zcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( n-&
+                              k+1, 1 ), ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, k, n-k,cone, c, ldc, &
+                                 v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v**h
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,n-k, k, -cone, &
+                                 work, ldwork, v, ldv, cone,c, ldc )
+                    end if
+                    ! w := w * v2**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( n-k+1, 1 ), ldv, work,ldwork )
+                    ! c2 := c2 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( storev, 'R' ) ) then
+              if( stdlib_lsame( direct, 'F' ) ) then
+                 ! let  v =  ( v1  v2 )    (v1: first k columns)
+                 ! where  v1  is unit upper triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c1**h
+                    do j = 1, k
+                       call stdlib_zcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_zlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( m>k ) then
+                       ! w := w + c2**h * v2**h
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone,c( k+1, 1 ), ldc, v( 1, k+1 ), ldv, cone,work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c2 := c2 - v2**h * w**h
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone,v( 1, k+1 ), ldv, work, ldwork, cone,c( k+1, 1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( j, i ) = c( j, i ) - conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c1
+                    do j = 1, k
+                       call stdlib_zcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v1**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v, ldv, work, ldwork )
+                    if( n>k ) then
+                       ! w := w + c2 * v2**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c( 1, k+1 ), ldc,v( 1, k+1 ), ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c2 := c2 - w * v2
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v( 1, k+1 ), ldv, cone,c( 1, k+1 ), ldc )
+                    end if
+                    ! w := w * v1
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v, &
+                              ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, j ) = c( i, j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              else
+                 ! let  v =  ( v1  v2 )    (v2: last k columns)
+                 ! where  v2  is unit lower triangular.
+                 if( stdlib_lsame( side, 'L' ) ) then
+                    ! form  h * c  or  h**h * c  where  c = ( c1 )
+                                                          ! ( c2 )
+                    ! w := c**h * v**h  =  (c1**h * v1**h + c2**h * v2**h) (stored in work)
+                    ! w := c2**h
+                    do j = 1, k
+                       call stdlib_zcopy( n, c( m-k+j, 1 ), ldc, work( 1, j ), 1 )
+                       call stdlib_zlacgv( n, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', n, k, cone,&
+                               v( 1, m-k+1 ), ldv, work,ldwork )
+                    if( m>k ) then
+                       ! w := w + c1**h * v1**h
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', n, k, m-k, &
+                                 cone, c,ldc, v, ldv, cone, work, ldwork )
+                    end if
+                    ! w := w * t**h  or  w * t
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - v**h * w**h
+                    if( m>k ) then
+                       ! c1 := c1 - v1**h * w**h
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE','CONJUGATE TRANSPOSE', m-k, n, k, &
+                                 -cone, v,ldv, work, ldwork, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n,k, cone, v( 1, &
+                              m-k+1 ), ldv, work, ldwork )
+                    ! c2 := c2 - w**h
+                    do j = 1, k
+                       do i = 1, n
+                          c( m-k+j, i ) = c( m-k+j, i ) -conjg( work( i, j ) )
+                       end do
+                    end do
+                 else if( stdlib_lsame( side, 'R' ) ) then
+                    ! form  c * h  or  c * h**h  where  c = ( c1  c2 )
+                    ! w := c * v**h  =  (c1*v1**h + c2*v2**h)  (stored in work)
+                    ! w := c2
+                    do j = 1, k
+                       call stdlib_zcopy( m, c( 1, n-k+j ), 1, work( 1, j ), 1 )
+                    end do
+                    ! w := w * v2**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', m, k, cone,&
+                               v( 1, n-k+1 ), ldv, work,ldwork )
+                    if( n>k ) then
+                       ! w := w + c1 * v1**h
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m,k, n-k, cone, &
+                                 c, ldc, v, ldv, cone, work,ldwork )
+                    end if
+                    ! w := w * t  or  w * t**h
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k,cone, t, ldt, &
+                              work, ldwork )
+                    ! c := c - w * v
+                    if( n>k ) then
+                       ! c1 := c1 - w * v1
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, n-k, k,-cone, work, &
+                                 ldwork, v, ldv, cone, c, ldc )
+                    end if
+                    ! w := w * v2
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', m,k, cone, v( 1, &
+                              n-k+1 ), ldv, work, ldwork )
+                    ! c1 := c1 - w
+                    do j = 1, k
+                       do i = 1, m
+                          c( i, n-k+j ) = c( i, n-k+j ) - work( i, j )
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zlarfb
+
+     !> ZLARFB_GETT: applies a complex Householder block reflector H from the
+     !> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
+     !> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+     !> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+     !> in the array B. The block reflector H is stored in a compact
+     !> WY-representation, where the elementary reflectors are in the
+     !> arrays A, B and T. See Further Details section.
+
+     pure subroutine stdlib_zlarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb,work, ldwork )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: ident
+           integer(ilp), intent(in) :: k, lda, ldb, ldt, ldwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(in) :: t(ldt,*)
+           complex(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnotident
+           integer(ilp) :: i, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<0 .or. n<=0 .or. k==0 .or. k>n )return
+           lnotident = .not.stdlib_lsame( ident, 'I' )
+           ! ------------------------------------------------------------------
+           ! first step. computation of the column block 2:
+              ! ( a2 ) := h * ( a2 )
+              ! ( b2 )        ( b2 )
+           ! ------------------------------------------------------------------
+           if( n>k ) then
+              ! col2_(1) compute w2: = a2. therefore, copy a2 = a(1:k, k+1:n)
+              ! into w2=work(1:k, 1:n-k) column-by-column.
+              do j = 1, n-k
+                 call stdlib_zcopy( k, a( 1, k+j ), 1, work( 1, j ), 1 )
+              end do
+              if( lnotident ) then
+                 ! col2_(2) compute w2: = (v1**h) * w2 = (a1**h) * w2,
+                 ! v1 is not an identy matrix, but unit lower-triangular
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_ztrmm( 'L', 'L', 'C', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(3) compute w2: = w2 + (v2**h) * b2 = w2 + (b1**h) * b2
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_zgemm( 'C', 'N', k, n-k, m, cone, b, ldb,b( 1, k+1 ), ldb, cone, &
+                           work, ldwork )
+              end if
+              ! col2_(4) compute w2: = t * w2,
+              ! t is upper-triangular.
+              call stdlib_ztrmm( 'L', 'U', 'N', 'N', k, n-k, cone, t, ldt,work, ldwork )
+              ! col2_(5) compute b2: = b2 - v2 * w2 = b2 - b1 * w2,
+              ! v2 stored in b1.
+              if( m>0 ) then
+                 call stdlib_zgemm( 'N', 'N', m, n-k, k, -cone, b, ldb,work, ldwork, cone, b( 1, &
+                           k+1 ), ldb )
+              end if
+              if( lnotident ) then
+                 ! col2_(6) compute w2: = v1 * w2 = a1 * w2,
+                 ! v1 is not an identity matrix, but unit lower-triangular,
+                 ! v1 stored in a1 (diagonal ones are not stored).
+                 call stdlib_ztrmm( 'L', 'L', 'N', 'U', k, n-k, cone, a, lda,work, ldwork )
+                           
+              end if
+              ! col2_(7) compute a2: = a2 - w2 =
+                                   ! = a(1:k, k+1:n-k) - work(1:k, 1:n-k),
+              ! column-by-column.
+              do j = 1, n-k
+                 do i = 1, k
+                    a( i, k+j ) = a( i, k+j ) - work( i, j )
+                 end do
+              end do
+           end if
+           ! ------------------------------------------------------------------
+           ! second step. computation of the column block 1:
+              ! ( a1 ) := h * ( a1 )
+              ! ( b1 )        (  0 )
+           ! ------------------------------------------------------------------
+           ! col1_(1) compute w1: = a1. copy the upper-triangular
+           ! a1 = a(1:k, 1:k) into the upper-triangular
+           ! w1 = work(1:k, 1:k) column-by-column.
+           do j = 1, k
+              call stdlib_zcopy( j, a( 1, j ), 1, work( 1, j ), 1 )
+           end do
+           ! set the subdiagonal elements of w1 to zero column-by-column.
+           do j = 1, k - 1
+              do i = j + 1, k
+                 work( i, j ) = czero
+              end do
+           end do
+           if( lnotident ) then
+              ! col1_(2) compute w1: = (v1**h) * w1 = (a1**h) * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular with zeroes below the diagonal.
+              call stdlib_ztrmm( 'L', 'L', 'C', 'U', k, k, cone, a, lda,work, ldwork )
+           end if
+           ! col1_(3) compute w1: = t * w1,
+           ! t is upper-triangular,
+           ! w1 is upper-triangular with zeroes below the diagonal.
+           call stdlib_ztrmm( 'L', 'U', 'N', 'N', k, k, cone, t, ldt,work, ldwork )
+           ! col1_(4) compute b1: = - v2 * w1 = - b1 * w1,
+           ! v2 = b1, w1 is upper-triangular with zeroes below the diagonal.
+           if( m>0 ) then
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', m, k, -cone, work, ldwork,b, ldb )
+           end if
+           if( lnotident ) then
+              ! col1_(5) compute w1: = v1 * w1 = a1 * w1,
+              ! v1 is not an identity matrix, but unit lower-triangular
+              ! v1 stored in a1 (diagonal ones are not stored),
+              ! w1 is upper-triangular on input with zeroes below the diagonal,
+              ! and square on output.
+              call stdlib_ztrmm( 'L', 'L', 'N', 'U', k, k, cone, a, lda,work, ldwork )
+              ! col1_(6) compute a1: = a1 - w1 = a(1:k, 1:k) - work(1:k, 1:k)
+              ! column-by-column. a1 is upper-triangular on input.
+              ! if ident, a1 is square on output, and w1 is square,
+              ! if not ident, a1 is upper-triangular on output,
+              ! w1 is upper-triangular.
+              ! col1_(6)_a compute elements of a1 below the diagonal.
+              do j = 1, k - 1
+                 do i = j + 1, k
+                    a( i, j ) = - work( i, j )
+                 end do
+              end do
+           end if
+           ! col1_(6)_b compute elements of a1 on and above the diagonal.
+           do j = 1, k
+              do i = 1, j
+                 a( i, j ) = a( i, j ) - work( i, j )
+              end do
+           end do
+           return
+     end subroutine stdlib_zlarfb_gett
+
+     !> ZLARFG: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, with beta real, and x is an
+     !> (n-1)-element complex vector. H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+     !> Otherwise  1 <= real(tau) <= 2  and  abs(tau-1) <= 1 .
+
+     pure subroutine stdlib_zlarfg( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(dp), intent(inout) :: alpha
+           complex(dp), intent(out) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(dp) :: alphi, alphr, beta, rsafmn, safmin, xnorm
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_dznrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=dp)
+           alphi = aimag( alpha )
+           if( xnorm==zero .and. alphi==zero ) then
+              ! h  =  i
+              tau = zero
+           else
+              ! general case
+              beta = -sign( stdlib_dlapy3( alphr, alphi, xnorm ), alphr )
+              safmin = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'E' )
+              rsafmn = one / safmin
+              knt = 0
+              if( abs( beta )<safmin ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_zdscal( n-1, rsafmn, x, incx )
+                 beta = beta*rsafmn
+                 alphi = alphi*rsafmn
+                 alphr = alphr*rsafmn
+                 if( (abs( beta )<safmin) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least safmin
+                 xnorm = stdlib_dznrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=dp)
+                 beta = -sign( stdlib_dlapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              tau = cmplx( ( beta-alphr ) / beta, -alphi / beta,KIND=dp)
+              alpha = stdlib_zladiv( cmplx( one,KIND=dp), alpha-beta )
+              call stdlib_zscal( n-1, alpha, x, incx )
+              ! if alpha is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*safmin
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_zlarfg
+
+     !> ZLARFGP: generates a complex elementary reflector H of order n, such
+     !> that
+     !> H**H * ( alpha ) = ( beta ),   H**H * H = I.
+     !> (   x   )   (   0  )
+     !> where alpha and beta are scalars, beta is real and non-negative, and
+     !> x is an (n-1)-element complex vector.  H is represented in the form
+     !> H = I - tau * ( 1 ) * ( 1 v**H ) ,
+     !> ( v )
+     !> where tau is a complex scalar and v is a complex (n-1)-element
+     !> vector. Note that H is not hermitian.
+     !> If the elements of x are all zero and alpha is real, then tau = 0
+     !> and H is taken to be the unit matrix.
+
+     subroutine stdlib_zlarfgp( n, alpha, x, incx, tau )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, n
+           complex(dp), intent(inout) :: alpha
+           complex(dp), intent(out) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, knt
+           real(dp) :: alphi, alphr, beta, bignum, smlnum, xnorm
+           complex(dp) :: savealpha
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign
+           ! Executable Statements 
+           if( n<=0 ) then
+              tau = zero
+              return
+           end if
+           xnorm = stdlib_dznrm2( n-1, x, incx )
+           alphr = real( alpha,KIND=dp)
+           alphi = aimag( alpha )
+           if( xnorm==zero ) then
+              ! h  =  [1-alpha/abs(alpha) 0; 0 i], sign chosen so alpha >= 0.
+              if( alphi==zero ) then
+                 if( alphr>=zero ) then
+                    ! when tau.eq.zero, the vector is special-cased to be
+                    ! all zeros in the application routines.  we do not need
+                    ! to clear it.
+                    tau = zero
+                 else
+                    ! however, the application routines rely on explicit
+                    ! zero checks when tau.ne.zero, and we must clear x.
+                    tau = two
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    alpha = -alpha
+                 end if
+              else
+                 ! only "reflecting" the diagonal entry to be real and non-negative.
+                 xnorm = stdlib_dlapy2( alphr, alphi )
+                 tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=dp)
+                 do j = 1, n-1
+                    x( 1 + (j-1)*incx ) = zero
+                 end do
+                 alpha = xnorm
+              end if
+           else
+              ! general case
+              beta = sign( stdlib_dlapy3( alphr, alphi, xnorm ), alphr )
+              smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'E' )
+              bignum = one / smlnum
+              knt = 0
+              if( abs( beta )<smlnum ) then
+                 ! xnorm, beta may be inaccurate; scale x and recompute them
+                 10 continue
+                 knt = knt + 1
+                 call stdlib_zdscal( n-1, bignum, x, incx )
+                 beta = beta*bignum
+                 alphi = alphi*bignum
+                 alphr = alphr*bignum
+                 if( (abs( beta )<smlnum) .and. (knt < 20) )go to 10
+                 ! new beta is at most 1, at least smlnum
+                 xnorm = stdlib_dznrm2( n-1, x, incx )
+                 alpha = cmplx( alphr, alphi,KIND=dp)
+                 beta = sign( stdlib_dlapy3( alphr, alphi, xnorm ), alphr )
+              end if
+              savealpha = alpha
+              alpha = alpha + beta
+              if( beta<zero ) then
+                 beta = -beta
+                 tau = -alpha / beta
+              else
+                 alphr = alphi * (alphi/real( alpha,KIND=dp))
+                 alphr = alphr + xnorm * (xnorm/real( alpha,KIND=dp))
+                 tau = cmplx( alphr/beta, -alphi/beta,KIND=dp)
+                 alpha = cmplx( -alphr, alphi,KIND=dp)
+              end if
+              alpha = stdlib_zladiv( cmplx( one,KIND=dp), alpha )
+              if ( abs(tau)<=smlnum ) then
+                 ! in the case where the computed tau ends up being a denormalized number,
+                 ! it loses relative accuracy. this is a big problem. solution: flush tau
+                 ! to zero (or two or whatever makes a nonnegative real number for beta).
+                 ! (bug report provided by pat quillen from mathworks on jul 29, 2009.)
+                 ! (thanks pat. thanks mathworks.)
+                 alphr = real( savealpha,KIND=dp)
+                 alphi = aimag( savealpha )
+                 if( alphi==zero ) then
+                    if( alphr>=zero ) then
+                       tau = zero
+                    else
+                       tau = two
+                       do j = 1, n-1
+                          x( 1 + (j-1)*incx ) = zero
+                       end do
+                       beta = real( -savealpha,KIND=dp)
+                    end if
+                 else
+                    xnorm = stdlib_dlapy2( alphr, alphi )
+                    tau = cmplx( one - alphr / xnorm, -alphi / xnorm,KIND=dp)
+                    do j = 1, n-1
+                       x( 1 + (j-1)*incx ) = zero
+                    end do
+                    beta = xnorm
+                 end if
+              else
+                 ! this is the general case.
+                 call stdlib_zscal( n-1, alpha, x, incx )
+              end if
+              ! if beta is subnormal, it may lose relative accuracy
+              do j = 1, knt
+                 beta = beta*smlnum
+              end do
+              alpha = beta
+           end if
+           return
+     end subroutine stdlib_zlarfgp
+
+     !> ZLARFT: forms the triangular factor T of a complex block reflector H
+     !> of order n, which is defined as a product of k elementary reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+
+     pure subroutine stdlib_zlarft( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(dp), intent(out) :: t(ldt,*)
+           complex(dp), intent(in) :: tau(*), v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, prevlastv, lastv
+           ! Executable Statements 
+           ! quick return if possible
+           if( n==0 )return
+           if( stdlib_lsame( direct, 'F' ) ) then
+              prevlastv = n
+              do i = 1, k
+                 prevlastv = max( prevlastv, i )
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = 1, i
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( stdlib_lsame( storev, 'C' ) ) then
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( lastv, i )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * conjg( v( i , j ) )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(i:j,1:i-1)**h * v(i:j,i)
+                       call stdlib_zgemv( 'CONJUGATE TRANSPOSE', j-i, i-1,-tau( i ), v( i+1, 1 ), &
+                                 ldv,v( i+1, i ), 1, cone, t( 1, i ), 1 )
+                    else
+                       ! skip any trailing zeros.
+                       do lastv = n, i+1, -1
+                          if( v( i, lastv )/=czero ) exit
+                       end do
+                       do j = 1, i-1
+                          t( j, i ) = -tau( i ) * v( j , i )
+                       end do
+                       j = min( lastv, prevlastv )
+                       ! t(1:i-1,i) := - tau(i) * v(1:i-1,i:j) * v(i,i:j)**h
+                       call stdlib_zgemm( 'N', 'C', i-1, 1, j-i, -tau( i ),v( 1, i+1 ), ldv, v( i,&
+                                  i+1 ), ldv,cone, t( 1, i ), ldt )
+                    end if
+                    ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+                    call stdlib_ztrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', i-1, t,ldt, t( 1, i ),&
+                               1 )
+                    t( i, i ) = tau( i )
+                    if( i>1 ) then
+                       prevlastv = max( prevlastv, lastv )
+                    else
+                       prevlastv = lastv
+                    end if
+                  end if
+              end do
+           else
+              prevlastv = 1
+              do i = k, 1, -1
+                 if( tau( i )==czero ) then
+                    ! h(i)  =  i
+                    do j = i, k
+                       t( j, i ) = czero
+                    end do
+                 else
+                    ! general case
+                    if( i<k ) then
+                       if( stdlib_lsame( storev, 'C' ) ) then
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( lastv, i )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * conjg( v( n-k+i , j ) )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(j:n-k+i,i+1:k)**h * v(j:n-k+i,i)
+                          call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k+i-j, k-i,-tau( i ), v( j, &
+                                    i+1 ), ldv, v( j, i ),1, cone, t( i+1, i ), 1 )
+                       else
+                          ! skip any leading zeros.
+                          do lastv = 1, i-1
+                             if( v( i, lastv )/=czero ) exit
+                          end do
+                          do j = i+1, k
+                             t( j, i ) = -tau( i ) * v( j, n-k+i )
+                          end do
+                          j = max( lastv, prevlastv )
+                          ! t(i+1:k,i) = -tau(i) * v(i+1:k,j:n-k+i) * v(i,j:n-k+i)**h
+                          call stdlib_zgemm( 'N', 'C', k-i, 1, n-k+i-j, -tau( i ),v( i+1, j ), &
+                                    ldv, v( i, j ), ldv,cone, t( i+1, i ), ldt )
+                       end if
+                       ! t(i+1:k,i) := t(i+1:k,i+1:k) * t(i+1:k,i)
+                       call stdlib_ztrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                                 ldt, t( i+1, i ), 1 )
+                       if( i>1 ) then
+                          prevlastv = min( prevlastv, lastv )
+                       else
+                          prevlastv = lastv
+                       end if
+                    end if
+                    t( i, i ) = tau( i )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zlarft
+
+     !> ZLARFX: applies a complex elementary reflector H to a complex m by n
+     !> matrix C, from either the left or the right. H is represented in the
+     !> form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix
+     !> This version uses inline code if H has order < 11.
+
+     pure subroutine stdlib_zlarfx( side, m, n, v, tau, c, ldc, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: ldc, m, n
+           complex(dp), intent(in) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(in) :: v(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j
+           complex(dp) :: sum, t1, t10, t2, t3, t4, t5, t6, t7, t8, t9, v1, v10, v2, v3, v4, v5, &
+                     v6, v7, v8, v9
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( tau==czero )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c, where h has order m.
+              go to ( 10, 30, 50, 70, 90, 110, 130, 150,170, 190 )m
+              ! code for general m
+              call stdlib_zlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              10 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, n
+                 c( 1, j ) = t1*c( 1, j )
+              end do
+              go to 410
+              30 continue
+              ! special code for 2 x 2 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+              end do
+              go to 410
+              50 continue
+              ! special code for 3 x 3 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+              end do
+              go to 410
+              70 continue
+              ! special code for 4 x 4 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+              end do
+              go to 410
+              90 continue
+              ! special code for 5 x 5 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j )
+                           
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+              end do
+              go to 410
+              110 continue
+              ! special code for 6 x 6 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+              end do
+              go to 410
+              130 continue
+              ! special code for 7 x 7 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+              end do
+              go to 410
+              150 continue
+              ! special code for 8 x 8 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+              end do
+              go to 410
+              170 continue
+              ! special code for 9 x 9 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+              end do
+              go to 410
+              190 continue
+              ! special code for 10 x 10 householder
+              v1 = conjg( v( 1 ) )
+              t1 = tau*conjg( v1 )
+              v2 = conjg( v( 2 ) )
+              t2 = tau*conjg( v2 )
+              v3 = conjg( v( 3 ) )
+              t3 = tau*conjg( v3 )
+              v4 = conjg( v( 4 ) )
+              t4 = tau*conjg( v4 )
+              v5 = conjg( v( 5 ) )
+              t5 = tau*conjg( v5 )
+              v6 = conjg( v( 6 ) )
+              t6 = tau*conjg( v6 )
+              v7 = conjg( v( 7 ) )
+              t7 = tau*conjg( v7 )
+              v8 = conjg( v( 8 ) )
+              t8 = tau*conjg( v8 )
+              v9 = conjg( v( 9 ) )
+              t9 = tau*conjg( v9 )
+              v10 = conjg( v( 10 ) )
+              t10 = tau*conjg( v10 )
+              do j = 1, n
+                 sum = v1*c( 1, j ) + v2*c( 2, j ) + v3*c( 3, j ) +v4*c( 4, j ) + v5*c( 5, j ) + &
+                           v6*c( 6, j ) +v7*c( 7, j ) + v8*c( 8, j ) + v9*c( 9, j ) +v10*c( 10, j )
+                 c( 1, j ) = c( 1, j ) - sum*t1
+                 c( 2, j ) = c( 2, j ) - sum*t2
+                 c( 3, j ) = c( 3, j ) - sum*t3
+                 c( 4, j ) = c( 4, j ) - sum*t4
+                 c( 5, j ) = c( 5, j ) - sum*t5
+                 c( 6, j ) = c( 6, j ) - sum*t6
+                 c( 7, j ) = c( 7, j ) - sum*t7
+                 c( 8, j ) = c( 8, j ) - sum*t8
+                 c( 9, j ) = c( 9, j ) - sum*t9
+                 c( 10, j ) = c( 10, j ) - sum*t10
+              end do
+              go to 410
+           else
+              ! form  c * h, where h has order n.
+              go to ( 210, 230, 250, 270, 290, 310, 330, 350,370, 390 )n
+              ! code for general n
+              call stdlib_zlarf( side, m, n, v, 1, tau, c, ldc, work )
+              go to 410
+              210 continue
+              ! special code for 1 x 1 householder
+              t1 = cone - tau*v( 1 )*conjg( v( 1 ) )
+              do j = 1, m
+                 c( j, 1 ) = t1*c( j, 1 )
+              end do
+              go to 410
+              230 continue
+              ! special code for 2 x 2 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+              end do
+              go to 410
+              250 continue
+              ! special code for 3 x 3 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+              end do
+              go to 410
+              270 continue
+              ! special code for 4 x 4 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+              end do
+              go to 410
+              290 continue
+              ! special code for 5 x 5 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 )
+                           
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+              end do
+              go to 410
+              310 continue
+              ! special code for 6 x 6 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+              end do
+              go to 410
+              330 continue
+              ! special code for 7 x 7 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+              end do
+              go to 410
+              350 continue
+              ! special code for 8 x 8 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+              end do
+              go to 410
+              370 continue
+              ! special code for 9 x 9 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+              end do
+              go to 410
+              390 continue
+              ! special code for 10 x 10 householder
+              v1 = v( 1 )
+              t1 = tau*conjg( v1 )
+              v2 = v( 2 )
+              t2 = tau*conjg( v2 )
+              v3 = v( 3 )
+              t3 = tau*conjg( v3 )
+              v4 = v( 4 )
+              t4 = tau*conjg( v4 )
+              v5 = v( 5 )
+              t5 = tau*conjg( v5 )
+              v6 = v( 6 )
+              t6 = tau*conjg( v6 )
+              v7 = v( 7 )
+              t7 = tau*conjg( v7 )
+              v8 = v( 8 )
+              t8 = tau*conjg( v8 )
+              v9 = v( 9 )
+              t9 = tau*conjg( v9 )
+              v10 = v( 10 )
+              t10 = tau*conjg( v10 )
+              do j = 1, m
+                 sum = v1*c( j, 1 ) + v2*c( j, 2 ) + v3*c( j, 3 ) +v4*c( j, 4 ) + v5*c( j, 5 ) + &
+                           v6*c( j, 6 ) +v7*c( j, 7 ) + v8*c( j, 8 ) + v9*c( j, 9 ) +v10*c( j, 10 )
+                 c( j, 1 ) = c( j, 1 ) - sum*t1
+                 c( j, 2 ) = c( j, 2 ) - sum*t2
+                 c( j, 3 ) = c( j, 3 ) - sum*t3
+                 c( j, 4 ) = c( j, 4 ) - sum*t4
+                 c( j, 5 ) = c( j, 5 ) - sum*t5
+                 c( j, 6 ) = c( j, 6 ) - sum*t6
+                 c( j, 7 ) = c( j, 7 ) - sum*t7
+                 c( j, 8 ) = c( j, 8 ) - sum*t8
+                 c( j, 9 ) = c( j, 9 ) - sum*t9
+                 c( j, 10 ) = c( j, 10 ) - sum*t10
+              end do
+              go to 410
+           end if
+           410 continue
+           return
+     end subroutine stdlib_zlarfx
+
+     !> ZLARFY: applies an elementary reflector, or Householder matrix, H,
+     !> to an n x n Hermitian matrix C, from both the left and the right.
+     !> H is represented in the form
+     !> H = I - tau * v * v'
+     !> where  tau  is a scalar and  v  is a vector.
+     !> If  tau  is  zero, then  H  is taken to be the unit matrix.
+
+     pure subroutine stdlib_zlarfy( uplo, n, v, incv, tau, c, ldc, work )
+        ! -- lapack test routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incv, ldc, n
+           complex(dp), intent(in) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(in) :: v(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: alpha
+           ! Executable Statements 
+           if( tau==czero )return
+           ! form  w:= c * v
+           call stdlib_zhemv( uplo, n, cone, c, ldc, v, incv, czero, work, 1 )
+           alpha = -chalf*tau*stdlib_zdotc( n, work, 1, v, incv )
+           call stdlib_zaxpy( n, alpha, v, incv, work, 1 )
+           ! c := c - v * w' - w * v'
+           call stdlib_zher2( uplo, n, -tau, v, incv, work, 1, c, ldc )
+           return
+     end subroutine stdlib_zlarfy
+
+     !> ZLARNV: returns a vector of n random complex numbers from a uniform or
+     !> normal distribution.
+
+     pure subroutine stdlib_zlarnv( idist, iseed, n, x )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: idist, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: iseed(4)
+           complex(dp), intent(out) :: x(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: lv = 128
+           real(dp), parameter :: twopi = 6.28318530717958647692528676655900576839e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, il, iv
+           ! Local Arrays 
+           real(dp) :: u(lv)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,exp,log,min,sqrt
+           ! Executable Statements 
+           do 60 iv = 1, n, lv / 2
+              il = min( lv / 2, n-iv+1 )
+              ! call stdlib_dlaruv to generate 2*il realnumbers from a uniform (0,1,KIND=dp)
+              ! distribution (2*il <= lv)
+              call stdlib_dlaruv( iseed, 2*il, u )
+              if( idist==1 ) then
+                 ! copy generated numbers
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( u( 2*i-1 ), u( 2*i ),KIND=dp)
+                 end do
+              else if( idist==2 ) then
+                 ! convert generated numbers to uniform (-1,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = cmplx( two*u( 2*i-1 )-one,two*u( 2*i )-one,KIND=dp)
+                 end do
+              else if( idist==3 ) then
+                 ! convert generated numbers to normal (0,1) distribution
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( -two*log( u( 2*i-1 ) ) )*exp( cmplx( zero, twopi*u( 2*i ),&
+                              KIND=dp) )
+                 end do
+              else if( idist==4 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit disk
+                 do i = 1, il
+                    x( iv+i-1 ) = sqrt( u( 2*i-1 ) )*exp( cmplx( zero, twopi*u( 2*i ),KIND=dp) )
+                              
+                 end do
+              else if( idist==5 ) then
+                 ! convert generated numbers to complex numbers uniformly
+                 ! distributed on the unit circle
+                 do i = 1, il
+                    x( iv+i-1 ) = exp( cmplx( zero, twopi*u( 2*i ),KIND=dp) )
+                 end do
+              end if
+              60 continue
+           return
+     end subroutine stdlib_zlarnv
+
+     !> !
+     !>
+     !> ZLARTG: generates a plane rotation so that
+     !> [  C         S  ] . [ F ]  =  [ R ]
+     !> [ -conjg(S)  C  ]   [ G ]     [ 0 ]
+     !> where C is real and C**2 + |S|**2 = 1.
+     !> The mathematical formulas used for C and S are
+     !> sgn(x) = {  x / |x|,   x != 0
+     !> {  1,         x = 0
+     !> R = sgn(F) * sqrt(|F|**2 + |G|**2)
+     !> C = |F| / sqrt(|F|**2 + |G|**2)
+     !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2)
+     !> When F and G are real, the formulas simplify to C = F/R and
+     !> S = G/R, and the returned values of C, S, and R should be
+     !> identical to those returned by DLARTG.
+     !> The algorithm used to compute these quantities incorporates scaling
+     !> to avoid overflow or underflow in computing the square root of the
+     !> sum of squares.
+     !> This is a faster version of the BLAS1 routine ZROTG, except for
+     !> the following differences:
+     !> F and G are unchanged on return.
+     !> If G=0, then C=1 and S=0.
+     !> If F=0, then C=0 and S is chosen so that R is real.
+     !> Below, wp=>dp stands for double precision from LA_CONSTANTS module.
+
+     pure subroutine stdlib_zlartg( f, g, c, s, r )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! february 2021
+        ! Scalar Arguments 
+        real(dp), intent(out) :: c
+        complex(dp), intent(in) :: f, g
+        complex(dp), intent(out) :: r, s
+        ! Local Scalars 
+        real(dp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w
+        complex(dp) :: fs, gs, t
+        ! Intrinsic Functions 
+        intrinsic :: abs,aimag,conjg,max,min,real,sqrt
+        ! Statement Functions 
+        real(dp) :: abssq
+        ! Statement Function Definitions 
+        abssq( t ) = real( t,KIND=dp)**2 + aimag( t )**2
+        ! Executable Statements 
+        if( g == czero ) then
+           c = one
+           s = czero
+           r = f
+        else if( f == czero ) then
+           c = zero
+           g1 = max( abs(real(g,KIND=dp)), abs(aimag(g)) )
+           if( g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              g2 = abssq( g )
+              d = sqrt( g2 )
+              s = conjg( g ) / d
+              r = d
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              d = sqrt( g2 )
+              s = conjg( gs ) / d
+              r = d*u
+           end if
+        else
+           f1 = max( abs(real(f,KIND=dp)), abs(aimag(f)) )
+           g1 = max( abs(real(g,KIND=dp)), abs(aimag(g)) )
+     if( f1 > rtmin .and. f1 < rtmax .and.          g1 > rtmin .and. g1 < rtmax ) then
+              ! use unscaled algorithm
+              f2 = abssq( f )
+              g2 = abssq( g )
+              h2 = f2 + g2
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = f2*p
+              s = conjg( g )*( f*p )
+              r = f*( h2*p )
+           else
+              ! use scaled algorithm
+              u = min( safmax, max( safmin, f1, g1 ) )
+              uu = one / u
+              gs = g*uu
+              g2 = abssq( gs )
+              if( f1*uu < rtmin ) then
+                 ! f is not well-scaled when scaled by g1.
+                 ! use a different scaling for f.
+                 v = min( safmax, max( safmin, f1 ) )
+                 vv = one / v
+                 w = v * uu
+                 fs = f*vv
+                 f2 = abssq( fs )
+                 h2 = f2*w**2 + g2
+              else
+                 ! otherwise use the same scaling for f and g.
+                 w = one
+                 fs = f*uu
+                 f2 = abssq( fs )
+                 h2 = f2 + g2
+              end if
+              if( f2 > rtmin .and. h2 < rtmax ) then
+                 d = sqrt( f2*h2 )
+              else
+                 d = sqrt( f2 )*sqrt( h2 )
+              end if
+              p = 1 / d
+              c = ( f2*p )*w
+              s = conjg( gs )*( fs*p )
+              r = ( fs*( h2*p ) )*u
+           end if
+        end if
+        return
+     end subroutine stdlib_zlartg
+
+     !> ZLARTV: applies a vector of complex plane rotations with real cosines
+     !> to elements of the complex vectors x and y. For i = 1,2,...,n
+     !> ( x(i) ) := (        c(i)   s(i) ) ( x(i) )
+     !> ( y(i) )    ( -conjg(s(i))  c(i) ) ( y(i) )
+
+     pure subroutine stdlib_zlartv( n, x, incx, y, incy, c, s, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*)
+           complex(dp), intent(in) :: s(*)
+           complex(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ic, ix, iy
+           complex(dp) :: xi, yi
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ix = 1
+           iy = 1
+           ic = 1
+           do i = 1, n
+              xi = x( ix )
+              yi = y( iy )
+              x( ix ) = c( ic )*xi + s( ic )*yi
+              y( iy ) = c( ic )*yi - conjg( s( ic ) )*xi
+              ix = ix + incx
+              iy = iy + incy
+              ic = ic + incc
+           end do
+           return
+     end subroutine stdlib_zlartv
+
+     !> ZLARZ: applies a complex elementary reflector H to a complex
+     !> M-by-N matrix C, from either the left or the right. H is represented
+     !> in the form
+     !> H = I - tau * v * v**H
+     !> where tau is a complex scalar and v is a complex vector.
+     !> If tau = 0, then H is taken to be the unit matrix.
+     !> To apply H**H (the conjugate transpose of H), supply conjg(tau) instead
+     !> tau.
+     !> H is a product of k elementary reflectors as returned by ZTZRZF.
+
+     pure subroutine stdlib_zlarz( side, m, n, l, v, incv, tau, c, ldc, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side
+           integer(ilp), intent(in) :: incv, l, ldc, m, n
+           complex(dp), intent(in) :: tau
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(in) :: v(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Executable Statements 
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c
+              if( tau/=czero ) then
+                 ! w( 1:n ) = conjg( c( 1, 1:n ) )
+                 call stdlib_zcopy( n, c, ldc, work, 1 )
+                 call stdlib_zlacgv( n, work, 1 )
+                 ! w( 1:n ) = conjg( w( 1:n ) + c( m-l+1:m, 1:n )**h * v( 1:l ) )
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', l, n, cone, c( m-l+1, 1 ),ldc, v, incv,&
+                            cone, work, 1 )
+                 call stdlib_zlacgv( n, work, 1 )
+                 ! c( 1, 1:n ) = c( 1, 1:n ) - tau * w( 1:n )
+                 call stdlib_zaxpy( n, -tau, work, 1, c, ldc )
+                 ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                     ! tau * v( 1:l ) * w( 1:n )**h
+                 call stdlib_zgeru( l, n, -tau, v, incv, work, 1, c( m-l+1, 1 ),ldc )
+              end if
+           else
+              ! form  c * h
+              if( tau/=czero ) then
+                 ! w( 1:m ) = c( 1:m, 1 )
+                 call stdlib_zcopy( m, c, 1, work, 1 )
+                 ! w( 1:m ) = w( 1:m ) + c( 1:m, n-l+1:n, 1:n ) * v( 1:l )
+                 call stdlib_zgemv( 'NO TRANSPOSE', m, l, cone, c( 1, n-l+1 ), ldc,v, incv, cone, &
+                           work, 1 )
+                 ! c( 1:m, 1 ) = c( 1:m, 1 ) - tau * w( 1:m )
+                 call stdlib_zaxpy( m, -tau, work, 1, c, 1 )
+                 ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                     ! tau * w( 1:m ) * v( 1:l )**h
+                 call stdlib_zgerc( m, l, -tau, work, 1, v, incv, c( 1, n-l+1 ),ldc )
+              end if
+           end if
+           return
+     end subroutine stdlib_zlarz
+
+     !> ZLARZB: applies a complex block reflector H or its transpose H**H
+     !> to a complex distributed M-by-N  C from the left or the right.
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_zlarzb( side, trans, direct, storev, m, n, k, l, v,ldv, t, ldt, c, &
+               ldc, work, ldwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, ldc, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: c(ldc,*), t(ldt,*), v(ldv,*)
+           complex(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           character :: transt
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLARZB', -info )
+              return
+           end if
+           if( stdlib_lsame( trans, 'N' ) ) then
+              transt = 'C'
+           else
+              transt = 'N'
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  h * c  or  h**h * c
+              ! w( 1:n, 1:k ) = c( 1:k, 1:n )**h
+              do j = 1, k
+                 call stdlib_zcopy( n, c( j, 1 ), ldc, work( 1, j ), 1 )
+              end do
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) + ...
+                              ! c( m-l+1:m, 1:n )**h * v( 1:k, 1:l )**t
+              if( l>0 )call stdlib_zgemm( 'TRANSPOSE', 'CONJUGATE TRANSPOSE', n, k, l,cone, c( m-&
+                        l+1, 1 ), ldc, v, ldv, cone, work,ldwork )
+              ! w( 1:n, 1:k ) = w( 1:n, 1:k ) * t**t  or  w( 1:m, 1:k ) * t
+              call stdlib_ztrmm( 'RIGHT', 'LOWER', transt, 'NON-UNIT', n, k, cone, t,ldt, work, &
+                        ldwork )
+              ! c( 1:k, 1:n ) = c( 1:k, 1:n ) - w( 1:n, 1:k )**h
+              do j = 1, n
+                 do i = 1, k
+                    c( i, j ) = c( i, j ) - work( j, i )
+                 end do
+              end do
+              ! c( m-l+1:m, 1:n ) = c( m-l+1:m, 1:n ) - ...
+                                  ! v( 1:k, 1:l )**h * w( 1:n, 1:k )**h
+              if( l>0 )call stdlib_zgemm( 'TRANSPOSE', 'TRANSPOSE', l, n, k, -cone, v, ldv,work, &
+                        ldwork, cone, c( m-l+1, 1 ), ldc )
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form  c * h  or  c * h**h
+              ! w( 1:m, 1:k ) = c( 1:m, 1:k )
+              do j = 1, k
+                 call stdlib_zcopy( m, c( 1, j ), 1, work( 1, j ), 1 )
+              end do
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) + ...
+                              ! c( 1:m, n-l+1:n ) * v( 1:k, 1:l )**h
+              if( l>0 )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', m, k, l, cone,c( 1, n-l+1 )&
+                        , ldc, v, ldv, cone, work, ldwork )
+              ! w( 1:m, 1:k ) = w( 1:m, 1:k ) * conjg( t )  or
+                              ! w( 1:m, 1:k ) * t**h
+              do j = 1, k
+                 call stdlib_zlacgv( k-j+1, t( j, j ), 1 )
+              end do
+              call stdlib_ztrmm( 'RIGHT', 'LOWER', trans, 'NON-UNIT', m, k, cone, t,ldt, work, &
+                        ldwork )
+              do j = 1, k
+                 call stdlib_zlacgv( k-j+1, t( j, j ), 1 )
+              end do
+              ! c( 1:m, 1:k ) = c( 1:m, 1:k ) - w( 1:m, 1:k )
+              do j = 1, k
+                 do i = 1, m
+                    c( i, j ) = c( i, j ) - work( i, j )
+                 end do
+              end do
+              ! c( 1:m, n-l+1:n ) = c( 1:m, n-l+1:n ) - ...
+                                  ! w( 1:m, 1:k ) * conjg( v( 1:k, 1:l ) )
+              do j = 1, l
+                 call stdlib_zlacgv( k, v( 1, j ), 1 )
+              end do
+              if( l>0 )call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m, l, k, -cone,work, &
+                        ldwork, v, ldv, cone, c( 1, n-l+1 ), ldc )
+              do j = 1, l
+                 call stdlib_zlacgv( k, v( 1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_zlarzb
+
+     !> ZLARZT: forms the triangular factor T of a complex block reflector
+     !> H of order > n, which is defined as a product of k elementary
+     !> reflectors.
+     !> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+     !> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
+     !> If STOREV = 'C', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th column of the array V, and
+     !> H  =  I - V * T * V**H
+     !> If STOREV = 'R', the vector which defines the elementary reflector
+     !> H(i) is stored in the i-th row of the array V, and
+     !> H  =  I - V**H * T * V
+     !> Currently, only STOREV = 'R' and DIRECT = 'B' are supported.
+
+     pure subroutine stdlib_zlarzt( direct, storev, n, k, v, ldv, tau, t, ldt )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, storev
+           integer(ilp), intent(in) :: k, ldt, ldv, n
+           ! Array Arguments 
+           complex(dp), intent(out) :: t(ldt,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(inout) :: v(ldv,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           ! Executable Statements 
+           ! check for currently supported options
+           info = 0
+           if( .not.stdlib_lsame( direct, 'B' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( storev, 'R' ) ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLARZT', -info )
+              return
+           end if
+           do i = k, 1, -1
+              if( tau( i )==czero ) then
+                 ! h(i)  =  i
+                 do j = i, k
+                    t( j, i ) = czero
+                 end do
+              else
+                 ! general case
+                 if( i<k ) then
+                    ! t(i+1:k,i) = - tau(i) * v(i+1:k,1:n) * v(i,1:n)**h
+                    call stdlib_zlacgv( n, v( i, 1 ), ldv )
+                    call stdlib_zgemv( 'NO TRANSPOSE', k-i, n, -tau( i ),v( i+1, 1 ), ldv, v( i, &
+                              1 ), ldv, czero,t( i+1, i ), 1 )
+                    call stdlib_zlacgv( n, v( i, 1 ), ldv )
+                    ! t(i+1:k,i) = t(i+1:k,i+1:k) * t(i+1:k,i)
+                    call stdlib_ztrmv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', k-i,t( i+1, i+1 ), &
+                              ldt, t( i+1, i ), 1 )
+                 end if
+                 t( i, i ) = tau( i )
+              end if
+           end do
+           return
+     end subroutine stdlib_zlarzt
+
+     !> ZLASCL: multiplies the M by N complex matrix A by the real scalar
+     !> CTO/CFROM.  This is done without over/underflow as long as the final
+     !> result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that
+     !> A may be full, upper triangular, lower triangular, upper Hessenberg,
+     !> or banded.
+
+     pure subroutine stdlib_zlascl( type, kl, ku, cfrom, cto, m, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: type
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, lda, m, n
+           real(dp), intent(in) :: cfrom, cto
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: i, itype, j, k1, k2, k3, k4
+           real(dp) :: bignum, cfrom1, cfromc, cto1, ctoc, mul, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( stdlib_lsame( type, 'G' ) ) then
+              itype = 0
+           else if( stdlib_lsame( type, 'L' ) ) then
+              itype = 1
+           else if( stdlib_lsame( type, 'U' ) ) then
+              itype = 2
+           else if( stdlib_lsame( type, 'H' ) ) then
+              itype = 3
+           else if( stdlib_lsame( type, 'B' ) ) then
+              itype = 4
+           else if( stdlib_lsame( type, 'Q' ) ) then
+              itype = 5
+           else if( stdlib_lsame( type, 'Z' ) ) then
+              itype = 6
+           else
+              itype = -1
+           end if
+           if( itype==-1 ) then
+              info = -1
+           else if( cfrom==zero .or. stdlib_disnan(cfrom) ) then
+              info = -4
+           else if( stdlib_disnan(cto) ) then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 .or. ( itype==4 .and. n/=m ) .or.( itype==5 .and. n/=m ) ) then
+              info = -7
+           else if( itype<=3 .and. lda<max( 1, m ) ) then
+              info = -9
+           else if( itype>=4 ) then
+              if( kl<0 .or. kl>max( m-1, 0 ) ) then
+                 info = -2
+              else if( ku<0 .or. ku>max( n-1, 0 ) .or.( ( itype==4 .or. itype==5 ) .and. kl/=ku ) &
+                        )then
+                 info = -3
+              else if( ( itype==4 .and. lda<kl+1 ) .or.( itype==5 .and. lda<ku+1 ) .or.( itype==6 &
+                        .and. lda<2*kl+ku+1 ) ) then
+                 info = -9
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLASCL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 )return
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           cfromc = cfrom
+           ctoc = cto
+           10 continue
+           cfrom1 = cfromc*smlnum
+           if( cfrom1==cfromc ) then
+              ! cfromc is an inf.  multiply by a correctly signed zero for
+              ! finite ctoc, or a nan if ctoc is infinite.
+              mul = ctoc / cfromc
+              done = .true.
+              cto1 = ctoc
+           else
+              cto1 = ctoc / bignum
+              if( cto1==ctoc ) then
+                 ! ctoc is either 0 or an inf.  in both cases, ctoc itself
+                 ! serves as the correct multiplication factor.
+                 mul = ctoc
+                 done = .true.
+                 cfromc = one
+              else if( abs( cfrom1 )>abs( ctoc ) .and. ctoc/=zero ) then
+                 mul = smlnum
+                 done = .false.
+                 cfromc = cfrom1
+              else if( abs( cto1 )>abs( cfromc ) ) then
+                 mul = bignum
+                 done = .false.
+                 ctoc = cto1
+              else
+                 mul = ctoc / cfromc
+                 done = .true.
+              end if
+           end if
+           if( itype==0 ) then
+              ! full matrix
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==1 ) then
+              ! lower triangular matrix
+              do j = 1, n
+                 do i = j, m
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==2 ) then
+              ! upper triangular matrix
+              do j = 1, n
+                 do i = 1, min( j, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==3 ) then
+              ! upper hessenberg matrix
+              do j = 1, n
+                 do i = 1, min( j+1, m )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==4 ) then
+              ! lower chalf of a symmetric band matrix
+              k3 = kl + 1
+              k4 = n + 1
+              do j = 1, n
+                 do i = 1, min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==5 ) then
+              ! upper chalf of a symmetric band matrix
+              k1 = ku + 2
+              k3 = ku + 1
+              do j = 1, n
+                 do i = max( k1-j, 1 ), k3
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           else if( itype==6 ) then
+              ! band matrix
+              k1 = kl + ku + 2
+              k2 = kl + 1
+              k3 = 2*kl + ku + 1
+              k4 = kl + ku + 1 + m
+              do j = 1, n
+                 do i = max( k1-j, k2 ), min( k3, k4-j )
+                    a( i, j ) = a( i, j )*mul
+                 end do
+              end do
+           end if
+           if( .not.done )go to 10
+           return
+     end subroutine stdlib_zlascl
+
+     !> ZLASET: initializes a 2-D array A to BETA on the diagonal and
+     !> ALPHA on the offdiagonals.
+
+     pure subroutine stdlib_zlaset( uplo, m, n, alpha, beta, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, m, n
+           complex(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(dp), intent(out) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! set the diagonal to beta and the strictly upper triangular
+              ! part of the array to alpha.
+              do j = 2, n
+                 do i = 1, min( j-1, m )
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else if( stdlib_lsame( uplo, 'L' ) ) then
+              ! set the diagonal to beta and the strictly lower triangular
+              ! part of the array to alpha.
+              do j = 1, min( m, n )
+                 do i = j + 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( n, m )
+                 a( i, i ) = beta
+              end do
+           else
+              ! set the array to beta on the diagonal and alpha on the
+              ! offdiagonal.
+              do j = 1, n
+                 do i = 1, m
+                    a( i, j ) = alpha
+                 end do
+              end do
+              do i = 1, min( m, n )
+                 a( i, i ) = beta
+              end do
+           end if
+           return
+     end subroutine stdlib_zlaset
+
+     !> ZLASR: applies a sequence of real plane rotations to a complex matrix
+     !> A, from either the left or the right.
+     !> When SIDE = 'L', the transformation takes the form
+     !> A := P*A
+     !> and when SIDE = 'R', the transformation takes the form
+     !> A := A*P**T
+     !> where P is an orthogonal matrix consisting of a sequence of z plane
+     !> rotations, with z = M when SIDE = 'L' and z = N when SIDE = 'R',
+     !> and P**T is the transpose of P.
+     !> When DIRECT = 'F' (Forward sequence), then
+     !> P = P(z-1) * ... * P(2) * P(1)
+     !> and when DIRECT = 'B' (Backward sequence), then
+     !> P = P(1) * P(2) * ... * P(z-1)
+     !> where P(k) is a plane rotation matrix defined by the 2-by-2 rotation
+     !> R(k) = (  c(k)  s(k) )
+     !> = ( -s(k)  c(k) ).
+     !> When PIVOT = 'V' (Variable pivot), the rotation is performed
+     !> for the plane (k,k+1), i.e., P(k) has the form
+     !> P(k) = (  1                                            )
+     !> (       ...                                     )
+     !> (              1                                )
+     !> (                   c(k)  s(k)                  )
+     !> (                  -s(k)  c(k)                  )
+     !> (                                1              )
+     !> (                                     ...       )
+     !> (                                            1  )
+     !> where R(k) appears as a rank-2 modification to the identity matrix in
+     !> rows and columns k and k+1.
+     !> When PIVOT = 'T' (Top pivot), the rotation is performed for the
+     !> plane (1,k+1), so P(k) has the form
+     !> P(k) = (  c(k)                    s(k)                 )
+     !> (         1                                     )
+     !> (              ...                              )
+     !> (                     1                         )
+     !> ( -s(k)                    c(k)                 )
+     !> (                                 1             )
+     !> (                                      ...      )
+     !> (                                             1 )
+     !> where R(k) appears in rows and columns 1 and k+1.
+     !> Similarly, when PIVOT = 'B' (Bottom pivot), the rotation is
+     !> performed for the plane (k,z), giving P(k) the form
+     !> P(k) = ( 1                                             )
+     !> (      ...                                      )
+     !> (             1                                 )
+     !> (                  c(k)                    s(k) )
+     !> (                         1                     )
+     !> (                              ...              )
+     !> (                                     1         )
+     !> (                 -s(k)                    c(k) )
+     !> where R(k) appears in rows and columns k and z.  The rotations are
+     !> performed without ever forming P(k) explicitly.
+
+     pure subroutine stdlib_zlasr( side, pivot, direct, m, n, c, s, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, pivot, side
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(in) :: c(*), s(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j
+           real(dp) :: ctemp, stemp
+           complex(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( .not.( stdlib_lsame( side, 'L' ) .or. stdlib_lsame( side, 'R' ) ) ) then
+              info = 1
+           else if( .not.( stdlib_lsame( pivot, 'V' ) .or. stdlib_lsame( pivot,'T' ) .or. &
+                     stdlib_lsame( pivot, 'B' ) ) ) then
+              info = 2
+           else if( .not.( stdlib_lsame( direct, 'F' ) .or. stdlib_lsame( direct, 'B' ) ) )&
+                     then
+              info = 3
+           else if( m<0 ) then
+              info = 4
+           else if( n<0 ) then
+              info = 5
+           else if( lda<max( 1, m ) ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLASR ', info )
+              return
+           end if
+           ! quick return if possible
+           if( ( m==0 ) .or. ( n==0 ) )return
+           if( stdlib_lsame( side, 'L' ) ) then
+              ! form  p * a
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j+1, i )
+                             a( j+1, i ) = ctemp*temp - stemp*a( j, i )
+                             a( j, i ) = stemp*temp + ctemp*a( j, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, m
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = ctemp*temp - stemp*a( 1, i )
+                             a( 1, i ) = stemp*temp + ctemp*a( 1, i )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, m - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = m - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, n
+                             temp = a( j, i )
+                             a( j, i ) = stemp*a( m, i ) + ctemp*temp
+                             a( m, i ) = ctemp*a( m, i ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           else if( stdlib_lsame( side, 'R' ) ) then
+              ! form a * p**t
+              if( stdlib_lsame( pivot, 'V' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j+1 )
+                             a( i, j+1 ) = ctemp*temp - stemp*a( i, j )
+                             a( i, j ) = stemp*temp + ctemp*a( i, j )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'T' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 2, n
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n, 2, -1
+                       ctemp = c( j-1 )
+                       stemp = s( j-1 )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = ctemp*temp - stemp*a( i, 1 )
+                             a( i, 1 ) = stemp*temp + ctemp*a( i, 1 )
+                          end do
+                       end if
+                    end do
+                 end if
+              else if( stdlib_lsame( pivot, 'B' ) ) then
+                 if( stdlib_lsame( direct, 'F' ) ) then
+                    do j = 1, n - 1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 else if( stdlib_lsame( direct, 'B' ) ) then
+                    do j = n - 1, 1, -1
+                       ctemp = c( j )
+                       stemp = s( j )
+                       if( ( ctemp/=one ) .or. ( stemp/=zero ) ) then
+                          do i = 1, m
+                             temp = a( i, j )
+                             a( i, j ) = stemp*a( i, n ) + ctemp*temp
+                             a( i, n ) = ctemp*a( i, n ) - stemp*temp
+                          end do
+                       end if
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zlasr
+
+     !> !
+     !>
+     !> ZLASSQ:  returns the values  scl  and  smsq  such that
+     !> ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+     !> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+     !> assumed to be non-negative.
+     !> scale and sumsq must be supplied in SCALE and SUMSQ and
+     !> scl and smsq are overwritten on SCALE and SUMSQ respectively.
+     !> If scale * sqrt( sumsq ) > tbig then
+     !> we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
+     !> and if 0 < scale * sqrt( sumsq ) < tsml then
+     !> we require:   scale <= sqrt( HUGE ) / ssml       on entry,
+     !> where
+     !> tbig -- upper threshold for values whose square is representable;
+     !> sbig -- scaling constant for big numbers; \see la_constants.f90
+     !> tsml -- lower threshold for values whose square is representable;
+     !> ssml -- scaling constant for small numbers; \see la_constants.f90
+     !> and
+     !> TINY*EPS -- tiniest representable number;
+     !> HUGE     -- biggest representable number.
+
+     pure subroutine stdlib_zlassq( n, x, incx, scl, sumsq )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+        ! Scalar Arguments 
+     integer(ilp), intent(in) :: incx, n
+        real(dp), intent(inout) :: scl, sumsq
+        ! Array Arguments 
+        complex(dp), intent(in) :: x(*)
+        ! Local Scalars 
+     integer(ilp) :: i, ix
+     logical(lk) :: notbig
+        real(dp) :: abig, amed, asml, ax, ymax, ymin
+        ! quick return if possible
+        if( ieee_is_nan(scl) .or. ieee_is_nan(sumsq) ) return
+        if( sumsq == zero ) scl = one
+        if( scl == zero ) then
+           scl = one
+           sumsq = zero
+        end if
+        if (n <= 0) then
+           return
+        end if
+        ! compute the sum of squares in 3 accumulators:
+           ! abig -- sums of squares scaled down to avoid overflow
+           ! asml -- sums of squares scaled up to avoid underflow
+           ! amed -- sums of squares that do not require scaling
+        ! the thresholds and multipliers are
+           ! tbig -- values bigger than this are scaled down by sbig
+           ! tsml -- values smaller than this are scaled up by ssml
+        notbig = .true.
+        asml = zero
+        amed = zero
+        abig = zero
+        ix = 1
+        if( incx < 0 ) ix = 1 - (n-1)*incx
+        do i = 1, n
+           ax = abs(real(x(ix),KIND=dp))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ax = abs(aimag(x(ix)))
+           if (ax > tbig) then
+              abig = abig + (ax*sbig)**2
+              notbig = .false.
+           else if (ax < tsml) then
+              if (notbig) asml = asml + (ax*ssml)**2
+           else
+              amed = amed + ax**2
+           end if
+           ix = ix + incx
+        end do
+        ! put the existing sum of squares into one of the accumulators
+        if( sumsq > zero ) then
+           ax = scl*sqrt( sumsq )
+           if (ax > tbig) then
+              ! we assume scl >= sqrt( tiny*eps ) / sbig
+              abig = abig + (scl*sbig)**2 * sumsq
+           else if (ax < tsml) then
+              ! we assume scl <= sqrt( huge ) / ssml
+              if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+           else
+              amed = amed + scl**2 * sumsq
+           end if
+        end if
+        ! combine abig and amed or amed and asml if more than one
+        ! accumulator was used.
+        if (abig > zero) then
+           ! combine abig and amed if abig > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              abig = abig + (amed*sbig)*sbig
+           end if
+           scl = one / sbig
+           sumsq = abig
+        else if (asml > zero) then
+           ! combine amed and asml if asml > 0.
+           if (amed > zero .or. ieee_is_nan(amed)) then
+              amed = sqrt(amed)
+              asml = sqrt(asml) / ssml
+              if (asml > amed) then
+                 ymin = amed
+                 ymax = asml
+              else
+                 ymin = asml
+                 ymax = amed
+              end if
+              scl = one
+              sumsq = ymax**2*( one + (ymin/ymax)**2 )
+           else
+              scl = one / ssml
+              sumsq = asml
+           end if
+        else
+           ! otherwise all values are mid-range or zero
+           scl = one
+           sumsq = amed
+        end if
+        return
+     end subroutine stdlib_zlassq
+
+     !> ZLASWP: performs a series of row interchanges on the matrix A.
+     !> One row interchange is initiated for each of rows K1 through K2 of A.
+
+     pure subroutine stdlib_zlaswp( n, a, lda, k1, k2, ipiv, incx )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, k1, k2, lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, i1, i2, inc, ip, ix, ix0, j, k, n32
+           complex(dp) :: temp
+           ! Executable Statements 
+           ! interchange row i with row ipiv(k1+(i-k1)*abs(incx)) for each of rows
+           ! k1 through k2.
+           if( incx>0 ) then
+              ix0 = k1
+              i1 = k1
+              i2 = k2
+              inc = 1
+           else if( incx<0 ) then
+              ix0 = k1 + ( k1-k2 )*incx
+              i1 = k2
+              i2 = k1
+              inc = -1
+           else
+              return
+           end if
+           n32 = ( n / 32 )*32
+           if( n32/=0 ) then
+              do j = 1, n32, 32
+                 ix = ix0
+                 do i = i1, i2, inc
+                    ip = ipiv( ix )
+                    if( ip/=i ) then
+                       do k = j, j + 31
+                          temp = a( i, k )
+                          a( i, k ) = a( ip, k )
+                          a( ip, k ) = temp
+                       end do
+                    end if
+                    ix = ix + incx
+                 end do
+              end do
+           end if
+           if( n32/=n ) then
+              n32 = n32 + 1
+              ix = ix0
+              do i = i1, i2, inc
+                 ip = ipiv( ix )
+                 if( ip/=i ) then
+                    do k = n32, n
+                       temp = a( i, k )
+                       a( i, k ) = a( ip, k )
+                       a( ip, k ) = temp
+                    end do
+                 end if
+                 ix = ix + incx
+              end do
+           end if
+           return
+     end subroutine stdlib_zlaswp
+
+     !> ZLASYF: computes a partial factorization of a complex symmetric matrix
+     !> A using the Bunch-Kaufman diagonal pivoting method. The partial
+     !> factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) ( D    0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) ( 0   A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> Note that U**T denotes the transpose of U.
+     !> ZLASYF is an auxiliary routine called by ZSYTRF. It uses blocked code
+     !> (calling Level 3 BLAS) to update the submatrix A11 (if UPLO = 'U') or
+     !> A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlasyf( uplo, n, nb, kb, a, lda, ipiv, w, ldw, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: imax, j, jb, jj, jmax, jp, k, kk, kkw, kp, kstep, kw
+           real(dp) :: absakk, alpha, colmax, rowmax
+           complex(dp) :: d11, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              ! kw is the column of w which corresponds to column k of a
+              k = n
+              10 continue
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              ! copy column k of a to column kw of w and update it
+              call stdlib_zcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ), lda,w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column kw-1 of w and update it
+                    call stdlib_zcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                    call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                              
+                    if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda, w(&
+                               imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ), 1 )
+                    rowmax = cabs1( w( jmax, kw-1 ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, kw-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, kw-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column kw-1 of w to column kw of w
+                       call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kkw of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k-1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_zcopy( kk-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    if( kp>1 )call stdlib_zcopy( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last k+1 to n columns of a
+                    ! (columns k (or k and k-1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in last kkw to nb columns of w.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(kw) = u(k)*d(k),
+                    ! where u(k) is the k-th column of u
+                    ! store subdiag. elements of column u(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! note: diagonal element u(k,k) is a unit element
+                    ! and not stored.
+                       ! a(k,k) := d(k,k) = w(k,kw)
+                       ! a(1:k-1,k) := u(1:k-1,k) = w(1:k-1,kw)/d(k,k)
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    r1 = cone / a( k, k )
+                    call stdlib_zscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now hold
+                    ! ( w(kw-1) w(kw) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! store u(1:k-2,k-1) and u(1:k-2,k) and 2-by-2
+                    ! block d(k-1:k,k-1:k) in columns k-1 and k of a.
+                    ! note: 2-by-2 diagonal block u(k-1:k,k-1:k) is a unit
+                    ! block and not stored.
+                       ! a(k-1:k,k-1:k) := d(k-1:k,k-1:k) = w(k-1:k,kw-1:kw)
+                       ! a(1:k-2,k-1:k) := u(1:k-2,k:k-1:k) =
+                       ! = w(1:k-2,kw-1:kw) * ( d(k-1:k,k-1:k)**(-1) )
+                    if( k>2 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(kw-1) w(kw) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k-1, kw )
+                       d11 = w( k, kw ) / d21
+                       d22 = w( k-1, kw-1 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       ! update elements in columns a(k-1) and a(k) as
+                       ! dot products of rows of ( w(kw-1) w(kw) ) and columns
+                       ! of d**(-1)
+                       do j = 1, k - 2
+                          a( j, k-1 ) = d21*( d11*w( j, kw-1 )-w( j, kw ) )
+                          a( j, k ) = d21*( d22*w( j, kw )-w( j, kw-1 ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb, n-k,-cone, a( 1, k+1 ), &
+                           lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n looping backwards from k+1 to n
+              j = k + 1
+              60 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j + 1
+                 end if
+                 ! (note: here, j is used to determine row length. length n-j+1
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j + 1
+                 if( jp/=jj .and. j<=n )call stdlib_zswap( n-j+1, a( jp, j ), lda, a( jj, j ), &
+                           lda )
+              if( j<n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              ! copy column k of a to column k of w and update it
+              call stdlib_zcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ), lda,w( k, 1 ), ldw,&
+                         cone, w( k, k ), 1 )
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! copy column imax to column k+1 of w and update it
+                    call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1 )
+                    call stdlib_zcopy( n-imax+1, a( imax, imax ), 1, w( imax, k+1 ),1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( imax, &
+                              1 ), ldw, cone, w( k, k+1 ),1 )
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                    rowmax = cabs1( w( jmax, k+1 ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( w( jmax, k+1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( w( imax, k+1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                       ! copy column k+1 of w to column k of w
+                       call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 ! ============================================================
+                 ! kk is the column of a where pivoting step stopped
+                 kk = k + kstep - 1
+                 ! interchange rows and columns kp and kk.
+                 ! updated column kp is already stored in column kk of w.
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp of submatrix a
+                    ! at step k. no need to copy element into column k
+                    ! (or k and k+1 for 2-by-2 pivot) of a, since these columns
+                    ! will be later overwritten.
+                    a( kp, kp ) = a( kk, kk )
+                    call stdlib_zcopy( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    if( kp<n )call stdlib_zcopy( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    ! interchange rows kk and kp in first k-1 columns of a
+                    ! (columns k (or k and k+1 for 2-by-2 pivot) of a will be
+                    ! later overwritten). interchange rows kk and kp
+                    ! in first kk columns of w.
+                    if( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k),
+                    ! where l(k) is the k-th column of l
+                    ! store subdiag. elements of column l(k)
+                    ! and 1-by-1 block d(k) in column k of a.
+                    ! (note: diagonal element l(k,k) is a unit element
+                    ! and not stored)
+                       ! a(k,k) := d(k,k) = w(k,k)
+                       ! a(k+1:n,k) := l(k+1:n,k) = w(k+1:n,k)/d(k,k)
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       r1 = cone / a( k, k )
+                       call stdlib_zscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! store l(k+2:n,k) and l(k+2:n,k+1) and 2-by-2
+                    ! block d(k:k+1,k:k+1) in columns k and k+1 of a.
+                    ! (note: 2-by-2 diagonal block l(k:k+1,k:k+1) is a unit
+                    ! block and not stored)
+                       ! a(k:k+1,k:k+1) := d(k:k+1,k:k+1) = w(k:k+1,k:k+1)
+                       ! a(k+2:n,k:k+1) := l(k+2:n,k:k+1) =
+                       ! = w(k+2:n,k:k+1) * ( d(k:k+1,k:k+1)**(-1) )
+                    if( k<n-1 ) then
+                       ! compose the columns of the inverse of 2-by-2 pivot
+                       ! block d in the following way to reduce the number
+                       ! of flops when we myltiply panel ( w(k) w(k+1) ) by
+                       ! this inverse
+                       ! d**(-1) = ( d11 d21 )**(-1) =
+                                 ! ( d21 d22 )
+                       ! = 1/(d11*d22-d21**2) * ( ( d22 ) (-d21 ) ) =
+                                              ! ( (-d21 ) ( d11 ) )
+                       ! = 1/d21 * 1/((d11/d21)*(d22/d21)-1) *
+                         ! * ( ( d22/d21 ) (      -1 ) ) =
+                           ! ( (      -1 ) ( d11/d21 ) )
+                       ! = 1/d21 * 1/(d22*d11-1) * ( ( d11 ) (  -1 ) ) =
+                                                 ! ( ( -1  ) ( d22 ) )
+                       ! = 1/d21 * t * ( ( d11 ) (  -1 ) )
+                                     ! ( (  -1 ) ( d22 ) )
+                       ! = d21 * ( ( d11 ) (  -1 ) )
+                               ! ( (  -1 ) ( d22 ) )
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       ! update elements in columns a(k) and a(k+1) as
+                       ! dot products of rows of ( w(k) w(k+1) ) and columns
+                       ! of d**(-1)
+                       do j = k + 2, n
+                          a( j, k ) = d21*( d11*w( j, k )-w( j, k+1 ) )
+                          a( j, k+1 ) = d21*( d22*w( j, k+1 )-w( j, k ) )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! of rows in columns 1:k-1 looping backwards from k-1 to 1
+              j = k - 1
+              120 continue
+                 ! undo the interchanges (if any) of rows jj and jp at each
+                 ! step j
+                 ! (here, j is a diagonal index)
+                 jj = j
+                 jp = ipiv( j )
+                 if( jp<0 ) then
+                    jp = -jp
+                    ! (here, j is a diagonal index)
+                    j = j - 1
+                 end if
+                 ! (note: here, j is used to determine row length. length j
+                 ! of the rows to swap back doesn't include diagonal element)
+                 j = j - 1
+                 if( jp/=jj .and. j>=1 )call stdlib_zswap( j, a( jp, 1 ), lda, a( jj, 1 ), lda )
+                           
+              if( j>1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlasyf
+
+     !> ZLASYF_RK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman (rook) diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L',
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLASYF_RK is an auxiliary routine called by ZSYTRF_RK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlasyf_rk( uplo, n, nb, kb, a, lda, e, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*), w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, k, kk, kw, kkw, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, rowmax, sfmin, dtemp
+           complex(dp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_zcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_zcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_zcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_zcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_zswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_zswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_zcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_zcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_zswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_zscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy superdiagonal element of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = czero
+                    a( k, k ) = w( k, kw )
+                    e( k ) = w( k-1, kw )
+                    e( k-1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ),ldw, cone, a( 1, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_zcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_zcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_zcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_zcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_zswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_zswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_zcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_zcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_zswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_zscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy diagonal elements of d(k) to a,
+                    ! copy subdiagonal element of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = czero
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                    e( k ) = w( k+1, k )
+                    e( k+1 ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ),ldw, cone, a( j+jb, j ), lda )
+              end do
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlasyf_rk
+
+     !> ZLASYF_ROOK: computes a partial factorization of a complex symmetric
+     !> matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method. The partial factorization has the form:
+     !> A  =  ( I  U12 ) ( A11  0  ) (  I       0    )  if UPLO = 'U', or:
+     !> ( 0  U22 ) (  0   D  ) ( U12**T U22**T )
+     !> A  =  ( L11  0 ) (  D   0  ) ( L11**T L21**T )  if UPLO = 'L'
+     !> ( L21  I ) (  0  A22 ) (  0       I    )
+     !> where the order of D is at most NB. The actual order is returned in
+     !> the argument KB, and is either NB or NB-1, or N if N <= NB.
+     !> ZLASYF_ROOK is an auxiliary routine called by ZSYTRF_ROOK. It uses
+     !> blocked code (calling Level 3 BLAS) to update the submatrix
+     !> A11 (if UPLO = 'U') or A22 (if UPLO = 'L').
+
+     pure subroutine stdlib_zlasyf_rook( uplo, n, nb, kb, a, lda, ipiv, w, ldw,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, kb
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: w(ldw,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: done
+           integer(ilp) :: imax, itemp, j, jb, jj, jmax, jp1, jp2, k, kk, kw, kkw, kp, kstep, p, &
+                     ii
+           real(dp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(dp) :: d11, d12, d21, d22, r1, t, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt,aimag,real
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           info = 0
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! factorize the trailing columns of a using the upper triangle
+              ! of a and working backwards, and compute the matrix w = u12*d
+              ! for use in updating a11
+              ! k is the main loop index, decreasing from n in steps of 1 or 2
+              k = n
+              10 continue
+              ! kw is the column of w which corresponds to column k of a
+              kw = nb + k - n
+              ! exit from loop
+              if( ( k<=n-nb+1 .and. nb<n ) .or. k<1 )go to 30
+              kstep = 1
+              p = k
+              ! copy column k of a to column kw of w and update it
+              call stdlib_zcopy( k, a( 1, k ), 1, w( 1, kw ), 1 )
+              if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone, a( 1, k+1 ),lda, w( k, &
+                        kw+1 ), ldw, cone, w( 1, kw ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, kw ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, w( 1, kw ), 1 )
+                 colmax = cabs1( w( imax, kw ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column kw-1 of w and update it
+                       call stdlib_zcopy( imax, a( 1, imax ), 1, w( 1, kw-1 ), 1 )
+                       call stdlib_zcopy( k-imax, a( imax, imax+1 ), lda,w( imax+1, kw-1 ), 1 )
+                                 
+                       if( k<n )call stdlib_zgemv( 'NO TRANSPOSE', k, n-k, -cone,a( 1, k+1 ), lda,&
+                                  w( imax, kw+1 ), ldw,cone, w( 1, kw-1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, w( imax+1, kw-1 ),1 )
+                          rowmax = cabs1( w( jmax, kw-1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, w( 1, kw-1 ), 1 )
+                          dtemp = cabs1( w( itemp, kw-1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, kw-1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.(cabs1( w( imax, kw-1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column kw-1 of w to column kw of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k-1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( k, w( 1, kw-1 ), 1, w( 1, kw ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! ============================================================
+                 kk = k - kstep + 1
+                 ! kkw is the column of w which corresponds to column kk of a
+                 kkw = nb + kk - n
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_zcopy( k-p, a( p+1, k ), 1, a( p, p+1 ), lda )
+                    call stdlib_zcopy( p, a( 1, k ), 1, a( 1, p ), 1 )
+                    ! interchange rows k and p in last n-k+1 columns of a
+                    ! and last n-k+2 columns of w
+                    call stdlib_zswap( n-k+1, a( k, k ), lda, a( p, k ), lda )
+                    call stdlib_zswap( n-kk+1, w( k, kkw ), ldw, w( p, kkw ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kkw of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_zcopy( k-1-kp, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    call stdlib_zcopy( kp, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    ! interchange rows kk and kp in last n-kk+1 columns
+                    ! of a and w
+                    call stdlib_zswap( n-kk+1, a( kk, kk ), lda, a( kp, kk ), lda )
+                    call stdlib_zswap( n-kk+1, w( kk, kkw ), ldw, w( kp, kkw ),ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column kw of w now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! store u(k) in column k of a
+                    call stdlib_zcopy( k, w( 1, kw ), 1, a( 1, k ), 1 )
+                    if( k>1 ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_zscal( k-1, r1, a( 1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns kw and kw-1 of w now
+                    ! hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    if( k>2 ) then
+                       ! store u(k) and u(k-1) in columns k and k-1 of a
+                       d12 = w( k-1, kw )
+                       d11 = w( k, kw ) / d12
+                       d22 = w( k-1, kw-1 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = 1, k - 2
+                          a( j, k-1 ) = t*( (d11*w( j, kw-1 )-w( j, kw ) ) /d12 )
+                          a( j, k ) = t*( ( d22*w( j, kw )-w( j, kw-1 ) ) /d12 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k-1, k-1 ) = w( k-1, kw-1 )
+                    a( k-1, k ) = w( k-1, kw )
+                    a( k, k ) = w( k, kw )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              30 continue
+              ! update the upper triangle of a11 (= a(1:k,1:k)) as
+              ! a11 := a11 - u12*d*u12**t = a11 - u12*w**t
+              ! computing blocks of nb columns at a time
+              do j = ( ( k-1 ) / nb )*nb + 1, 1, -nb
+                 jb = min( nb, k-j+1 )
+                 ! update the upper triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', jj-j+1, n-k, -cone,a( j, k+1 ), lda, w( jj,&
+                               kw+1 ), ldw, cone,a( j, jj ), 1 )
+                 end do
+                 ! update the rectangular superdiagonal block
+                 if( j>=2 )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', j-1, jb,n-k, -cone, a( &
+                           1, k+1 ), lda, w( j, kw+1 ), ldw,cone, a( 1, j ), lda )
+              end do
+              ! put u12 in standard form by partially undoing the interchanges
+              ! in columns k+1:n
+              j = k + 1
+              60 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j + 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j + 1
+                 if( jp2/=jj .and. j<=n )call stdlib_zswap( n-j+1, a( jp2, j ), lda, a( jj, j ), &
+                           lda )
+                 jj = j - 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_zswap( n-j+1, a( jp1, j ), lda, a( jj, j &
+                           ), lda )
+              if( j<=n )go to 60
+              ! set kb to the number of columns factorized
+              kb = n - k
+           else
+              ! factorize the leading columns of a using the lower triangle
+              ! of a and working forwards, and compute the matrix w = l21*d
+              ! for use in updating a22
+              ! k is the main loop index, increasing from 1 in steps of 1 or 2
+              k = 1
+              70 continue
+              ! exit from loop
+              if( ( k>=nb .and. nb<n ) .or. k>n )go to 90
+              kstep = 1
+              p = k
+              ! copy column k of a to column k of w and update it
+              call stdlib_zcopy( n-k+1, a( k, k ), 1, w( k, k ), 1 )
+              if( k>1 )call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone, a( k, 1 ),lda, w( k, &
+                        1 ), ldw, cone, w( k, k ), 1 )
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( w( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, w( k+1, k ), 1 )
+                 colmax = cabs1( w( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+              else
+                 ! ============================================================
+                 ! test for interchange
+                 ! equivalent to testing for absakk>=alpha*colmax
+                 ! (used to handle nan and inf)
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    72 continue
+                       ! begin pivot search loop body
+                       ! copy column imax to column k+1 of w and update it
+                       call stdlib_zcopy( imax-k, a( imax, k ), lda, w( k, k+1 ), 1)
+                       call stdlib_zcopy( n-imax+1, a( imax, imax ), 1,w( imax, k+1 ), 1 )
+                       if( k>1 )call stdlib_zgemv( 'NO TRANSPOSE', n-k+1, k-1, -cone,a( k, 1 ), &
+                                 lda, w( imax, 1 ), ldw,cone, w( k, k+1 ), 1 )
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, w( k, k+1 ), 1 )
+                          rowmax = cabs1( w( jmax, k+1 ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, w( imax+1, k+1 ), 1)
+                          dtemp = cabs1( w( itemp, k+1 ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for
+                       ! cabs1( w( imax, k+1 ) )>=alpha*rowmax
+                       ! (used to handle nan and inf)
+                       if( .not.( cabs1( w( imax, k+1 ) )<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          ! copy column k+1 of w to column k of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                          done = .true.
+                       ! equivalent to testing for rowmax==colmax,
+                       ! (used to handle nan and inf)
+                       else if( ( p==jmax ) .or. ( rowmax<=colmax ) )then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found: set params and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                          ! copy updated jmaxth (next imaxth) column to kth of w
+                          call stdlib_zcopy( n-k+1, w( k, k+1 ), 1, w( k, k ), 1 )
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 72
+                 end if
+                 ! ============================================================
+                 kk = k + kstep - 1
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! copy non-updated column k to column p
+                    call stdlib_zcopy( p-k, a( k, k ), 1, a( p, k ), lda )
+                    call stdlib_zcopy( n-p+1, a( p, k ), 1, a( p, p ), 1 )
+                    ! interchange rows k and p in first k columns of a
+                    ! and first k+1 columns of w
+                    call stdlib_zswap( k, a( k, 1 ), lda, a( p, 1 ), lda )
+                    call stdlib_zswap( kk, w( k, 1 ), ldw, w( p, 1 ), ldw )
+                 end if
+                 ! updated column kp is already stored in column kk of w
+                 if( kp/=kk ) then
+                    ! copy non-updated column kk to column kp
+                    a( kp, k ) = a( kk, k )
+                    call stdlib_zcopy( kp-k-1, a( k+1, kk ), 1, a( kp, k+1 ), lda )
+                    call stdlib_zcopy( n-kp+1, a( kp, kk ), 1, a( kp, kp ), 1 )
+                    ! interchange rows kk and kp in first kk columns of a and w
+                    call stdlib_zswap( kk, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                    call stdlib_zswap( kk, w( kk, 1 ), ldw, w( kp, 1 ), ldw )
+                 end if
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k of w now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    ! store l(k) in column k of a
+                    call stdlib_zcopy( n-k+1, w( k, k ), 1, a( k, k ), 1 )
+                    if( k<n ) then
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          r1 = cone / a( k, k )
+                          call stdlib_zscal( n-k, r1, a( k+1, k ), 1 )
+                       else if( a( k, k )/=czero ) then
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / a( k, k )
+                          end do
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 of w now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! store l(k) and l(k+1) in columns k and k+1 of a
+                       d21 = w( k+1, k )
+                       d11 = w( k+1, k+1 ) / d21
+                       d22 = w( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          a( j, k ) = t*( ( d11*w( j, k )-w( j, k+1 ) ) /d21 )
+                          a( j, k+1 ) = t*( ( d22*w( j, k+1 )-w( j, k ) ) /d21 )
+                       end do
+                    end if
+                    ! copy d(k) to a
+                    a( k, k ) = w( k, k )
+                    a( k+1, k ) = w( k+1, k )
+                    a( k+1, k+1 ) = w( k+1, k+1 )
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 70
+              90 continue
+              ! update the lower triangle of a22 (= a(k:n,k:n)) as
+              ! a22 := a22 - l21*d*l21**t = a22 - l21*w**t
+              ! computing blocks of nb columns at a time
+              do j = k, n, nb
+                 jb = min( nb, n-j+1 )
+                 ! update the lower triangle of the diagonal block
+                 do jj = j, j + jb - 1
+                    call stdlib_zgemv( 'NO TRANSPOSE', j+jb-jj, k-1, -cone,a( jj, 1 ), lda, w( jj,&
+                               1 ), ldw, cone,a( jj, jj ), 1 )
+                 end do
+                 ! update the rectangular subdiagonal block
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE', n-j-jb+1, jb,k-1, -&
+                           cone, a( j+jb, 1 ), lda, w( j, 1 ), ldw,cone, a( j+jb, j ), lda )
+              end do
+              ! put l21 in standard form by partially undoing the interchanges
+              ! in columns 1:k-1
+              j = k - 1
+              120 continue
+                 kstep = 1
+                 jp1 = 1
+                 jj = j
+                 jp2 = ipiv( j )
+                 if( jp2<0 ) then
+                    jp2 = -jp2
+                    j = j - 1
+                    jp1 = -ipiv( j )
+                    kstep = 2
+                 end if
+                 j = j - 1
+                 if( jp2/=jj .and. j>=1 )call stdlib_zswap( j, a( jp2, 1 ), lda, a( jj, 1 ), lda )
+                           
+                 jj = j + 1
+                 if( jp1/=jj .and. kstep==2 )call stdlib_zswap( j, a( jp1, 1 ), lda, a( jj, 1 ), &
+                           lda )
+              if( j>=1 )go to 120
+              ! set kb to the number of columns factorized
+              kb = k - 1
+           end if
+           return
+     end subroutine stdlib_zlasyf_rook
+
+     !> ZLAT2C: converts a COMPLEX*16 triangular matrix, SA, to a COMPLEX
+     !> triangular matrix, A.
+     !> RMAX is the overflow for the SINGLE PRECISION arithmetic
+     !> ZLAT2C checks that all the entries of A are between -RMAX and
+     !> RMAX. If not the conversion is aborted and a flag is raised.
+     !> This is an auxiliary routine so there is no argument checking.
+
+     pure subroutine stdlib_zlat2c( uplo, n, a, lda, sa, ldsa, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldsa, n
+           ! Array Arguments 
+           complex(sp), intent(out) :: sa(ldsa,*)
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: rmax
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,aimag
+           ! Executable Statements 
+           rmax = stdlib_slamch( 'O' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    if( ( real( a( i, j ),KIND=dp)<-rmax ) .or.( real( a( i, j ),KIND=dp)>rmax ) &
+                    .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) &
+                              then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    if( ( real( a( i, j ),KIND=dp)<-rmax ) .or.( real( a( i, j ),KIND=dp)>rmax ) &
+                    .or.( aimag( a( i, j ) )<-rmax ) .or.( aimag( a( i, j ) )>rmax ) ) &
+                              then
+                       info = 1
+                       go to 50
+                    end if
+                    sa( i, j ) = a( i, j )
+                 end do
+              end do
+           end if
+           50 continue
+           return
+     end subroutine stdlib_zlat2c
+
+     !> ZLATBS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular band matrix.  Here A**T denotes the transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine ZTBSV is called.  If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_zlatbs( uplo, trans, diag, normin, n, kd, ab, ldab, x,scale, cnorm, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(inout) :: cnorm(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast, jlen, maind
+           real(dp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(dp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=dp) / 2._dp ) +abs( aimag( zdum ) / 2._dp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( kd<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATBS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    jlen = min( kd, j-1 )
+                    cnorm( j ) = stdlib_dzasum( jlen, ab( kd+1-jlen, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n
+                    jlen = min( kd, n-j )
+                    if( jlen>0 ) then
+                       cnorm( j ) = stdlib_dzasum( jlen, ab( 2, j ), 1 )
+                    else
+                       cnorm( j ) = zero
+                    end if
+                 end do
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ztbsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = kd + 1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+                 maind = kd + 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+                 maind = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ab( maind, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ztbsv( uplo, trans, diag, n, kd, ab, ldab, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_zdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ab( maind, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_zdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(max(1,j-kd):j-1) := x(max(1,j-kd):j-1) -
+                                                   ! x(j)* a(max(1,j-kd):j-1,j)
+                          jlen = min( kd, j-1 )
+                          call stdlib_zaxpy( jlen, -x( j )*tscal,ab( kd+1-jlen, j ), 1, x( j-jlen &
+                                    ), 1 )
+                          i = stdlib_izamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else if( j<n ) then
+                       ! compute the update
+                          ! x(j+1:min(j+kd,n)) := x(j+1:min(j+kd,n)) -
+                                                ! x(j) * a(j+1:min(j+kd,n),j)
+                       jlen = min( kd, n-j )
+                       if( jlen>0 )call stdlib_zaxpy( jlen, -x( j )*tscal, ab( 2, j ), 1,x( j+1 ),&
+                                  1 )
+                       i = j + stdlib_izamax( n-j, x( j+1 ), 1 )
+                       xmax = cabs1( x( i ) )
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotu to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_zdotu( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_zdotu( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( kd+i-jlen, j )*uscal )*x( j-jlen-1+i )
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( ab( i+1, j )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ab( maind, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotc to perform the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          csumj = stdlib_zdotc( jlen, ab( kd+1-jlen, j ), 1,x( j-jlen ), 1 )
+                                    
+                       else
+                          jlen = min( kd, n-j )
+                          if( jlen>1 )csumj = stdlib_zdotc( jlen, ab( 2, j ), 1, x( j+1 ),1 )
+                                    
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          jlen = min( kd, j-1 )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( kd+i-jlen, j ) )*uscal )*x( j-jlen-1+i )
+                                       
+                          end do
+                       else
+                          jlen = min( kd, n-j )
+                          do i = 1, jlen
+                             csumj = csumj + ( conjg( ab( i+1, j ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ab( maind, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_zlatbs
+
+     !> ZLATPS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow, where A is an upper or lower
+     !> triangular matrix stored in packed form.  Here A**T denotes the
+     !> transpose of A, A**H denotes the conjugate transpose of A, x and b
+     !> are n-element vectors, and s is a scaling factor, usually less than
+     !> or equal to 1, chosen so that the components of x will be less than
+     !> the overflow threshold.  If the unscaled problem will not cause
+     !> overflow, the Level 2 BLAS routine ZTPSV is called. If the matrix A
+     !> is singular (A(j,j) = 0 for some j), then s is set to 0 and a
+     !> non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_zlatps( uplo, trans, diag, normin, n, ap, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(inout) :: cnorm(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, ip, j, jfirst, jinc, jlast, jlen
+           real(dp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(dp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=dp) / 2._dp ) +abs( aimag( zdum ) / 2._dp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATPS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 ip = 1
+                 do j = 1, n
+                    cnorm( j ) = stdlib_dzasum( j-1, ap( ip ), 1 )
+                    ip = ip + j
+                 end do
+              else
+                 ! a is lower triangular.
+                 ip = 1
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_dzasum( n-j, ap( ip+1 ), 1 )
+                    ip = ip + n - j + 1
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ztpsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = n
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                    ip = ip + jinc*jlen
+                    jlen = jlen - 1
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = ap( ip )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ztpsv( uplo, trans, diag, n, ap, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_zdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = ap( ip )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_zdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_zaxpy( j-1, -x( j )*tscal, ap( ip-j+1 ), 1, x,1 )
+                          i = stdlib_izamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip - j
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_zaxpy( n-j, -x( j )*tscal, ap( ip+1 ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_izamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                       ip = ip + n - j + 1
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_zdotu( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_zdotu( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( ap( ip-j+i )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( ap( ip+i )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = ap( ip )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 ip = jfirst*( jfirst+1 ) / 2
+                 jlen = 1
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_zdotc( j-1, ap( ip-j+1 ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_zdotc( n-j, ap( ip+1 ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( ap( ip-j+i ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = 1, n - j
+                             csumj = csumj + ( conjg( ap( ip+i ) )*uscal )*x( j+i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                          tjjs = conjg( ap( ip ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                    jlen = jlen + 1
+                    ip = ip + jinc*jlen
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_zlatps
+
+     !> ZLATRD: reduces NB rows and columns of a complex Hermitian matrix A to
+     !> Hermitian tridiagonal form by a unitary similarity
+     !> transformation Q**H * A * Q, and returns the matrices V and W which are
+     !> needed to apply the transformation to the unreduced part of A.
+     !> If UPLO = 'U', ZLATRD reduces the last NB rows and columns of a
+     !> matrix, of which the upper triangle is supplied;
+     !> if UPLO = 'L', ZLATRD reduces the first NB rows and columns of a
+     !> matrix, of which the lower triangle is supplied.
+     !> This is an auxiliary routine called by ZHETRD.
+
+     pure subroutine stdlib_zlatrd( uplo, n, nb, a, lda, e, tau, w, ldw )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: lda, ldw, n, nb
+           ! Array Arguments 
+           real(dp), intent(out) :: e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), w(ldw,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iw
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=0 )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! reduce last nb columns of upper triangle
+              loop_10: do i = n, n - nb + 1, -1
+                 iw = i - n + nb
+                 if( i<n ) then
+                    ! update a(1:i,i)
+                    a( i, i ) = real( a( i, i ),KIND=dp)
+                    call stdlib_zlacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_zgemv( 'NO TRANSPOSE', i, n-i, -cone, a( 1, i+1 ),lda, w( i, iw+1 &
+                              ), ldw, cone, a( 1, i ), 1 )
+                    call stdlib_zlacgv( n-i, w( i, iw+1 ), ldw )
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_zgemv( 'NO TRANSPOSE', i, n-i, -cone, w( 1, iw+1 ),ldw, a( i, i+1 &
+                              ), lda, cone, a( 1, i ), 1 )
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i ) = real( a( i, i ),KIND=dp)
+                 end if
+                 if( i>1 ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(1:i-2,i)
+                    alpha = a( i-1, i )
+                    call stdlib_zlarfg( i-1, alpha, a( 1, i ), 1, tau( i-1 ) )
+                    e( i-1 ) = real( alpha,KIND=dp)
+                    a( i-1, i ) = cone
+                    ! compute w(1:i-1,i)
+                    call stdlib_zhemv( 'UPPER', i-1, cone, a, lda, a( 1, i ), 1,czero, w( 1, iw ),&
+                               1 )
+                    if( i<n ) then
+                       call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,w( 1, iw+1 ), ldw,&
+                                  a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_zgemv( 'NO TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                       call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, cone,a( 1, i+1 ), lda, &
+                                 a( 1, i ), 1, czero,w( i+1, iw ), 1 )
+                       call stdlib_zgemv( 'NO TRANSPOSE', i-1, n-i, -cone,w( 1, iw+1 ), ldw, w( i+&
+                                 1, iw ), 1, cone,w( 1, iw ), 1 )
+                    end if
+                    call stdlib_zscal( i-1, tau( i-1 ), w( 1, iw ), 1 )
+                    alpha = -chalf*tau( i-1 )*stdlib_zdotc( i-1, w( 1, iw ), 1,a( 1, i ), 1 )
+                              
+                    call stdlib_zaxpy( i-1, alpha, a( 1, i ), 1, w( 1, iw ), 1 )
+                 end if
+              end do loop_10
+           else
+              ! reduce first nb columns of lower triangle
+              loop_20: do i = 1, nb
+                 ! update a(i:n,i)
+                 a( i, i ) = real( a( i, i ),KIND=dp)
+                 call stdlib_zlacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, a( i, 1 ),lda, w( i, 1 ), &
+                           ldw, cone, a( i, i ), 1 )
+                 call stdlib_zlacgv( i-1, w( i, 1 ), ldw )
+                 call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, w( i, 1 ),ldw, a( i, 1 ), &
+                           lda, cone, a( i, i ), 1 )
+                 call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                 a( i, i ) = real( a( i, i ),KIND=dp)
+                 if( i<n ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:n,i)
+                    alpha = a( i+1, i )
+                    call stdlib_zlarfg( n-i, alpha, a( min( i+2, n ), i ), 1,tau( i ) )
+                    e( i ) = real( alpha,KIND=dp)
+                    a( i+1, i ) = cone
+                    ! compute w(i+1:n,i)
+                    call stdlib_zhemv( 'LOWER', n-i, cone, a( i+1, i+1 ), lda,a( i+1, i ), 1, &
+                              czero, w( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,w( i+1, 1 ), ldw, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-i, i-1, -cone, a( i+1, 1 ),lda, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,w( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-i, i-1, -cone, w( i+1, 1 ),ldw, w( 1, i )&
+                              , 1, cone, w( i+1, i ), 1 )
+                    call stdlib_zscal( n-i, tau( i ), w( i+1, i ), 1 )
+                    alpha = -chalf*tau( i )*stdlib_zdotc( n-i, w( i+1, i ), 1,a( i+1, i ), 1 )
+                              
+                    call stdlib_zaxpy( n-i, alpha, a( i+1, i ), 1, w( i+1, i ), 1 )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_zlatrd
+
+     !> ZLATRS: solves one of the triangular systems
+     !> A * x = s*b,  A**T * x = s*b,  or  A**H * x = s*b,
+     !> with scaling to prevent overflow.  Here A is an upper or lower
+     !> triangular matrix, A**T denotes the transpose of A, A**H denotes the
+     !> conjugate transpose of A, x and b are n-element vectors, and s is a
+     !> scaling factor, usually less than or equal to 1, chosen so that the
+     !> components of x will be less than the overflow threshold.  If the
+     !> unscaled problem will not cause overflow, the Level 2 BLAS routine
+     !> ZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j),
+     !> then s is set to 0 and a non-trivial solution to A*x = 0 is returned.
+
+     pure subroutine stdlib_zlatrs( uplo, trans, diag, normin, n, a, lda, x, scale,cnorm, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, normin, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           real(dp), intent(inout) :: cnorm(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           integer(ilp) :: i, imax, j, jfirst, jinc, jlast
+           real(dp) :: bignum, grow, rec, smlnum, tjj, tmax, tscal, xbnd, xj, xmax
+           complex(dp) :: csumj, tjjs, uscal, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1, cabs2
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           cabs2( zdum ) = abs( real( zdum,KIND=dp) / 2._dp ) +abs( aimag( zdum ) / 2._dp )
+                     
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           ! test the input parameters.
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( .not.stdlib_lsame( normin, 'Y' ) .and. .not.stdlib_lsame( normin, 'N' ) ) &
+                     then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLATRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine machine dependent parameters to control overflow.
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = smlnum / stdlib_dlamch( 'PRECISION' )
+           bignum = one / smlnum
+           scale = one
+           if( stdlib_lsame( normin, 'N' ) ) then
+              ! compute the 1-norm of each column, not including the diagonal.
+              if( upper ) then
+                 ! a is upper triangular.
+                 do j = 1, n
+                    cnorm( j ) = stdlib_dzasum( j-1, a( 1, j ), 1 )
+                 end do
+              else
+                 ! a is lower triangular.
+                 do j = 1, n - 1
+                    cnorm( j ) = stdlib_dzasum( n-j, a( j+1, j ), 1 )
+                 end do
+                 cnorm( n ) = zero
+              end if
+           end if
+           ! scale the column norms by tscal if the maximum element in cnorm is
+           ! greater than bignum/2.
+           imax = stdlib_idamax( n, cnorm, 1 )
+           tmax = cnorm( imax )
+           if( tmax<=bignum*half ) then
+              tscal = one
+           else
+              tscal = half / ( smlnum*tmax )
+              call stdlib_dscal( n, tscal, cnorm, 1 )
+           end if
+           ! compute a bound on the computed solution vector to see if the
+           ! level 2 blas routine stdlib_ztrsv can be used.
+           xmax = zero
+           do j = 1, n
+              xmax = max( xmax, cabs2( x( j ) ) )
+           end do
+           xbnd = xmax
+           if( notran ) then
+              ! compute the growth in a * x = b.
+              if( upper ) then
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              else
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 60
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, g(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = g(j-1) / abs(a(j,j))
+                       xbnd = min( xbnd, min( one, tjj )*grow )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                    if( tjj+cnorm( j )>=smlnum ) then
+                       ! g(j) = g(j-1)*( 1 + cnorm(j) / abs(a(j,j)) )
+                       grow = grow*( tjj / ( tjj+cnorm( j ) ) )
+                    else
+                       ! g(j) could overflow, set grow to 0.
+                       grow = zero
+                    end if
+                 end do
+                 grow = xbnd
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 60
+                    ! g(j) = g(j-1)*( 1 + cnorm(j) )
+                    grow = grow*( one / ( one+cnorm( j ) ) )
+                 end do
+              end if
+              60 continue
+           else
+              ! compute the growth in a**t * x = b  or  a**h * x = b.
+              if( upper ) then
+                 jfirst = 1
+                 jlast = n
+                 jinc = 1
+              else
+                 jfirst = n
+                 jlast = 1
+                 jinc = -1
+              end if
+              if( tscal/=one ) then
+                 grow = zero
+                 go to 90
+              end if
+              if( nounit ) then
+                 ! a is non-unit triangular.
+                 ! compute grow = 1/g(j) and xbnd = 1/m(j).
+                 ! initially, m(0) = max{x(i), i=1,...,n}.
+                 grow = half / max( xbnd, smlnum )
+                 xbnd = grow
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = max( g(j-1), m(j-1)*( 1 + cnorm(j) ) )
+                    xj = one + cnorm( j )
+                    grow = min( grow, xbnd / xj )
+                    tjjs = a( j, j )
+                    tjj = cabs1( tjjs )
+                    if( tjj>=smlnum ) then
+                       ! m(j) = m(j-1)*( 1 + cnorm(j) ) / abs(a(j,j))
+                       if( xj>tjj )xbnd = xbnd*( tjj / xj )
+                    else
+                       ! m(j) could overflow, set xbnd to 0.
+                       xbnd = zero
+                    end if
+                 end do
+                 grow = min( grow, xbnd )
+              else
+                 ! a is unit triangular.
+                 ! compute grow = 1/g(j), where g(0) = max{x(i), i=1,...,n}.
+                 grow = min( one, half / max( xbnd, smlnum ) )
+                 do j = jfirst, jlast, jinc
+                    ! exit the loop if the growth factor is too small.
+                    if( grow<=smlnum )go to 90
+                    ! g(j) = ( 1 + cnorm(j) )*g(j-1)
+                    xj = one + cnorm( j )
+                    grow = grow / xj
+                 end do
+              end if
+              90 continue
+           end if
+           if( ( grow*tscal )>smlnum ) then
+              ! use the level 2 blas solve if the reciprocal of the bound on
+              ! elements of x is not too small.
+              call stdlib_ztrsv( uplo, trans, diag, n, a, lda, x, 1 )
+           else
+              ! use a level 1 blas solve, scaling intermediate results.
+              if( xmax>bignum*half ) then
+                 ! scale x so that its components are less than or equal to
+                 ! bignum in absolute value.
+                 scale = ( bignum*half ) / xmax
+                 call stdlib_zdscal( n, scale, x, 1 )
+                 xmax = bignum
+              else
+                 xmax = xmax*two
+              end if
+              if( notran ) then
+                 ! solve a * x = b
+                 loop_120: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) / a(j,j), scaling x if necessary.
+                    xj = cabs1( x( j ) )
+                    if( nounit ) then
+                       tjjs = a( j, j )*tscal
+                    else
+                       tjjs = tscal
+                       if( tscal==one )go to 110
+                    end if
+                    tjj = cabs1( tjjs )
+                    if( tjj>smlnum ) then
+                          ! abs(a(j,j)) > smlnum:
+                       if( tjj<one ) then
+                          if( xj>tjj*bignum ) then
+                                ! scale x by 1/b(j).
+                             rec = one / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else if( tjj>zero ) then
+                          ! 0 < abs(a(j,j)) <= smlnum:
+                       if( xj>tjj*bignum ) then
+                             ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum
+                             ! to avoid overflow when dividing by a(j,j).
+                          rec = ( tjj*bignum ) / xj
+                          if( cnorm( j )>one ) then
+                                ! scale by 1/cnorm(j) to avoid overflow when
+                                ! multiplying x(j) times column j.
+                             rec = rec / cnorm( j )
+                          end if
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                       x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       xj = cabs1( x( j ) )
+                    else
+                          ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                          ! scale = 0, and compute a solution to a*x = 0.
+                       do i = 1, n
+                          x( i ) = zero
+                       end do
+                       x( j ) = one
+                       xj = one
+                       scale = zero
+                       xmax = zero
+                    end if
+                    110 continue
+                    ! scale x if necessary to avoid overflow when adding a
+                    ! multiple of column j of a.
+                    if( xj>one ) then
+                       rec = one / xj
+                       if( cnorm( j )>( bignum-xmax )*rec ) then
+                          ! scale x by 1/(2*abs(x(j))).
+                          rec = rec*half
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                       end if
+                    else if( xj*cnorm( j )>( bignum-xmax ) ) then
+                       ! scale x by 1/2.
+                       call stdlib_zdscal( n, half, x, 1 )
+                       scale = scale*half
+                    end if
+                    if( upper ) then
+                       if( j>1 ) then
+                          ! compute the update
+                             ! x(1:j-1) := x(1:j-1) - x(j) * a(1:j-1,j)
+                          call stdlib_zaxpy( j-1, -x( j )*tscal, a( 1, j ), 1, x,1 )
+                          i = stdlib_izamax( j-1, x, 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    else
+                       if( j<n ) then
+                          ! compute the update
+                             ! x(j+1:n) := x(j+1:n) - x(j) * a(j+1:n,j)
+                          call stdlib_zaxpy( n-j, -x( j )*tscal, a( j+1, j ), 1,x( j+1 ), 1 )
+                                    
+                          i = j + stdlib_izamax( n-j, x( j+1 ), 1 )
+                          xmax = cabs1( x( i ) )
+                       end if
+                    end if
+                 end do loop_120
+              else if( stdlib_lsame( trans, 'T' ) ) then
+                 ! solve a**t * x = b
+                 loop_170: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotu to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_zdotu( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_zdotu( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( a( i, j )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = a( j, j )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 160
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**t *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       160 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_170
+              else
+                 ! solve a**h * x = b
+                 loop_220: do j = jfirst, jlast, jinc
+                    ! compute x(j) = b(j) - sum a(k,j)*x(k).
+                                          ! k<>j
+                    xj = cabs1( x( j ) )
+                    uscal = tscal
+                    rec = one / max( xmax, one )
+                    if( cnorm( j )>( bignum-xj )*rec ) then
+                       ! if x(j) could overflow, scale x by 1/(2*xmax).
+                       rec = rec*half
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                       end if
+                       tjj = cabs1( tjjs )
+                       if( tjj>one ) then
+                             ! divide by a(j,j) when scaling x if a(j,j) > 1.
+                          rec = min( one, rec*tjj )
+                          uscal = stdlib_zladiv( uscal, tjjs )
+                       end if
+                       if( rec<one ) then
+                          call stdlib_zdscal( n, rec, x, 1 )
+                          scale = scale*rec
+                          xmax = xmax*rec
+                       end if
+                    end if
+                    csumj = zero
+                    if( uscal==cmplx( one,KIND=dp) ) then
+                       ! if the scaling needed for a in the dot product is 1,
+                       ! call stdlib_zdotc to perform the dot product.
+                       if( upper ) then
+                          csumj = stdlib_zdotc( j-1, a( 1, j ), 1, x, 1 )
+                       else if( j<n ) then
+                          csumj = stdlib_zdotc( n-j, a( j+1, j ), 1, x( j+1 ), 1 )
+                       end if
+                    else
+                       ! otherwise, use in-line code for the dot product.
+                       if( upper ) then
+                          do i = 1, j - 1
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       else if( j<n ) then
+                          do i = j + 1, n
+                             csumj = csumj + ( conjg( a( i, j ) )*uscal )*x( i )
+                          end do
+                       end if
+                    end if
+                    if( uscal==cmplx( tscal,KIND=dp) ) then
+                       ! compute x(j) := ( x(j) - csumj ) / a(j,j) if 1/a(j,j)
+                       ! was not used to scale the dotproduct.
+                       x( j ) = x( j ) - csumj
+                       xj = cabs1( x( j ) )
+                       if( nounit ) then
+                          tjjs = conjg( a( j, j ) )*tscal
+                       else
+                          tjjs = tscal
+                          if( tscal==one )go to 210
+                       end if
+                          ! compute x(j) = x(j) / a(j,j), scaling if necessary.
+                       tjj = cabs1( tjjs )
+                       if( tjj>smlnum ) then
+                             ! abs(a(j,j)) > smlnum:
+                          if( tjj<one ) then
+                             if( xj>tjj*bignum ) then
+                                   ! scale x by 1/abs(x(j)).
+                                rec = one / xj
+                                call stdlib_zdscal( n, rec, x, 1 )
+                                scale = scale*rec
+                                xmax = xmax*rec
+                             end if
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else if( tjj>zero ) then
+                             ! 0 < abs(a(j,j)) <= smlnum:
+                          if( xj>tjj*bignum ) then
+                                ! scale x by (1/abs(x(j)))*abs(a(j,j))*bignum.
+                             rec = ( tjj*bignum ) / xj
+                             call stdlib_zdscal( n, rec, x, 1 )
+                             scale = scale*rec
+                             xmax = xmax*rec
+                          end if
+                          x( j ) = stdlib_zladiv( x( j ), tjjs )
+                       else
+                             ! a(j,j) = 0:  set x(1:n) = 0, x(j) = 1, and
+                             ! scale = 0 and compute a solution to a**h *x = 0.
+                          do i = 1, n
+                             x( i ) = zero
+                          end do
+                          x( j ) = one
+                          scale = zero
+                          xmax = zero
+                       end if
+                       210 continue
+                    else
+                       ! compute x(j) := x(j) / a(j,j) - csumj if the dot
+                       ! product has already been divided by 1/a(j,j).
+                       x( j ) = stdlib_zladiv( x( j ), tjjs ) - csumj
+                    end if
+                    xmax = max( xmax, cabs1( x( j ) ) )
+                 end do loop_220
+              end if
+              scale = scale / tscal
+           end if
+           ! scale the column norms by 1/tscal for return.
+           if( tscal/=one ) then
+              call stdlib_dscal( n, one / tscal, cnorm, 1 )
+           end if
+           return
+     end subroutine stdlib_zlatrs
+
+     !> ZLATRZ: factors the M-by-(M+L) complex upper trapezoidal matrix
+     !> [ A1 A2 ] = [ A(1:M,1:M) A(1:M,N-L+1:N) ] as ( R  0 ) * Z by means
+     !> of unitary transformations, where  Z is an (M+L)-by-(M+L) unitary
+     !> matrix and, R and A1 are M-by-M upper triangular matrices.
+
+     pure subroutine stdlib_zlatrz( m, n, l, a, lda, tau, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: l, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           do i = m, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! [ a(i,i) a(i,n-l+1:n) ]
+              call stdlib_zlacgv( l, a( i, n-l+1 ), lda )
+              alpha = conjg( a( i, i ) )
+              call stdlib_zlarfg( l+1, alpha, a( i, n-l+1 ), lda, tau( i ) )
+              tau( i ) = conjg( tau( i ) )
+              ! apply h(i) to a(1:i-1,i:n) from the right
+              call stdlib_zlarz( 'RIGHT', i-1, n-i+1, l, a( i, n-l+1 ), lda,conjg( tau( i ) ), a( &
+                        1, i ), lda, work )
+              a( i, i ) = conjg( alpha )
+           end do
+           return
+     end subroutine stdlib_zlatrz
+
+     !> ZLAUNHR_COL_GETRFNP2: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is at
+     !> least one in absolute value (so that division-by-zero not
+     !> possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the recursive version of the LU factorization algorithm.
+     !> Denote A - S by B. The algorithm divides the matrix B into four
+     !> submatrices:
+     !> [  B11 | B12  ]  where B11 is n1 by n1,
+     !> B = [ -----|----- ]        B21 is (m-n1) by n1,
+     !> [  B21 | B22  ]        B12 is n1 by n2,
+     !> B22 is (m-n1) by n2,
+     !> with n1 = min(m,n)/2, n2 = n-n1.
+     !> The subroutine calls itself to factor B11, solves for B21,
+     !> solves for B12, updates B22, then calls itself to factor B22.
+     !> For more details on the recursive LU algorithm, see [2].
+     !> ZLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+     !> routine ZLAUNHR_COL_GETRFNP, which uses blocked code calling
+     !> Level 3 BLAS to update the submatrix. However, ZLAUNHR_COL_GETRFNP2
+     !> is self-sufficient and can be used without ZLAUNHR_COL_GETRFNP.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+     !> [2] "Recursion leads to automatic variable blocking for dense linear
+     !> algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+     !> vol. 41, no. 6, pp. 737-755, 1997.
+
+     pure recursive subroutine stdlib_zlaunhr_col_getrfnp2( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: sfmin
+           integer(ilp) :: i, iinfo, n1, n2
+           complex(dp) :: z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,sign,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUNHR_COL_GETRFNP2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           if ( m==1 ) then
+              ! one row case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=dp) ),KIND=dp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+           else if( n==1 ) then
+              ! one column case, (also recursion termination case),
+              ! use unblocked code
+              ! transfer the sign
+              d( 1 ) = cmplx( -sign( one, real( a( 1, 1 ),KIND=dp) ),KIND=dp)
+              ! construct the row of u
+              a( 1, 1 ) = a( 1, 1 ) - d( 1 )
+              ! scale the elements 2:m of the column
+              ! determine machine safe minimum
+              sfmin = stdlib_dlamch('S')
+              ! construct the subdiagonal elements of l
+              if( cabs1( a( 1, 1 ) ) >= sfmin ) then
+                 call stdlib_zscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+              else
+                 do i = 2, m
+                    a( i, 1 ) = a( i, 1 ) / a( 1, 1 )
+                 end do
+              end if
+           else
+              ! divide the matrix b into four submatrices
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+              ! factor b11, recursive call
+              call stdlib_zlaunhr_col_getrfnp2( n1, n1, a, lda, d, iinfo )
+              ! solve for b21
+              call stdlib_ztrsm( 'R', 'U', 'N', 'N', m-n1, n1, cone, a, lda,a( n1+1, 1 ), lda )
+                        
+              ! solve for b12
+              call stdlib_ztrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update b22, i.e. compute the schur complement
+              ! b22 := b22 - b21*b12
+              call stdlib_zgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor b22, recursive call
+              call stdlib_zlaunhr_col_getrfnp2( m-n1, n2, a( n1+1, n1+1 ), lda,d( n1+1 ), iinfo )
+                        
+           end if
+           return
+     end subroutine stdlib_zlaunhr_col_getrfnp2
+
+     !> ZLAUU2: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the unblocked form of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zlauu2( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           real(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUU2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the product u * u**h.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=dp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_zdotc( n-i, a( i, i+1 ), lda,a( i, i+1 ), &
+                              lda ),KIND=dp)
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_zgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, cmplx( aii,KIND=dp),a( 1, i ), 1 )
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                 else
+                    call stdlib_zdscal( i, aii, a( 1, i ), 1 )
+                 end if
+              end do
+           else
+              ! compute the product l**h * l.
+              do i = 1, n
+                 aii = real( a( i, i ),KIND=dp)
+                 if( i<n ) then
+                    a( i, i ) = aii*aii + real( stdlib_zdotc( n-i, a( i+1, i ), 1,a( i+1, i ), 1 )&
+                              ,KIND=dp)
+                    call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1,cmplx( aii,KIND=dp), a( i, 1 ), lda )
+                    call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                 else
+                    call stdlib_zdscal( i, aii, a( i, 1 ), lda )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zlauu2
+
+     !> ZLAUUM: computes the product U * U**H or L**H * L, where the triangular
+     !> factor U or L is stored in the upper or lower triangular part of
+     !> the array A.
+     !> If UPLO = 'U' or 'u' then the upper triangle of the result is stored,
+     !> overwriting the factor U in A.
+     !> If UPLO = 'L' or 'l' then the lower triangle of the result is stored,
+     !> overwriting the factor L in A.
+     !> This is the blocked form of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zlauum( uplo, n, a, lda, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ib, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUUM', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZLAUUM', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_zlauu2( uplo, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute the product u * u**h.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', i-1, &
+                              ib, cone, a( i, i ), lda,a( 1, i ), lda )
+                    call stdlib_zlauu2( 'UPPER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',i-1, ib, n-i-ib+1,&
+                                  cone, a( 1, i+ib ),lda, a( i, i+ib ), lda, cone, a( 1, i ),lda )
+                       call stdlib_zherk( 'UPPER', 'NO TRANSPOSE', ib, n-i-ib+1,one, a( i, i+ib ),&
+                                  lda, one, a( i, i ),lda )
+                    end if
+                 end do
+              else
+                 ! compute the product l**h * l.
+                 do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    call stdlib_ztrmm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', ib, i-1,&
+                               cone, a( i, i ), lda,a( i, 1 ), lda )
+                    call stdlib_zlauu2( 'LOWER', ib, a( i, i ), lda, info )
+                    if( i+ib<=n ) then
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', ib,i-1, n-i-ib+1,&
+                                  cone, a( i+ib, i ), lda,a( i+ib, 1 ), lda, cone, a( i, 1 ), lda )
+                       call stdlib_zherk( 'LOWER', 'CONJUGATE TRANSPOSE', ib,n-i-ib+1, one, a( i+&
+                                 ib, i ), lda, one,a( i, i ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zlauum
+
+     !> ZPBCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite band matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> ZPBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zpbcon( uplo, n, kd, ab, ldab, anorm, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_zlatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scalel, rwork,info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_zlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_zlatbs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kd, ab, ldab, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_zlatbs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kd, ab,&
+                            ldab, work, scaleu, rwork,info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_izamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_zdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_zpbcon
+
+     !> ZPBEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite band matrix A and reduce its condition
+     !> number (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_zpbequ( uplo, n, kd, ab, ldab, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(out) :: s(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           if( upper ) then
+              j = kd + 1
+           else
+              j = 1
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ab( j, 1 ),KIND=dp)
+           smin = s( 1 )
+           amax = s( 1 )
+           ! find the minimum and maximum diagonal elements.
+           do i = 2, n
+              s( i ) = real( ab( j, i ),KIND=dp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_zpbequ
+
+     !> ZPBSTF: computes a split Cholesky factorization of a complex
+     !> Hermitian positive definite band matrix A.
+     !> This routine is designed to be used in conjunction with ZHBGST.
+     !> The factorization has the form  A = S**H*S  where S is a band matrix
+     !> of the same bandwidth as A and the following structure:
+     !> S = ( U    )
+     !> ( M  L )
+     !> where U is upper triangular of order m = (n+kd)/2, and L is lower
+     !> triangular of order n-m.
+
+     pure subroutine stdlib_zpbstf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, km, m
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           ! set the splitting point m.
+           m = ( n+kd ) / 2
+           if( upper ) then
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th column and update the
+                 ! the leading submatrix within the band.
+                 call stdlib_zdscal( km, one / ajj, ab( kd+1-km, j ), 1 )
+                 call stdlib_zher( 'UPPER', km, -one, ab( kd+1-km, j ), 1,ab( kd+1, j-km ), kld )
+                           
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_zdscal( km, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_zlacgv( km, ab( kd, j+1 ), kld )
+                    call stdlib_zher( 'UPPER', km, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_zlacgv( km, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! factorize a(m+1:n,m+1:n) as l**h*l, and update a(1:m,1:m).
+              do j = n, m + 1, -1
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( j-1, kd )
+                 ! compute elements j-km:j-1 of the j-th row and update the
+                 ! trailing submatrix within the band.
+                 call stdlib_zdscal( km, one / ajj, ab( km+1, j-km ), kld )
+                 call stdlib_zlacgv( km, ab( km+1, j-km ), kld )
+                 call stdlib_zher( 'LOWER', km, -one, ab( km+1, j-km ), kld,ab( 1, j-km ), kld )
+                           
+                 call stdlib_zlacgv( km, ab( km+1, j-km ), kld )
+              end do
+              ! factorize the updated submatrix a(1:m,1:m) as u**h*u.
+              do j = 1, m
+                 ! compute s(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 50
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 km = min( kd, m-j )
+                 ! compute elements j+1:j+km of the j-th column and update the
+                 ! trailing submatrix within the band.
+                 if( km>0 ) then
+                    call stdlib_zdscal( km, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_zher( 'LOWER', km, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           50 continue
+           info = j
+           return
+     end subroutine stdlib_zpbstf
+
+     !> ZPBTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix, U**H is the conjugate transpose
+     !> of U, and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zpbtf2( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, kld, kn
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           kld = max( 1, ldab-1 )
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( kd+1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( kd+1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( kd+1, j ) = ajj
+                 ! compute elements j+1:j+kn of row j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_zdscal( kn, one / ajj, ab( kd, j+1 ), kld )
+                    call stdlib_zlacgv( kn, ab( kd, j+1 ), kld )
+                    call stdlib_zher( 'UPPER', kn, -one, ab( kd, j+1 ), kld,ab( kd+1, j+1 ), kld )
+                              
+                    call stdlib_zlacgv( kn, ab( kd, j+1 ), kld )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ab( 1, j ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ab( 1, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ab( 1, j ) = ajj
+                 ! compute elements j+1:j+kn of column j and update the
+                 ! trailing submatrix within the band.
+                 kn = min( kd, n-j )
+                 if( kn>0 ) then
+                    call stdlib_zdscal( kn, one / ajj, ab( 2, j ), 1 )
+                    call stdlib_zher( 'LOWER', kn, -one, ab( 2, j ), 1,ab( 1, j+1 ), kld )
+                 end if
+              end do
+           end if
+           return
+           30 continue
+           info = j
+           return
+     end subroutine stdlib_zpbtf2
+
+     !> ZPBTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite band matrix A using the Cholesky factorization
+     !> A = U**H *U or A = L*L**H computed by ZPBTRF.
+
+     pure subroutine stdlib_zpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              do j = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+              end do
+           else
+              ! solve a*x = b where a = l*l**h.
+              do j = 1, nrhs
+                 ! solve l*x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, kd, ab,ldab, b( 1, j )&
+                           , 1 )
+                 ! solve l**h *x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kd, ab, ldab, b(&
+                            1, j ), 1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_zpbtrs
+
+     !> ZPOCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite matrix using the
+     !> Cholesky factorization A = U**H*U or A = L*L**H computed by ZPOTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zpocon( uplo, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of inv(a).
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_zlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_zlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_zlatrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_zlatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_izamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_zdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_zpocon
+
+     !> ZPOEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_zpoequ( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(out) :: s(*)
+           complex(dp), intent(in) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=dp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=dp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_zpoequ
+
+     !> ZPOEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A and reduce its condition number
+     !> (with respect to the two-norm).  S contains the scale factors,
+     !> S(i) = 1/sqrt(A(i,i)), chosen so that the scaled matrix B with
+     !> elements B(i,j) = S(i)*A(i,j)*S(j) has ones on the diagonal.  This
+     !> choice of S puts the condition number of B within a factor N of the
+     !> smallest possible condition number over all possible diagonal
+     !> scalings.
+     !> This routine differs from ZPOEQU by restricting the scaling factors
+     !> to a power of the radix.  Barring over- and underflow, scaling by
+     !> these factors introduces no additional rounding errors.  However, the
+     !> scaled diagonal entries are no longer approximately 1 but lie
+     !> between sqrt(radix) and 1/sqrt(radix).
+
+     pure subroutine stdlib_zpoequb( n, a, lda, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: smin, base, tmp
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt,log,int,real,aimag
+           ! Executable Statements 
+           ! test the input parameters.
+           ! positive definite only performs 1 pass of equilibration.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOEQUB', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           base = stdlib_dlamch( 'B' )
+           tmp = -0.5e+0_dp / log ( base )
+           ! find the minimum and maximum diagonal elements.
+           s( 1 ) = real( a( 1, 1 ),KIND=dp)
+           smin = s( 1 )
+           amax = s( 1 )
+           do i = 2, n
+              s( i ) = real( a( i, i ),KIND=dp)
+              smin = min( smin, s( i ) )
+              amax = max( amax, s( i ) )
+           end do
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = base ** int( tmp * log( s( i ) ),KIND=ilp)
+              end do
+              ! compute scond = min(s(i)) / max(s(i)).
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_zpoequb
+
+     !> ZPOTF2: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U ,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zpotf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h *u.
+              do j = 1, n
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( a( j, j ),KIND=dp) - real( stdlib_zdotc( j-1, a( 1, j ), 1,a( 1, j ),&
+                            1 ),KIND=dp)
+                 if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j.
+                 if( j<n ) then
+                    call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_zgemv( 'TRANSPOSE', j-1, n-j, -cone, a( 1, j+1 ),lda, a( 1, j ), &
+                              1, cone, a( j, j+1 ), lda )
+                    call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_zdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do
+           else
+              ! compute the cholesky factorization a = l*l**h.
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( a( j, j ),KIND=dp) - real( stdlib_zdotc( j-1, a( j, 1 ), lda,a( j, 1 &
+                           ), lda ),KIND=dp)
+                 if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                    a( j, j ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j.
+                 if( j<n ) then
+                    call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 )&
+                              , lda, cone, a( j+1, j ), 1 )
+                    call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_zdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_zpotf2
+
+     !> ZPOTRF2: computes the Cholesky factorization of a Hermitian
+     !> positive definite matrix A using the recursive algorithm.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = n/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> The subroutine calls itself to factor A11. Update and scale A21
+     !> or A12, update A22 then call itself to factor A22.
+
+     pure recursive subroutine stdlib_zpotrf2( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: n1, n2, iinfo
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max,real,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! n=1 case
+           if( n==1 ) then
+              ! test for non-positive-definiteness
+              ajj = real( a( 1, 1 ),KIND=dp)
+              if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                 info = 1
+                 return
+              end if
+              ! factor
+              a( 1, 1 ) = sqrt( ajj )
+           ! use recursive code
+           else
+              n1 = n/2
+              n2 = n-n1
+              ! factor a11
+              call stdlib_zpotrf2( uplo, n1, a( 1, 1 ), lda, iinfo )
+              if ( iinfo/=0 ) then
+                 info = iinfo
+                 return
+              end if
+              ! compute the cholesky factorization a = u**h*u
+              if( upper ) then
+                 ! update and scale a12
+                 call stdlib_ztrsm( 'L', 'U', 'C', 'N', n1, n2, cone,a( 1, 1 ), lda, a( 1, n1+1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_zherk( uplo, 'C', n2, n1, -one, a( 1, n1+1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_zpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              ! compute the cholesky factorization a = l*l**h
+              else
+                 ! update and scale a21
+                 call stdlib_ztrsm( 'R', 'L', 'C', 'N', n2, n1, cone,a( 1, 1 ), lda, a( n1+1, 1 ),&
+                            lda )
+                 ! update and factor a22
+                 call stdlib_zherk( uplo, 'N', n2, n1, -one, a( n1+1, 1 ), lda,one, a( n1+1, n1+1 &
+                           ), lda )
+                 call stdlib_zpotrf2( uplo, n2, a( n1+1, n1+1 ), lda, iinfo )
+                 if ( iinfo/=0 ) then
+                    info = iinfo + n1
+                    return
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zpotrf2
+
+     !> ZPOTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H * U or A = L * L**H computed by ZPOTRF.
+
+     pure subroutine stdlib_zpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h *u.
+              ! solve u**h *x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a*x = b where a = l*l**h.
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+              ! solve l**h *x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n, nrhs, cone,&
+                         a, lda, b, ldb )
+           end if
+           return
+     end subroutine stdlib_zpotrs
+
+     !> ZPPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite packed matrix using
+     !> the Cholesky factorization A = U**H*U or A = L*L**H computed by
+     !> ZPPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zppcon( uplo, n, ap, anorm, rcond, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           character :: normin
+           integer(ilp) :: ix, kase
+           real(dp) :: ainvnm, scale, scalel, scaleu, smlnum
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           normin = 'N'
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( upper ) then
+                 ! multiply by inv(u**h).
+                 call stdlib_zlatps( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(u).
+                 call stdlib_zlatps( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scaleu, rwork, info )
+              else
+                 ! multiply by inv(l).
+                 call stdlib_zlatps( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,ap, work, &
+                           scalel, rwork, info )
+                 normin = 'Y'
+                 ! multiply by inv(l**h).
+                 call stdlib_zlatps( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, ap, &
+                           work, scaleu, rwork, info )
+              end if
+              ! multiply by 1/scale if doing so will not cause overflow.
+              scale = scalel*scaleu
+              if( scale/=one ) then
+                 ix = stdlib_izamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_zdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_zppcon
+
+     !> ZPPEQU: computes row and column scalings intended to equilibrate a
+     !> Hermitian positive definite matrix A in packed storage and reduce
+     !> its condition number (with respect to the two-norm).  S contains the
+     !> scale factors, S(i)=1/sqrt(A(i,i)), chosen so that the scaled matrix
+     !> B with elements B(i,j)=S(i)*A(i,j)*S(j) has ones on the diagonal.
+     !> This choice of S puts the condition number of B within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_zppequ( uplo, n, ap, s, scond, amax, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: amax, scond
+           ! Array Arguments 
+           real(dp), intent(out) :: s(*)
+           complex(dp), intent(in) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, jj
+           real(dp) :: smin
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPEQU', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              scond = one
+              amax = zero
+              return
+           end if
+           ! initialize smin and amax.
+           s( 1 ) = real( ap( 1 ),KIND=dp)
+           smin = s( 1 )
+           amax = s( 1 )
+           if( upper ) then
+              ! uplo = 'u':  upper triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + i
+                 s( i ) = real( ap( jj ),KIND=dp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           else
+              ! uplo = 'l':  lower triangle of a is stored.
+              ! find the minimum and maximum diagonal elements.
+              jj = 1
+              do i = 2, n
+                 jj = jj + n - i + 2
+                 s( i ) = real( ap( jj ),KIND=dp)
+                 smin = min( smin, s( i ) )
+                 amax = max( amax, s( i ) )
+              end do
+           end if
+           if( smin<=zero ) then
+              ! find the first non-positive diagonal element and return.
+              do i = 1, n
+                 if( s( i )<=zero ) then
+                    info = i
+                    return
+                 end if
+              end do
+           else
+              ! set the scale factors to the reciprocals
+              ! of the diagonal elements.
+              do i = 1, n
+                 s( i ) = one / sqrt( s( i ) )
+              end do
+              ! compute scond = min(s(i)) / max(s(i))
+              scond = sqrt( smin ) / sqrt( amax )
+           end if
+           return
+     end subroutine stdlib_zppequ
+
+     !> ZPPTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A stored in packed format.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_zpptrf( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! compute the cholesky factorization a = u**h * u.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 ! compute elements 1:j-1 of column j.
+                 if( j>1 )call stdlib_ztpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',j-1, ap, &
+                           ap( jc ), 1 )
+                 ! compute u(j,j) and test for non-positive-definiteness.
+                 ajj = real( ap( jj ),KIND=dp) - real( stdlib_zdotc( j-1,ap( jc ), 1, ap( jc ), 1 &
+                           ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ap( jj ) = sqrt( ajj )
+              end do
+           else
+              ! compute the cholesky factorization a = l * l**h.
+              jj = 1
+              do j = 1, n
+                 ! compute l(j,j) and test for non-positive-definiteness.
+                 ajj = real( ap( jj ),KIND=dp)
+                 if( ajj<=zero ) then
+                    ap( jj ) = ajj
+                    go to 30
+                 end if
+                 ajj = sqrt( ajj )
+                 ap( jj ) = ajj
+                 ! compute elements j+1:n of column j and update the trailing
+                 ! submatrix.
+                 if( j<n ) then
+                    call stdlib_zdscal( n-j, one / ajj, ap( jj+1 ), 1 )
+                    call stdlib_zhpr( 'LOWER', n-j, -one, ap( jj+1 ), 1,ap( jj+n-j+1 ) )
+                    jj = jj + n - j + 1
+                 end if
+              end do
+           end if
+           go to 40
+           30 continue
+           info = j
+           40 continue
+           return
+     end subroutine stdlib_zpptrf
+
+     !> ZPPTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A in packed storage using the Cholesky
+     !> factorization A = U**H * U or A = L * L**H computed by ZPPTRF.
+
+     pure subroutine stdlib_zpptrs( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b where a = u**h * u.
+              do i = 1, nrhs
+                 ! solve u**h *x = b, overwriting b with x.
+                 call stdlib_ztpsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ztpsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+              end do
+           else
+              ! solve a*x = b where a = l * l**h.
+              do i = 1, nrhs
+                 ! solve l*y = b, overwriting b with x.
+                 call stdlib_ztpsv( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', n, ap,b( 1, i ), 1 )
+                           
+                 ! solve l**h *x = y, overwriting b with x.
+                 call stdlib_ztpsv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,ap, b( 1, i ), &
+                           1 )
+              end do
+           end if
+           return
+     end subroutine stdlib_zpptrs
+
+     !> ZPSTF2: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 2 BLAS.
+
+     pure subroutine stdlib_zpstf2( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: ztemp
+           real(dp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPSTF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize piv
+           do i = 1, n
+              piv( i ) = i
+           end do
+           ! compute stopping value
+           do i = 1, n
+              work( i ) = real( a( i, i ),KIND=dp)
+           end do
+           pvt = maxloc( work( 1:n ), 1 )
+           ajj = real( a( pvt, pvt ),KIND=dp)
+           if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+              rank = 0
+              info = 1
+              go to 200
+           end if
+           ! compute stopping value if not supplied
+           if( tol<zero ) then
+              dstop = n * stdlib_dlamch( 'EPSILON' ) * ajj
+           else
+              dstop = tol
+           end if
+           ! set first chalf of work to zero, holds dot products
+           do i = 1, n
+              work( i ) = 0
+           end do
+           if( upper ) then
+              ! compute the cholesky factorization p**t * a * p = u**h* u
+              loop_150: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),KIND=dp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=dp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_zswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                    if( pvt<n )call stdlib_zswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ), lda )
+                              
+                    do i = j + 1, pvt - 1
+                       ztemp = conjg( a( j, i ) )
+                       a( j, i ) = conjg( a( i, pvt ) )
+                       a( i, pvt ) = ztemp
+                    end do
+                    a( j, pvt ) = conjg( a( j, pvt ) )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of row j
+                 if( j<n ) then
+                    call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_zgemv( 'TRANS', j-1, n-j, -cone, a( 1, j+1 ), lda,a( 1, j ), 1, &
+                              cone, a( j, j+1 ), lda )
+                    call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                    call stdlib_zdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                 end if
+              end do loop_150
+           else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+              loop_180: do j = 1, n
+              ! find pivot, test for exit, else swap rows and columns
+              ! update dot products, compute possible pivots which are
+              ! stored in the second chalf of work
+                 do i = j, n
+                    if( j>1 ) then
+                       work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),KIND=dp)
+                                 
+                    end if
+                    work( n+i ) = real( a( i, i ),KIND=dp) - work( i )
+                 end do
+                 if( j>1 ) then
+                    itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                    pvt = itemp + j - 1
+                    ajj = work( n+pvt )
+                    if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                       a( j, j ) = ajj
+                       go to 190
+                    end if
+                 end if
+                 if( j/=pvt ) then
+                    ! pivot ok, so can now swap pivot rows and columns
+                    a( pvt, pvt ) = a( j, j )
+                    call stdlib_zswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                    if( pvt<n )call stdlib_zswap( n-pvt, a( pvt+1, j ), 1, a( pvt+1, pvt ),1 )
+                              
+                    do i = j + 1, pvt - 1
+                       ztemp = conjg( a( i, j ) )
+                       a( i, j ) = conjg( a( pvt, i ) )
+                       a( pvt, i ) = ztemp
+                    end do
+                    a( pvt, j ) = conjg( a( pvt, j ) )
+                    ! swap dot products and piv
+                    dtemp = work( j )
+                    work( j ) = work( pvt )
+                    work( pvt ) = dtemp
+                    itemp = piv( pvt )
+                    piv( pvt ) = piv( j )
+                    piv( j ) = itemp
+                 end if
+                 ajj = sqrt( ajj )
+                 a( j, j ) = ajj
+                 ! compute elements j+1:n of column j
+                 if( j<n ) then
+                    call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_zgemv( 'NO TRANS', n-j, j-1, -cone, a( j+1, 1 ),lda, a( j, 1 ), &
+                              lda, cone, a( j+1, j ), 1 )
+                    call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                    call stdlib_zdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                 end if
+              end do loop_180
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 200
+           190 continue
+           ! rank is number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           200 continue
+           return
+     end subroutine stdlib_zpstf2
+
+     !> ZPSTRF: computes the Cholesky factorization with complete
+     !> pivoting of a complex Hermitian positive semidefinite matrix A.
+     !> The factorization has the form
+     !> P**T * A * P = U**H * U ,  if UPLO = 'U',
+     !> P**T * A * P = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular, and
+     !> P is stored as vector PIV.
+     !> This algorithm does not attempt to check that A is positive
+     !> semidefinite. This version of the algorithm calls level 3 BLAS.
+
+     pure subroutine stdlib_zpstrf( uplo, n, a, lda, piv, rank, tol, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: tol
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, n
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           real(dp), intent(out) :: work(2*n)
+           integer(ilp), intent(out) :: piv(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: ztemp
+           real(dp) :: ajj, dstop, dtemp
+           integer(ilp) :: i, itemp, j, jb, k, nb, pvt
+           logical(lk) :: upper
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min,sqrt,maxloc
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPSTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get block size
+           nb = stdlib_ilaenv( 1, 'ZPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_zpstf2( uplo, n, a( 1, 1 ), lda, piv, rank, tol, work,info )
+              go to 230
+           else
+           ! initialize piv
+              do i = 1, n
+                 piv( i ) = i
+              end do
+           ! compute stopping value
+              do i = 1, n
+                 work( i ) = real( a( i, i ),KIND=dp)
+              end do
+              pvt = maxloc( work( 1:n ), 1 )
+              ajj = real( a( pvt, pvt ),KIND=dp)
+              if( ajj<=zero.or.stdlib_disnan( ajj ) ) then
+                 rank = 0
+                 info = 1
+                 go to 230
+              end if
+           ! compute stopping value if not supplied
+              if( tol<zero ) then
+                 dstop = n * stdlib_dlamch( 'EPSILON' ) * ajj
+              else
+                 dstop = tol
+              end if
+              if( upper ) then
+                 ! compute the cholesky factorization p**t * a * p = u**h * u
+                 loop_160: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_150: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( j-1, i ) )*a( j-1, i ),&
+                                       KIND=dp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=dp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_zswap( j-1, a( 1, j ), 1, a( 1, pvt ), 1 )
+                          if( pvt<n )call stdlib_zswap( n-pvt, a( j, pvt+1 ), lda,a( pvt, pvt+1 ),&
+                                     lda )
+                          do i = j + 1, pvt - 1
+                             ztemp = conjg( a( j, i ) )
+                             a( j, i ) = conjg( a( i, pvt ) )
+                             a( i, pvt ) = ztemp
+                          end do
+                          a( j, pvt ) = conjg( a( j, pvt ) )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of row j.
+                       if( j<n ) then
+                          call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_zgemv( 'TRANS', j-k, n-j, -cone, a( k, j+1 ),lda, a( k, j ),&
+                                     1, cone, a( j, j+1 ),lda )
+                          call stdlib_zlacgv( j-1, a( 1, j ), 1 )
+                          call stdlib_zdscal( n-j, one / ajj, a( j, j+1 ), lda )
+                       end if
+                    end do loop_150
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_zherk( 'UPPER', 'CONJ TRANS', n-j+1, jb, -one,a( k, j ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_160
+              else
+              ! compute the cholesky factorization p**t * a * p = l * l**h
+                 loop_210: do k = 1, n, nb
+                    ! account for last block not being nb wide
+                    jb = min( nb, n-k+1 )
+                    ! set relevant part of first chalf of work to zero,
+                    ! holds dot products
+                    do i = k, n
+                       work( i ) = 0
+                    end do
+                    loop_200: do j = k, k + jb - 1
+                    ! find pivot, test for exit, else swap rows and columns
+                    ! update dot products, compute possible pivots which are
+                    ! stored in the second chalf of work
+                       do i = j, n
+                          if( j>k ) then
+                             work( i ) = work( i ) +real( conjg( a( i, j-1 ) )*a( i, j-1 ),&
+                                       KIND=dp)
+                          end if
+                          work( n+i ) = real( a( i, i ),KIND=dp) - work( i )
+                       end do
+                       if( j>1 ) then
+                          itemp = maxloc( work( (n+j):(2*n) ), 1 )
+                          pvt = itemp + j - 1
+                          ajj = work( n+pvt )
+                          if( ajj<=dstop.or.stdlib_disnan( ajj ) ) then
+                             a( j, j ) = ajj
+                             go to 220
+                          end if
+                       end if
+                       if( j/=pvt ) then
+                          ! pivot ok, so can now swap pivot rows and columns
+                          a( pvt, pvt ) = a( j, j )
+                          call stdlib_zswap( j-1, a( j, 1 ), lda, a( pvt, 1 ), lda )
+                          if( pvt<n )call stdlib_zswap( n-pvt, a( pvt+1, j ), 1,a( pvt+1, pvt ), &
+                                    1 )
+                          do i = j + 1, pvt - 1
+                             ztemp = conjg( a( i, j ) )
+                             a( i, j ) = conjg( a( pvt, i ) )
+                             a( pvt, i ) = ztemp
+                          end do
+                          a( pvt, j ) = conjg( a( pvt, j ) )
+                          ! swap dot products and piv
+                          dtemp = work( j )
+                          work( j ) = work( pvt )
+                          work( pvt ) = dtemp
+                          itemp = piv( pvt )
+                          piv( pvt ) = piv( j )
+                          piv( j ) = itemp
+                       end if
+                       ajj = sqrt( ajj )
+                       a( j, j ) = ajj
+                       ! compute elements j+1:n of column j.
+                       if( j<n ) then
+                          call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_zgemv( 'NO TRANS', n-j, j-k, -cone,a( j+1, k ), lda, a( j, &
+                                    k ), lda, cone,a( j+1, j ), 1 )
+                          call stdlib_zlacgv( j-1, a( j, 1 ), lda )
+                          call stdlib_zdscal( n-j, one / ajj, a( j+1, j ), 1 )
+                       end if
+                    end do loop_200
+                    ! update trailing matrix, j already incremented
+                    if( k+jb<=n ) then
+                       call stdlib_zherk( 'LOWER', 'NO TRANS', n-j+1, jb, -one,a( j, k ), lda, &
+                                 one, a( j, j ), lda )
+                    end if
+                 end do loop_210
+              end if
+           end if
+           ! ran to completion, a has full rank
+           rank = n
+           go to 230
+           220 continue
+           ! rank is the number of steps completed.  set info = 1 to signal
+           ! that the factorization cannot be used to solve a system.
+           rank = j - 1
+           info = 1
+           230 continue
+           return
+     end subroutine stdlib_zpstrf
+
+     !> ZPTCON: computes the reciprocal of the condition number (in the
+     !> 1-norm) of a complex Hermitian positive definite tridiagonal matrix
+     !> using the factorization A = L*D*L**H or A = U**H*D*U computed by
+     !> ZPTTRF.
+     !> Norm(inv(A)) is computed by a direct method, and the reciprocal of
+     !> the condition number is computed as
+     !> RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zptcon( n, d, e, anorm, rcond, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ix
+           real(dp) :: ainvnm
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( anorm<zero ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is positive.
+           do i = 1, n
+              if( d( i )<=zero )return
+           end do
+           ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+              ! m(i,j) =  abs(a(i,j)), i = j,
+              ! m(i,j) = -abs(a(i,j)), i .ne. j,
+           ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+           ! solve m(l) * x = e.
+           rwork( 1 ) = one
+           do i = 2, n
+              rwork( i ) = one + rwork( i-1 )*abs( e( i-1 ) )
+           end do
+           ! solve d * m(l)**h * x = b.
+           rwork( n ) = rwork( n ) / d( n )
+           do i = n - 1, 1, -1
+              rwork( i ) = rwork( i ) / d( i ) + rwork( i+1 )*abs( e( i ) )
+           end do
+           ! compute ainvnm = max(x(i)), 1<=i<=n.
+           ix = stdlib_idamax( n, rwork, 1 )
+           ainvnm = abs( rwork( ix ) )
+           ! compute the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zptcon
+
+     !> ZPTTRF: computes the L*D*L**H factorization of a complex Hermitian
+     !> positive definite tridiagonal matrix A.  The factorization may also
+     !> be regarded as having the form A = U**H *D*U.
+
+     pure subroutine stdlib_zpttrf( n, d, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*)
+           complex(dp), intent(inout) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i4
+           real(dp) :: eii, eir, f, g
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+              call stdlib_xerbla( 'ZPTTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the l*d*l**h (or u**h *d*u) factorization of a.
+           i4 = mod( n-1, 4 )
+           do i = 1, i4
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              eir = real( e( i ),KIND=dp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=dp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+           end do
+           loop_20: do i = i4 + 1, n - 4, 4
+              ! drop out of the loop if d(i) <= 0: the matrix is not positive
+              ! definite.
+              if( d( i )<=zero ) then
+                 info = i
+                 go to 30
+              end if
+              ! solve for e(i) and d(i+1).
+              eir = real( e( i ),KIND=dp)
+              eii = aimag( e( i ) )
+              f = eir / d( i )
+              g = eii / d( i )
+              e( i ) = cmplx( f, g,KIND=dp)
+              d( i+1 ) = d( i+1 ) - f*eir - g*eii
+              if( d( i+1 )<=zero ) then
+                 info = i + 1
+                 go to 30
+              end if
+              ! solve for e(i+1) and d(i+2).
+              eir = real( e( i+1 ),KIND=dp)
+              eii = aimag( e( i+1 ) )
+              f = eir / d( i+1 )
+              g = eii / d( i+1 )
+              e( i+1 ) = cmplx( f, g,KIND=dp)
+              d( i+2 ) = d( i+2 ) - f*eir - g*eii
+              if( d( i+2 )<=zero ) then
+                 info = i + 2
+                 go to 30
+              end if
+              ! solve for e(i+2) and d(i+3).
+              eir = real( e( i+2 ),KIND=dp)
+              eii = aimag( e( i+2 ) )
+              f = eir / d( i+2 )
+              g = eii / d( i+2 )
+              e( i+2 ) = cmplx( f, g,KIND=dp)
+              d( i+3 ) = d( i+3 ) - f*eir - g*eii
+              if( d( i+3 )<=zero ) then
+                 info = i + 3
+                 go to 30
+              end if
+              ! solve for e(i+3) and d(i+4).
+              eir = real( e( i+3 ),KIND=dp)
+              eii = aimag( e( i+3 ) )
+              f = eir / d( i+3 )
+              g = eii / d( i+3 )
+              e( i+3 ) = cmplx( f, g,KIND=dp)
+              d( i+4 ) = d( i+4 ) - f*eir - g*eii
+           end do loop_20
+           ! check d(n) for positive definiteness.
+           if( d( n )<=zero )info = n
+           30 continue
+           return
+     end subroutine stdlib_zpttrf
+
+     !> ZPTTS2: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H *D*U or A = L*D*L**H computed by ZPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_zptts2( iuplo, n, nrhs, d, e, b, ldb )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: iuplo, ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 )call stdlib_zdscal( nrhs, 1._dp / d( 1 ), b, ldb )
+              return
+           end if
+           if( iuplo==1 ) then
+              ! solve a * x = b using the factorization a = u**h *d*u,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 10 continue
+                 ! solve u**h * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                 end do
+                 ! solve d * u * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*e( i )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 10
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve u**h * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*conjg( e( i-1 ) )
+                    end do
+                    ! solve d * u * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) - b( i+1, j )*e( i )
+                    end do
+                 end do
+              end if
+           else
+              ! solve a * x = b using the factorization a = l*d*l**h,
+              ! overwriting each right hand side vector with its solution.
+              if( nrhs<=2 ) then
+                 j = 1
+                 80 continue
+                 ! solve l * x = b.
+                 do i = 2, n
+                    b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                 end do
+                 ! solve d * l**h * x = b.
+                 do i = 1, n
+                    b( i, j ) = b( i, j ) / d( i )
+                 end do
+                 do i = n - 1, 1, -1
+                    b( i, j ) = b( i, j ) - b( i+1, j )*conjg( e( i ) )
+                 end do
+                 if( j<nrhs ) then
+                    j = j + 1
+                    go to 80
+                 end if
+              else
+                 do j = 1, nrhs
+                    ! solve l * x = b.
+                    do i = 2, n
+                       b( i, j ) = b( i, j ) - b( i-1, j )*e( i-1 )
+                    end do
+                    ! solve d * l**h * x = b.
+                    b( n, j ) = b( n, j ) / d( n )
+                    do i = n - 1, 1, -1
+                       b( i, j ) = b( i, j ) / d( i ) -b( i+1, j )*conjg( e( i ) )
+                    end do
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zptts2
+
+     !> ZROT:   applies a plane rotation, where the cos (C) is real and the
+     !> sin (S) is complex, and the vectors CX and CY are complex.
+
+     pure subroutine stdlib_zrot( n, cx, incx, cy, incy, c, s )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(dp), intent(in) :: c
+           complex(dp), intent(in) :: s
+           ! Array Arguments 
+           complex(dp), intent(inout) :: cx(*), cy(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ix, iy
+           complex(dp) :: stemp
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           if( n<=0 )return
+           if( incx==1 .and. incy==1 )go to 20
+           ! code for unequal increments or equal increments not equal to 1
+           ix = 1
+           iy = 1
+           if( incx<0 )ix = ( -n+1 )*incx + 1
+           if( incy<0 )iy = ( -n+1 )*incy + 1
+           do i = 1, n
+              stemp = c*cx( ix ) + s*cy( iy )
+              cy( iy ) = c*cy( iy ) - conjg( s )*cx( ix )
+              cx( ix ) = stemp
+              ix = ix + incx
+              iy = iy + incy
+           end do
+           return
+           ! code for both increments equal to 1
+           20 continue
+           do i = 1, n
+              stemp = c*cx( i ) + s*cy( i )
+              cy( i ) = c*cy( i ) - conjg( s )*cx( i )
+              cx( i ) = stemp
+           end do
+           return
+     end subroutine stdlib_zrot
+
+     !> ZSPMV:  performs the matrix-vector operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_zspmv( uplo, n, alpha, ap, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, n
+           complex(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*), x(*)
+           complex(dp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, k, kk, kx, ky
+           complex(dp) :: temp1, temp2
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 6
+           else if( incy==0 ) then
+              info = 9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when ap contains the upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    k = kk
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do k = kk, kk + j - 2
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*ap( kk+j-1 ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  y  when ap contains the lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*ap( kk )
+                    k = kk + 1
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( i )
+                       k = k + 1
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                    kk = kk + ( n-j+1 )
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*ap( kk )
+                    ix = jx
+                    iy = jy
+                    do k = kk + 1, kk + n - j
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*ap( k )
+                       temp2 = temp2 + ap( k )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                    kk = kk + ( n-j+1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zspmv
+
+     !> ZSPR:    performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix, supplied in packed form.
+
+     pure subroutine stdlib_zspr( uplo, n, alpha, x, incx, ap )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, n
+           complex(dp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, k, kk, kx
+           complex(dp) :: temp
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of the array ap
+           ! are accessed sequentially with cone pass through ap.
+           kk = 1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when upper triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       k = kk
+                       do i = 1, j - 1
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( j )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    kk = kk + j
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do k = kk, kk + j - 2
+                          ap( k ) = ap( k ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                       ap( kk+j-1 ) = ap( kk+j-1 ) + x( jx )*temp
+                    else
+                       ap( kk+j-1 ) = ap( kk+j-1 )
+                    end if
+                    jx = jx + incx
+                    kk = kk + j
+                 end do
+              end if
+           else
+              ! form  a  when lower triangle is stored in ap.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       ap( kk ) = ap( kk ) + temp*x( j )
+                       k = kk + 1
+                       do i = j + 1, n
+                          ap( k ) = ap( k ) + x( i )*temp
+                          k = k + 1
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    kk = kk + n - j + 1
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ap( kk ) = ap( kk ) + temp*x( jx )
+                       ix = jx
+                       do k = kk + 1, kk + n - j
+                          ix = ix + incx
+                          ap( k ) = ap( k ) + x( ix )*temp
+                       end do
+                    else
+                       ap( kk ) = ap( kk )
+                    end if
+                    jx = jx + incx
+                    kk = kk + n - j + 1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zspr
+
+     !> ZSPTRF: computes the factorization of a complex symmetric matrix A
+     !> stored in packed format using the Bunch-Kaufman diagonal pivoting
+     !> method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+
+     pure subroutine stdlib_zsptrf( uplo, n, ap, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kc, kk, knc, kp, kpc, kstep, kx, npp
+           real(dp) :: absakk, alpha, colmax, rowmax
+           complex(dp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRF', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              kc = ( n-1 )*n / 2 + 1
+              10 continue
+              knc = kc
+              ! if k < 1, exit from loop
+              if( k<1 )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc+k-1 ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, ap( kc ), 1 )
+                 colmax = cabs1( ap( kc+imax-1 ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    rowmax = zero
+                    jmax = imax
+                    kx = imax*( imax+1 ) / 2 + imax
+                    do j = imax + 1, k
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + j
+                    end do
+                    kpc = ( imax-1 )*imax / 2 + 1
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, ap( kpc ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-1 ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc+imax-1 ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kstep==2 )knc = knc - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_zswap( kp-1, ap( knc ), 1, ap( kpc ), 1 )
+                    kx = kpc + kp - 1
+                    do j = kp + 1, kk - 1
+                       kx = kx + j - 1
+                       t = ap( knc+j-1 )
+                       ap( knc+j-1 ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc+kk-1 )
+                    ap( knc+kk-1 ) = ap( kpc+kp-1 )
+                    ap( kpc+kp-1 ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+k-2 )
+                       ap( kc+k-2 ) = ap( kc+kp-1 )
+                       ap( kc+kp-1 ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / ap( kc+k-1 )
+                    call stdlib_zspr( uplo, k-1, -r1, ap( kc ), 1, ap )
+                    ! store u(k) in column k
+                    call stdlib_zscal( k-1, r1, ap( kc ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = ap( k-1+( k-1 )*k / 2 )
+                       d22 = ap( k-1+( k-2 )*( k-1 ) / 2 ) / d12
+                       d11 = ap( k+( k-1 )*k / 2 ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*ap( j+( k-2 )*( k-1 ) / 2 )-ap( j+( k-1 )*k / 2 ) )
+                                    
+                          wk = d12*( d22*ap( j+( k-1 )*k / 2 )-ap( j+( k-2 )*( k-1 ) / 2 ) )
+                                    
+                          do i = j, 1, -1
+                             ap( i+( j-1 )*j / 2 ) = ap( i+( j-1 )*j / 2 ) -ap( i+( k-1 )*k / 2 )&
+                                       *wk -ap( i+( k-2 )*( k-1 ) / 2 )*wkm1
+                          end do
+                          ap( j+( k-1 )*k / 2 ) = wk
+                          ap( j+( k-2 )*( k-1 ) / 2 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              kc = knc - k
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              kc = 1
+              npp = n*( n+1 ) / 2
+              60 continue
+              knc = kc
+              ! if k > n, exit from loop
+              if( k>n )go to 110
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( ap( kc ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, ap( kc+1 ), 1 )
+                 colmax = cabs1( ap( kc+imax-k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero ) then
+                 ! column k is zero: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    rowmax = zero
+                    kx = kc + imax - k
+                    do j = k, imax - 1
+                       if( cabs1( ap( kx ) )>rowmax ) then
+                          rowmax = cabs1( ap( kx ) )
+                          jmax = j
+                       end if
+                       kx = kx + n - j
+                    end do
+                    kpc = npp - ( n-imax+1 )*( n-imax+2 ) / 2 + 1
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, ap( kpc+1 ), 1 )
+                       rowmax = max( rowmax, cabs1( ap( kpc+jmax-imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( ap( kpc ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kstep==2 )knc = knc + n - k + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, ap( knc+kp-kk+1 ), 1, ap( kpc+1 ),1 )
+                              
+                    kx = knc + kp - kk
+                    do j = kk + 1, kp - 1
+                       kx = kx + n - j + 1
+                       t = ap( knc+j-kk )
+                       ap( knc+j-kk ) = ap( kx )
+                       ap( kx ) = t
+                    end do
+                    t = ap( knc )
+                    ap( knc ) = ap( kpc )
+                    ap( kpc ) = t
+                    if( kstep==2 ) then
+                       t = ap( kc+1 )
+                       ap( kc+1 ) = ap( kc+kp-k )
+                       ap( kc+kp-k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / ap( kc )
+                       call stdlib_zspr( uplo, n-k, -r1, ap( kc+1 ), 1,ap( kc+n-k+1 ) )
+                       ! store l(k) in column k
+                       call stdlib_zscal( n-k, r1, ap( kc+1 ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = ap( k+1+( k-1 )*( 2*n-k ) / 2 )
+                       d11 = ap( k+1+k*( 2*n-k-1 ) / 2 ) / d21
+                       d22 = ap( k+( k-1 )*( 2*n-k ) / 2 ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*ap( j+( k-1 )*( 2*n-k ) / 2 )-ap( j+k*( 2*n-k-1 ) / 2 ) )
+                                    
+                          wkp1 = d21*( d22*ap( j+k*( 2*n-k-1 ) / 2 )-ap( j+( k-1 )*( 2*n-k ) / 2 )&
+                                     )
+                          do i = j, n
+                             ap( i+( j-1 )*( 2*n-j ) / 2 ) = ap( i+( j-1 )*( 2*n-j ) / 2 ) - ap( &
+                                       i+( k-1 )*( 2*n-k ) /2 )*wk - ap( i+k*( 2*n-k-1 ) / 2 )*wkp1
+                          end do
+                          ap( j+( k-1 )*( 2*n-k ) / 2 ) = wk
+                          ap( j+k*( 2*n-k-1 ) / 2 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              kc = knc + n - k + 2
+              go to 60
+           end if
+           110 continue
+           return
+     end subroutine stdlib_zsptrf
+
+     !> ZSPTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A in packed storage using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSPTRF.
+
+     pure subroutine stdlib_zsptri( uplo, n, ap, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kcnext, kp, kpc, kstep, kx, npp
+           complex(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              kp = n*( n+1 ) / 2
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp - info
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              kp = 1
+              do info = 1, n
+                 if( ipiv( info )>0 .and. ap( kp )==czero )return
+                 kp = kp + n - info + 1
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              kcnext = kc + k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc+k-1 ) = cone / ap( kc+k-1 )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_zdotu( k-1, work, 1, ap( kc ), 1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+k-1 )
+                 ak = ap( kc+k-1 ) / t
+                 akp1 = ap( kcnext+k ) / t
+                 akkp1 = ap( kcnext+k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kc+k-1 ) = akp1 / d
+                 ap( kcnext+k ) = ak / d
+                 ap( kcnext+k-1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, ap( kc ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, k-1, -cone, ap, work, 1, czero, ap( kc ),1 )
+                    ap( kc+k-1 ) = ap( kc+k-1 ) -stdlib_zdotu( k-1, work, 1, ap( kc ), 1 )
+                    ap( kcnext+k-1 ) = ap( kcnext+k-1 ) -stdlib_zdotu( k-1, ap( kc ), 1, ap( &
+                              kcnext ),1 )
+                    call stdlib_zcopy( k-1, ap( kcnext ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, k-1, -cone, ap, work, 1, czero,ap( kcnext ), 1 )
+                              
+                    ap( kcnext+k ) = ap( kcnext+k ) -stdlib_zdotu( k-1, work, 1, ap( kcnext ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext + k + 1
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kpc = ( kp-1 )*kp / 2 + 1
+                 call stdlib_zswap( kp-1, ap( kc ), 1, ap( kpc ), 1 )
+                 kx = kpc + kp - 1
+                 do j = kp + 1, k - 1
+                    kx = kx + j - 1
+                    temp = ap( kc+j-1 )
+                    ap( kc+j-1 ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc+k-1 )
+                 ap( kc+k-1 ) = ap( kpc+kp-1 )
+                 ap( kpc+kp-1 ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc+k+k-1 )
+                    ap( kc+k+k-1 ) = ap( kc+k+kp-1 )
+                    ap( kc+k+kp-1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              kc = kcnext
+              go to 30
+              50 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              npp = n*( n+1 ) / 2
+              k = n
+              kc = npp
+              60 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 80
+              kcnext = kc - ( n-k+2 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 ap( kc ) = cone / ap( kc )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, n-k, -cone, ap( kc+n-k+1 ), work, 1,czero, ap( kc+1 )&
+                              , 1 )
+                    ap( kc ) = ap( kc ) - stdlib_zdotu( n-k, work, 1, ap( kc+1 ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = ap( kcnext+1 )
+                 ak = ap( kcnext ) / t
+                 akp1 = ap( kc ) / t
+                 akkp1 = ap( kcnext+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 ap( kcnext ) = akp1 / d
+                 ap( kc ) = ak / d
+                 ap( kcnext+1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, ap( kc+1 ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kc+1 ), 1 )
+                    ap( kc ) = ap( kc ) - stdlib_zdotu( n-k, work, 1, ap( kc+1 ),1 )
+                    ap( kcnext+1 ) = ap( kcnext+1 ) -stdlib_zdotu( n-k, ap( kc+1 ), 1,ap( kcnext+&
+                              2 ), 1 )
+                    call stdlib_zcopy( n-k, ap( kcnext+2 ), 1, work, 1 )
+                    call stdlib_zspmv( uplo, n-k, -cone, ap( kc+( n-k+1 ) ), work, 1,czero, ap( &
+                              kcnext+2 ), 1 )
+                    ap( kcnext ) = ap( kcnext ) -stdlib_zdotu( n-k, work, 1, ap( kcnext+2 ), 1 )
+                              
+                 end if
+                 kstep = 2
+                 kcnext = kcnext - ( n-k+3 )
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kpc = npp - ( n-kp+1 )*( n-kp+2 ) / 2 + 1
+                 if( kp<n )call stdlib_zswap( n-kp, ap( kc+kp-k+1 ), 1, ap( kpc+1 ), 1 )
+                 kx = kc + kp - k
+                 do j = k + 1, kp - 1
+                    kx = kx + n - j + 1
+                    temp = ap( kc+j-k )
+                    ap( kc+j-k ) = ap( kx )
+                    ap( kx ) = temp
+                 end do
+                 temp = ap( kc )
+                 ap( kc ) = ap( kpc )
+                 ap( kpc ) = temp
+                 if( kstep==2 ) then
+                    temp = ap( kc-n+k-1 )
+                    ap( kc-n+k-1 ) = ap( kc-n+kp-1 )
+                    ap( kc-n+kp-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              kc = kcnext
+              go to 60
+              80 continue
+           end if
+           return
+     end subroutine stdlib_zsptri
+
+     !> ZSPTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A stored in packed format using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSPTRF.
+
+     pure subroutine stdlib_zsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / ap( kc+k-1 ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_zgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_zgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, ap( kc ),1, cone, b( k,&
+                            1 ), ldb )
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,ap( kc+k ), 1, cone, b( &
+                           k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / ap( kc ), b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / akm1k
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( &
+                           kc+1 ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc+1 ),&
+                               1, cone, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, ap( kc-( n-&
+                              k ) ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zsptrs
+
+     !> ZSTEIN: computes the eigenvectors of a real symmetric tridiagonal
+     !> matrix T corresponding to specified eigenvalues, using inverse
+     !> iteration.
+     !> The maximum number of iterations allowed for each eigenvector is
+     !> specified by an internal parameter MAXITS (currently set to 5).
+     !> Although the eigenvectors are real, they are stored in a complex
+     !> array, which may be passed to ZUNMTR or ZUPMTR for back
+     !> transformation to the eigenvectors of a complex Hermitian matrix
+     !> which was reduced to tridiagonal form.
+
+     pure subroutine stdlib_zstein( n, d, e, m, w, iblock, isplit, z, ldz, work,iwork, ifail, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, m, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), isplit(*)
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(in) :: d(*), e(*), w(*)
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(out) :: z(ldz,*)
+       ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: odm3 = 1.0e-3_dp
+           real(dp), parameter :: odm1 = 1.0e-1_dp
+           integer(ilp), parameter :: maxits = 5
+           integer(ilp), parameter :: extra = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: b1, blksiz, bn, gpind, i, iinfo, indrv1, indrv2, indrv3, indrv4, &
+                     indrv5, its, j, j1, jblk, jmax, jr, nblk, nrmchk
+           real(dp) :: dtpcrt, eps, eps1, nrm, onenrm, ortol, pertol, scl, sep, tol, xj, xjm, &
+                     ztr
+           ! Local Arrays 
+           integer(ilp) :: iseed(4)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           do i = 1, m
+              ifail( i ) = 0
+           end do
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -4
+           else if( ldz<max( 1, n ) ) then
+              info = -9
+           else
+              do j = 2, m
+                 if( iblock( j )<iblock( j-1 ) ) then
+                    info = -6
+                    go to 30
+                 end if
+                 if( iblock( j )==iblock( j-1 ) .and. w( j )<w( j-1 ) )then
+                    info = -5
+                    go to 30
+                 end if
+              end do
+              30 continue
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEIN', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) then
+              return
+           else if( n==1 ) then
+              z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           eps = stdlib_dlamch( 'PRECISION' )
+           ! initialize seed for random number generator stdlib_dlarnv.
+           do i = 1, 4
+              iseed( i ) = 1
+           end do
+           ! initialize pointers.
+           indrv1 = 0
+           indrv2 = indrv1 + n
+           indrv3 = indrv2 + n
+           indrv4 = indrv3 + n
+           indrv5 = indrv4 + n
+           ! compute eigenvectors of matrix blocks.
+           j1 = 1
+           loop_180: do nblk = 1, iblock( m )
+              ! find starting and ending indices of block nblk.
+              if( nblk==1 ) then
+                 b1 = 1
+              else
+                 b1 = isplit( nblk-1 ) + 1
+              end if
+              bn = isplit( nblk )
+              blksiz = bn - b1 + 1
+              if( blksiz==1 )go to 60
+              gpind = j1
+              ! compute reorthogonalization criterion and stopping criterion.
+              onenrm = abs( d( b1 ) ) + abs( e( b1 ) )
+              onenrm = max( onenrm, abs( d( bn ) )+abs( e( bn-1 ) ) )
+              do i = b1 + 1, bn - 1
+                 onenrm = max( onenrm, abs( d( i ) )+abs( e( i-1 ) )+abs( e( i ) ) )
+              end do
+              ortol = odm3*onenrm
+              dtpcrt = sqrt( odm1 / blksiz )
+              ! loop through eigenvalues of block nblk.
+              60 continue
+              jblk = 0
+              loop_170: do j = j1, m
+                 if( iblock( j )/=nblk ) then
+                    j1 = j
+                    cycle loop_180
+                 end if
+                 jblk = jblk + 1
+                 xj = w( j )
+                 ! skip all the work if the block size is one.
+                 if( blksiz==1 ) then
+                    work( indrv1+1 ) = one
+                    go to 140
+                 end if
+                 ! if eigenvalues j and j-1 are too close, add a relatively
+                 ! small perturbation.
+                 if( jblk>1 ) then
+                    eps1 = abs( eps*xj )
+                    pertol = ten*eps1
+                    sep = xj - xjm
+                    if( sep<pertol )xj = xjm + pertol
+                 end if
+                 its = 0
+                 nrmchk = 0
+                 ! get random starting vector.
+                 call stdlib_dlarnv( 2, iseed, blksiz, work( indrv1+1 ) )
+                 ! copy the matrix t so it won't be destroyed in factorization.
+                 call stdlib_dcopy( blksiz, d( b1 ), 1, work( indrv4+1 ), 1 )
+                 call stdlib_dcopy( blksiz-1, e( b1 ), 1, work( indrv2+2 ), 1 )
+                 call stdlib_dcopy( blksiz-1, e( b1 ), 1, work( indrv3+1 ), 1 )
+                 ! compute lu factors with partial pivoting  ( pt = lu )
+                 tol = zero
+                 call stdlib_dlagtf( blksiz, work( indrv4+1 ), xj, work( indrv2+2 ),work( indrv3+&
+                           1 ), tol, work( indrv5+1 ), iwork,iinfo )
+                 ! update iteration count.
+                 70 continue
+                 its = its + 1
+                 if( its>maxits )go to 120
+                 ! normalize and scale the righthand side vector pb.
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 scl = blksiz*onenrm*max( eps,abs( work( indrv4+blksiz ) ) ) /abs( work( indrv1+&
+                           jmax ) )
+                 call stdlib_dscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 ! solve the system lu = pb.
+                 call stdlib_dlagts( -1, blksiz, work( indrv4+1 ), work( indrv2+2 ),work( indrv3+&
+                           1 ), work( indrv5+1 ), iwork,work( indrv1+1 ), tol, iinfo )
+                 ! reorthogonalize by modified gram-schmidt if eigenvalues are
+                 ! close enough.
+                 if( jblk==1 )go to 110
+                 if( abs( xj-xjm )>ortol )gpind = j
+                 if( gpind/=j ) then
+                    do i = gpind, j - 1
+                       ztr = zero
+                       do jr = 1, blksiz
+                          ztr = ztr + work( indrv1+jr )*real( z( b1-1+jr, i ),KIND=dp)
+                       end do
+                       do jr = 1, blksiz
+                          work( indrv1+jr ) = work( indrv1+jr ) -ztr*real( z( b1-1+jr, i ),&
+                                    KIND=dp)
+                       end do
+                    end do
+                 end if
+                 ! check the infinity norm of the iterate.
+                 110 continue
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 nrm = abs( work( indrv1+jmax ) )
+                 ! continue for additional iterations after norm reaches
+                 ! stopping criterion.
+                 if( nrm<dtpcrt )go to 70
+                 nrmchk = nrmchk + 1
+                 if( nrmchk<extra+1 )go to 70
+                 go to 130
+                 ! if stopping criterion was not satisfied, update info and
+                 ! store eigenvector number in array ifail.
+                 120 continue
+                 info = info + 1
+                 ifail( info ) = j
+                 ! accept iterate as jth eigenvector.
+                 130 continue
+                 scl = one / stdlib_dnrm2( blksiz, work( indrv1+1 ), 1 )
+                 jmax = stdlib_idamax( blksiz, work( indrv1+1 ), 1 )
+                 if( work( indrv1+jmax )<zero )scl = -scl
+                 call stdlib_dscal( blksiz, scl, work( indrv1+1 ), 1 )
+                 140 continue
+                 do i = 1, n
+                    z( i, j ) = czero
+                 end do
+                 do i = 1, blksiz
+                    z( b1+i-1, j ) = cmplx( work( indrv1+i ), zero,KIND=dp)
+                 end do
+                 ! save the shift to check eigenvalue spacing at next
+                 ! iteration.
+                 xjm = xj
+              end do loop_170
+           end do loop_180
+           return
+     end subroutine stdlib_zstein
+
+     !> ZSTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the implicit QL or QR method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+
+     pure subroutine stdlib_zsteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 30
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, ii, iscale, j, jtot, k, l, l1, lend, lendm1, lendp1, lendsv,&
+                      lm1, lsv, m, mm, mm1, nm1, nmaxit
+           real(dp) :: anorm, b, c, eps, eps2, f, g, p, r, rt1, rt2, s, safmax, safmin, ssfmax, &
+                     ssfmin, tst
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz==2 )z( 1, 1 ) = cone
+              return
+           end if
+           ! determine the unit roundoff and over/underflow thresholds.
+           eps = stdlib_dlamch( 'E' )
+           eps2 = eps**2
+           safmin = stdlib_dlamch( 'S' )
+           safmax = one / safmin
+           ssfmax = sqrt( safmax ) / three
+           ssfmin = sqrt( safmin ) / eps2
+           ! compute the eigenvalues and eigenvectors of the tridiagonal
+           ! matrix.
+           if( icompz==2 )call stdlib_zlaset( 'FULL', n, n, czero, cone, z, ldz )
+           nmaxit = n*maxit
+           jtot = 0
+           ! determine where the matrix splits and choose ql or qr iteration
+           ! for each block, according to whether top or bottom diagonal
+           ! element is smaller.
+           l1 = 1
+           nm1 = n - 1
+           10 continue
+           if( l1>n )go to 160
+           if( l1>1 )e( l1-1 ) = zero
+           if( l1<=nm1 ) then
+              do m = l1, nm1
+                 tst = abs( e( m ) )
+                 if( tst==zero )go to 30
+                 if( tst<=( sqrt( abs( d( m ) ) )*sqrt( abs( d( m+1 ) ) ) )*eps ) then
+                    e( m ) = zero
+                    go to 30
+                 end if
+              end do
+           end if
+           m = n
+           30 continue
+           l = l1
+           lsv = l
+           lend = m
+           lendsv = lend
+           l1 = m + 1
+           if( lend==l )go to 10
+           ! scale submatrix in rows and columns l to lend
+           anorm = stdlib_dlanst( 'I', lend-l+1, d( l ), e( l ) )
+           iscale = 0
+           if( anorm==zero )go to 10
+           if( anorm>ssfmax ) then
+              iscale = 1
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmax, lend-l, 1, e( l ), n,info )
+           else if( anorm<ssfmin ) then
+              iscale = 2
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l+1, 1, d( l ), n,info )
+              call stdlib_dlascl( 'G', 0, 0, anorm, ssfmin, lend-l, 1, e( l ), n,info )
+           end if
+           ! choose between ql and qr iteration
+           if( abs( d( lend ) )<abs( d( l ) ) ) then
+              lend = lsv
+              l = lendsv
+           end if
+           if( lend>l ) then
+              ! ql iteration
+              ! look for small subdiagonal element.
+              40 continue
+              if( l/=lend ) then
+                 lendm1 = lend - 1
+                 do m = l, lendm1
+                    tst = abs( e( m ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m+1 ) )+safmin )go to 60
+                 end do
+              end if
+              m = lend
+              60 continue
+              if( m<lend )e( m ) = zero
+              p = d( l )
+              if( m==l )go to 80
+              ! if remaining matrix is 2-by-2, use stdlib_dlae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l+1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_dlaev2( d( l ), e( l ), d( l+1 ), rt1, rt2, c, s )
+                    work( l ) = c
+                    work( n-1+l ) = s
+                    call stdlib_zlasr( 'R', 'V', 'B', n, 2, work( l ),work( n-1+l ), z( 1, l ), &
+                              ldz )
+                 else
+                    call stdlib_dlae2( d( l ), e( l ), d( l+1 ), rt1, rt2 )
+                 end if
+                 d( l ) = rt1
+                 d( l+1 ) = rt2
+                 e( l ) = zero
+                 l = l + 2
+                 if( l<=lend )go to 40
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l+1 )-p ) / ( two*e( l ) )
+              r = stdlib_dlapy2( g, one )
+              g = d( m ) - p + ( e( l ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              mm1 = m - 1
+              do i = mm1, l, -1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_dlartg( g, f, c, s, r )
+                 if( i/=m-1 )e( i+1 ) = r
+                 g = d( i+1 ) - p
+                 r = ( d( i )-g )*s + two*c*b
+                 p = s*r
+                 d( i+1 ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = -s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = m - l + 1
+                 call stdlib_zlasr( 'R', 'V', 'B', n, mm, work( l ), work( n-1+l ),z( 1, l ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( l ) = g
+              go to 40
+              ! eigenvalue found.
+              80 continue
+              d( l ) = p
+              l = l + 1
+              if( l<=lend )go to 40
+              go to 140
+           else
+              ! qr iteration
+              ! look for small superdiagonal element.
+              90 continue
+              if( l/=lend ) then
+                 lendp1 = lend + 1
+                 do m = l, lendp1, -1
+                    tst = abs( e( m-1 ) )**2
+                    if( tst<=( eps2*abs( d( m ) ) )*abs( d( m-1 ) )+safmin )go to 110
+                 end do
+              end if
+              m = lend
+              110 continue
+              if( m>lend )e( m-1 ) = zero
+              p = d( l )
+              if( m==l )go to 130
+              ! if remaining matrix is 2-by-2, use stdlib_dlae2 or stdlib_slaev2
+              ! to compute its eigensystem.
+              if( m==l-1 ) then
+                 if( icompz>0 ) then
+                    call stdlib_dlaev2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2, c, s )
+                    work( m ) = c
+                    work( n-1+m ) = s
+                    call stdlib_zlasr( 'R', 'V', 'F', n, 2, work( m ),work( n-1+m ), z( 1, l-1 ), &
+                              ldz )
+                 else
+                    call stdlib_dlae2( d( l-1 ), e( l-1 ), d( l ), rt1, rt2 )
+                 end if
+                 d( l-1 ) = rt1
+                 d( l ) = rt2
+                 e( l-1 ) = zero
+                 l = l - 2
+                 if( l>=lend )go to 90
+                 go to 140
+              end if
+              if( jtot==nmaxit )go to 140
+              jtot = jtot + 1
+              ! form shift.
+              g = ( d( l-1 )-p ) / ( two*e( l-1 ) )
+              r = stdlib_dlapy2( g, one )
+              g = d( m ) - p + ( e( l-1 ) / ( g+sign( r, g ) ) )
+              s = one
+              c = one
+              p = zero
+              ! inner loop
+              lm1 = l - 1
+              do i = m, lm1
+                 f = s*e( i )
+                 b = c*e( i )
+                 call stdlib_dlartg( g, f, c, s, r )
+                 if( i/=m )e( i-1 ) = r
+                 g = d( i ) - p
+                 r = ( d( i+1 )-g )*s + two*c*b
+                 p = s*r
+                 d( i ) = g + p
+                 g = c*r - b
+                 ! if eigenvectors are desired, then save rotations.
+                 if( icompz>0 ) then
+                    work( i ) = c
+                    work( n-1+i ) = s
+                 end if
+              end do
+              ! if eigenvectors are desired, then apply saved rotations.
+              if( icompz>0 ) then
+                 mm = l - m + 1
+                 call stdlib_zlasr( 'R', 'V', 'F', n, mm, work( m ), work( n-1+m ),z( 1, m ), ldz &
+                           )
+              end if
+              d( l ) = d( l ) - p
+              e( lm1 ) = g
+              go to 90
+              ! eigenvalue found.
+              130 continue
+              d( l ) = p
+              l = l - 1
+              if( l>=lend )go to 90
+              go to 140
+           end if
+           ! undo scaling if necessary
+           140 continue
+           if( iscale==1 ) then
+              call stdlib_dlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_dlascl( 'G', 0, 0, ssfmax, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           else if( iscale==2 ) then
+              call stdlib_dlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv+1, 1,d( lsv ), n, info )
+                        
+              call stdlib_dlascl( 'G', 0, 0, ssfmin, anorm, lendsv-lsv, 1, e( lsv ),n, info )
+                        
+           end if
+           ! check for no convergence to an eigenvalue after a total
+           ! of n*maxit iterations.
+           if( jtot==nmaxit ) then
+              do i = 1, n - 1
+                 if( e( i )/=zero )info = info + 1
+              end do
+              return
+           end if
+           go to 10
+           ! order eigenvalues and eigenvectors.
+           160 continue
+           if( icompz==0 ) then
+              ! use quick sort
+              call stdlib_dlasrt( 'I', n, d, info )
+           else
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                 i = ii - 1
+                 k = i
+                 p = d( i )
+                 do j = ii, n
+                    if( d( j )<p ) then
+                       k = j
+                       p = d( j )
+                    end if
+                 end do
+                 if( k/=i ) then
+                    d( k ) = d( i )
+                    d( i ) = p
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zsteqr
+
+     !> ZSYCONV: converts A given by ZHETRF into L and D or vice-versa.
+     !> Get nondiagonal elements of D (returned in workspace) and
+     !> apply or reverse permutation done in TRF.
+
+     pure subroutine stdlib_zsyconv( uplo, way, n, a, lda, ipiv, e, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, j
+           complex(dp) :: temp
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONV', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 i=n
+                 e(1)=czero
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       e(i)=a(i-1,i)
+                       e(i-1)=czero
+                       a(i-1,i)=czero
+                       i=i-1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i-1
+                 end do
+                 ! convert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                          do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i,j)
+                            a(i,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       if( i < n) then
+                          do j= i+1,n
+                             temp=a(ip,j)
+                             a(ip,j)=a(i-1,j)
+                             a(i-1,j)=temp
+                          end do
+                       endif
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if( i < n) then
+                       do j= i+1,n
+                         temp=a(ip,j)
+                         a(ip,j)=a(i,j)
+                         a(i,j)=temp
+                       end do
+                       endif
+                    else
+                      ip=-ipiv(i)
+                      i=i+1
+                      if( i < n) then
+                         do j= i+1,n
+                            temp=a(ip,j)
+                            a(ip,j)=a(i-1,j)
+                            a(i-1,j)=temp
+                         end do
+                      endif
+                    endif
+                    i=i+1
+                 end do
+                 ! revert value
+                 i=n
+                 do while ( i > 1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i-1,i)=e(i)
+                       i=i-1
+                    endif
+                    i=i-1
+                 end do
+              end if
+           else
+              ! a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 i=1
+                 e(n)=czero
+                 do while ( i <= n )
+                    if( i<n .and. ipiv(i) < 0 ) then
+                       e(i)=a(i+1,i)
+                       e(i+1)=czero
+                       a(i+1,i)=czero
+                       i=i+1
+                    else
+                       e(i)=czero
+                    endif
+                    i=i+1
+                 end do
+                 ! convert permutations
+                 i=1
+                 do while ( i <= n )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(ip,j)
+                             a(ip,j)=a(i,j)
+                             a(i,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(ip,j)
+                             a(ip,j)=a(i+1,j)
+                             a(i+1,j)=temp
+                          end do
+                       endif
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 i=n
+                 do while ( i >= 1 )
+                    if( ipiv(i) > 0 ) then
+                       ip=ipiv(i)
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i,j)
+                             a(i,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    else
+                       ip=-ipiv(i)
+                       i=i-1
+                       if (i > 1) then
+                          do j= 1,i-1
+                             temp=a(i+1,j)
+                             a(i+1,j)=a(ip,j)
+                             a(ip,j)=temp
+                          end do
+                       endif
+                    endif
+                    i=i-1
+                 end do
+                 ! revert value
+                 i=1
+                 do while ( i <= n-1 )
+                    if( ipiv(i) < 0 ) then
+                       a(i+1,i)=e(i)
+                       i=i+1
+                    endif
+                    i=i+1
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zsyconv
+
+     !> If parameter WAY = 'C':
+     !> ZSYCONVF: converts the factorization output format used in
+     !> ZSYTRF provided on entry in parameter A into the factorization
+     !> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
+     !> on exit in parameters A and E. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in ZSYTRF into
+     !> the format used in ZSYTRF_RK (or ZSYTRF_BK).
+     !> If parameter WAY = 'R':
+     !> ZSYCONVF performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in ZSYTRF that is stored
+     !> on exit in parameter A. It also converts in place details of
+     !> the intechanges stored in IPIV from the format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) into the format used in ZSYTRF.
+     !> ZSYCONVF can also convert in Hermitian matrix case, i.e. between
+     !> formats used in ZHETRF and ZHETRF_RK (or ZHETRF_BK).
+
+     pure subroutine stdlib_zsyconvf( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONVF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_zswap( n-i, a( i-1, i+1 ), lda,a( ip, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i) in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       if( i<n ) then
+                          if( ip/=(i-1) ) then
+                             call stdlib_zswap( n-i, a( ip, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i-1 and ipiv(i-1),
+                       ! so this should be recorded in two consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i-1 )
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where k increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_zswap( i-1, a( i+1, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is no interchnge of rows i and and ipiv(i),
+                       ! so this should be reflected in ipiv format for
+                       ! *sytrf_rk ( or *sytrf_bk)
+                       ipiv( i ) = i
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations and ipiv
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i) in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=(i+1) ) then
+                             call stdlib_zswap( i-1, a( ip, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                       end if
+                       ! convert ipiv
+                       ! there is cone interchange of rows i+1 and ipiv(i+1),
+                       ! so this should be recorded in consecutive entries
+                       ! in ipiv format for *sytrf
+                       ipiv( i ) = ipiv( i+1 )
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_zsyconvf
+
+     !> If parameter WAY = 'C':
+     !> ZSYCONVF_ROOK: converts the factorization output format used in
+     !> ZSYTRF_ROOK provided on entry in parameter A into the factorization
+     !> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
+     !> on exit in parameters A and E. IPIV format for ZSYTRF_ROOK and
+     !> ZSYTRF_RK (or ZSYTRF_BK) is the same and is not converted.
+     !> If parameter WAY = 'R':
+     !> ZSYCONVF_ROOK performs the conversion in reverse direction, i.e.
+     !> converts the factorization output format used in ZSYTRF_RK
+     !> (or ZSYTRF_BK) provided on entry in parameters A and E into
+     !> the factorization output format used in ZSYTRF_ROOK that is stored
+     !> on exit in parameter A. IPIV format for ZSYTRF_ROOK and
+     !> ZSYTRF_RK (or ZSYTRF_BK) is the same and is not converted.
+     !> ZSYCONVF_ROOK can also convert in Hermitian matrix case, i.e. between
+     !> formats used in ZHETRF_ROOK and ZHETRF_RK (or ZHETRF_BK).
+
+     pure subroutine stdlib_zsyconvf_rook( uplo, way, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, way
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), e(*)
+        ! =====================================================================
+           
+           ! External Subroutines 
+           logical(lk) :: upper, convert
+           integer(ilp) :: i, ip, ip2
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           convert = stdlib_lsame( way, 'C' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.convert .and. .not.stdlib_lsame( way, 'R' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCONVF_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! begin a is upper
+              if ( convert ) then
+                 ! convert a (a is upper)
+                 ! convert value
+                 ! assign superdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = n
+                 e( 1 ) = czero
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       e( i ) = a( i-1, i )
+                       e( i-1 ) = czero
+                       a( i-1, i ) = czero
+                       i = i - 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i - 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i-1 and ipiv(i-1)
+                       ! in a(1:i,n-i:n)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( i, i+1 ), lda,a( ip, i+1 ), lda )
+                          end if
+                          if( ip2/=(i-1) ) then
+                             call stdlib_zswap( n-i, a( i-1, i+1 ), lda,a( ip2, i+1 ), lda )
+                                       
+                          end if
+                       end if
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              else
+                 ! revert a (a is upper)
+                 ! revert permutations
+                 ! apply permutations to submatrices of upper part of a
+                 ! in reverse factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(1:i,n-i:n)
+                       ip = ipiv( i )
+                       if( i<n ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i-1 and ipiv(i-1) and i and ipiv(i)
+                       ! in a(1:i,n-i:n)
+                       i = i + 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i-1 )
+                       if( i<n ) then
+                          if( ip2/=(i-1) ) then
+                             call stdlib_zswap( n-i, a( ip2, i+1 ), lda,a( i-1, i+1 ), lda )
+                                       
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_zswap( n-i, a( ip, i+1 ), lda,a( i, i+1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i + 1
+                 end do
+                 ! revert value
+                 ! assign superdiagonal entries of d from array e to
+                 ! superdiagonal entries of a.
+                 i = n
+                 do while ( i>1 )
+                    if( ipiv( i )<0 ) then
+                       a( i-1, i ) = e( i )
+                       i = i - 1
+                    end if
+                    i = i - 1
+                 end do
+              ! end a is upper
+              end if
+           else
+              ! begin a is lower
+              if ( convert ) then
+                 ! convert a (a is lower)
+                 ! convert value
+                 ! assign subdiagonal entries of d to array e and czero out
+                 ! corresponding entries in input storage a
+                 i = 1
+                 e( n ) = czero
+                 do while ( i<=n )
+                    if( i<n .and. ipiv(i)<0 ) then
+                       e( i ) = a( i+1, i )
+                       e( i+1 ) = czero
+                       a( i+1, i ) = czero
+                       i = i + 1
+                    else
+                       e( i ) = czero
+                    end if
+                    i = i + 1
+                 end do
+                 ! convert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in factorization order where i increases from 1 to n
+                 i = 1
+                 do while ( i<=n )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i and ipiv(i) and i+1 and ipiv(i+1)
+                       ! in a(i:n,1:i-1)
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                          end if
+                          if( ip2/=(i+1) ) then
+                             call stdlib_zswap( i-1, a( i+1, 1 ), lda,a( ip2, 1 ), lda )
+                          end if
+                       end if
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              else
+                 ! revert a (a is lower)
+                 ! revert permutations
+                 ! apply permutations to submatrices of lower part of a
+                 ! in reverse factorization order where i decreases from n to 1
+                 i = n
+                 do while ( i>=1 )
+                    if( ipiv( i )>0 ) then
+                       ! 1-by-1 pivot interchange
+                       ! swap rows i and ipiv(i) in a(i:n,1:i-1)
+                       ip = ipiv( i )
+                       if ( i>1 ) then
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    else
+                       ! 2-by-2 pivot interchange
+                       ! swap rows i+1 and ipiv(i+1) and i and ipiv(i)
+                       ! in a(i:n,1:i-1)
+                       i = i - 1
+                       ip = -ipiv( i )
+                       ip2 = -ipiv( i+1 )
+                       if ( i>1 ) then
+                          if( ip2/=(i+1) ) then
+                             call stdlib_zswap( i-1, a( ip2, 1 ), lda,a( i+1, 1 ), lda )
+                          end if
+                          if( ip/=i ) then
+                             call stdlib_zswap( i-1, a( ip, 1 ), lda,a( i, 1 ), lda )
+                          end if
+                       end if
+                    end if
+                    i = i - 1
+                 end do
+                 ! revert value
+                 ! assign subdiagonal entries of d from array e to
+                 ! subgiagonal entries of a.
+                 i = 1
+                 do while ( i<=n-1 )
+                    if( ipiv( i )<0 ) then
+                       a( i + 1, i ) = e( i )
+                       i = i + 1
+                    end if
+                    i = i + 1
+                 end do
+              end if
+              ! end a is lower
+           end if
+           return
+     end subroutine stdlib_zsyconvf_rook
+
+     !> ZSYEQUB: computes row and column scalings intended to equilibrate a
+     !> symmetric matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_zsyequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(dp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'ZSYEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_dp / s( j )
+           end do
+           tol = one / sqrt( 2.0_dp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + s( i ) * real( work( i ),KIND=dp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_zlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( real( work( i ),KIND=dp) - t*si )
+                 c0 = -(t*si)*si + 2 * real( work( i ),KIND=dp) * si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + real( work( i ),KIND=dp) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_dlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_dlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_zsyequb
+
+     !> ZSYMV:  performs the matrix-vector  operation
+     !> y := alpha*A*x + beta*y,
+     !> where alpha and beta are scalars, x and y are n element vectors and
+     !> A is an n by n symmetric matrix.
+
+     pure subroutine stdlib_zsymv( uplo, n, alpha, a, lda, x, incx, beta, y, incy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, incy, lda, n
+           complex(dp), intent(in) :: alpha, beta
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), x(*)
+           complex(dp), intent(inout) :: y(*)
+       ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, iy, j, jx, jy, kx, ky
+           complex(dp) :: temp1, temp2
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( lda<max( 1, n ) ) then
+              info = 5
+           else if( incx==0 ) then
+              info = 7
+           else if( incy==0 ) then
+              info = 10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYMV ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( ( alpha==czero ) .and. ( beta==cone ) ) )return
+           ! set up the start points in  x  and  y.
+           if( incx>0 ) then
+              kx = 1
+           else
+              kx = 1 - ( n-1 )*incx
+           end if
+           if( incy>0 ) then
+              ky = 1
+           else
+              ky = 1 - ( n-1 )*incy
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           ! first form  y := beta*y.
+           if( beta/=cone ) then
+              if( incy==1 ) then
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( i ) = czero
+                    end do
+                 else
+                    do i = 1, n
+                       y( i ) = beta*y( i )
+                    end do
+                 end if
+              else
+                 iy = ky
+                 if( beta==czero ) then
+                    do i = 1, n
+                       y( iy ) = czero
+                       iy = iy + incy
+                    end do
+                 else
+                    do i = 1, n
+                       y( iy ) = beta*y( iy )
+                       iy = iy + incy
+                    end do
+                 end if
+              end if
+           end if
+           if( alpha==czero )return
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  y  when a is stored in upper triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    do i = 1, j - 1
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + temp1*a( j, j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    ix = kx
+                    iy = ky
+                    do i = 1, j - 1
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                       ix = ix + incx
+                       iy = iy + incy
+                    end do
+                    y( jy ) = y( jy ) + temp1*a( j, j ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           else
+              ! form  y  when a is stored in lower triangle.
+              if( ( incx==1 ) .and. ( incy==1 ) ) then
+                 do j = 1, n
+                    temp1 = alpha*x( j )
+                    temp2 = czero
+                    y( j ) = y( j ) + temp1*a( j, j )
+                    do i = j + 1, n
+                       y( i ) = y( i ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( i )
+                    end do
+                    y( j ) = y( j ) + alpha*temp2
+                 end do
+              else
+                 jx = kx
+                 jy = ky
+                 do j = 1, n
+                    temp1 = alpha*x( jx )
+                    temp2 = czero
+                    y( jy ) = y( jy ) + temp1*a( j, j )
+                    ix = jx
+                    iy = jy
+                    do i = j + 1, n
+                       ix = ix + incx
+                       iy = iy + incy
+                       y( iy ) = y( iy ) + temp1*a( i, j )
+                       temp2 = temp2 + a( i, j )*x( ix )
+                    end do
+                    y( jy ) = y( jy ) + alpha*temp2
+                    jx = jx + incx
+                    jy = jy + incy
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zsymv
+
+     !> ZSYR:   performs the symmetric rank 1 operation
+     !> A := alpha*x*x**H + A,
+     !> where alpha is a complex scalar, x is an n element vector and A is an
+     !> n by n symmetric matrix.
+
+     pure subroutine stdlib_zsyr( uplo, n, alpha, x, incx, a, lda )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: incx, lda, n
+           complex(dp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: x(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, ix, j, jx, kx
+           complex(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = 1
+           else if( n<0 ) then
+              info = 2
+           else if( incx==0 ) then
+              info = 5
+           else if( lda<max( 1, n ) ) then
+              info = 7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYR  ', info )
+              return
+           end if
+           ! quick return if possible.
+           if( ( n==0 ) .or. ( alpha==czero ) )return
+           ! set the start point in x if the increment is not unity.
+           if( incx<=0 ) then
+              kx = 1 - ( n-1 )*incx
+           else if( incx/=1 ) then
+              kx = 1
+           end if
+           ! start the operations. in this version the elements of a are
+           ! accessed sequentially with cone pass through the triangular part
+           ! of a.
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! form  a  when a is stored in upper triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = kx
+                       do i = 1, j
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           else
+              ! form  a  when a is stored in lower triangle.
+              if( incx==1 ) then
+                 do j = 1, n
+                    if( x( j )/=czero ) then
+                       temp = alpha*x( j )
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( i )*temp
+                       end do
+                    end if
+                 end do
+              else
+                 jx = kx
+                 do j = 1, n
+                    if( x( jx )/=czero ) then
+                       temp = alpha*x( jx )
+                       ix = jx
+                       do i = j, n
+                          a( i, j ) = a( i, j ) + x( ix )*temp
+                          ix = ix + incx
+                       end do
+                    end if
+                    jx = jx + incx
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zsyr
+
+     !> ZSYSWAPR: applies an elementary permutation on the rows and the columns of
+     !> a symmetric matrix.
+
+     pure subroutine stdlib_zsyswapr( uplo, n, a, lda, i1, i2)
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: i1, i2, lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,n)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(dp) :: tmp
+           ! Executable Statements 
+           upper = stdlib_lsame( uplo, 'U' )
+           if (upper) then
+               ! upper
+               ! first swap
+                ! - swap column i1 and i2 from i1 to i1-1
+              call stdlib_zswap( i1-1, a(1,i1), 1, a(1,i2), 1 )
+                ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap row i1 from i1+1 to i2-1 with col i2 from i1+1 to i2-1
+              tmp=a(i1,i1)
+              a(i1,i1)=a(i2,i2)
+              a(i2,i2)=tmp
+              do i=1,i2-i1-1
+                 tmp=a(i1,i1+i)
+                 a(i1,i1+i)=a(i1+i,i2)
+                 a(i1+i,i2)=tmp
+              end do
+                ! third swap
+                ! - swap row i1 and i2 from i2+1 to n
+              do i=i2+1,n
+                 tmp=a(i1,i)
+                 a(i1,i)=a(i2,i)
+                 a(i2,i)=tmp
+              end do
+             else
+               ! lower
+               ! first swap
+                ! - swap row i1 and i2 from i1 to i1-1
+              call stdlib_zswap( i1-1, a(i1,1), lda, a(i2,1), lda )
+               ! second swap :
+                ! - swap a(i1,i1) and a(i2,i2)
+                ! - swap col i1 from i1+1 to i2-1 with row i2 from i1+1 to i2-1
+               tmp=a(i1,i1)
+               a(i1,i1)=a(i2,i2)
+               a(i2,i2)=tmp
+               do i=1,i2-i1-1
+                  tmp=a(i1+i,i1)
+                  a(i1+i,i1)=a(i2,i1+i)
+                  a(i2,i1+i)=tmp
+               end do
+               ! third swap
+                ! - swap col i1 and i2 from i2+1 to n
+               do i=i2+1,n
+                  tmp=a(i,i1)
+                  a(i,i1)=a(i,i2)
+                  a(i,i2)=tmp
+               end do
+           endif
+     end subroutine stdlib_zsyswapr
+
+     !> ZSYTF2: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zsytf2( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, imax, j, jmax, k, kk, kp, kstep
+           real(dp) :: absakk, alpha, colmax, rowmax
+           complex(dp) :: d11, d12, d21, d22, r1, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax>1 ) then
+                       jmax = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k-1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    call stdlib_zswap( kk-kp-1, a( kp+1, kk ), 1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                    ! a := a - u(k)*d(k)*u(k)**t = a - w(k)*1/d(k)*w(k)**t
+                    r1 = cone / a( k, k )
+                    call stdlib_zsyr( uplo, k-1, -r1, a( 1, k ), 1, a, lda )
+                    ! store u(k) in column k
+                    call stdlib_zscal( k-1, r1, a( 1, k ), 1 )
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( w(k-1) w(k) )*inv(d(k))*( w(k-1) w(k) )**t
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       d12 = t / d12
+                       do j = k - 2, 1, -1
+                          wkm1 = d12*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = d12*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k-1 )*wkm1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k-1 ) = wkm1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( max( absakk, colmax )==zero .or. stdlib_disnan(absakk) ) then
+                 ! column k is zero or underflow, or contains a nan:
+                 ! set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 if( absakk>=alpha*colmax ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    ! jmax is the column-index of the largest off-diagonal
+                    ! element in row imax, and rowmax is its absolute value
+                    jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                    rowmax = cabs1( a( imax, jmax ) )
+                    if( imax<n ) then
+                       jmax = imax + stdlib_izamax( n-imax, a( imax+1, imax ), 1 )
+                       rowmax = max( rowmax, cabs1( a( jmax, imax ) ) )
+                    end if
+                    if( absakk>=alpha*colmax*( colmax / rowmax ) ) then
+                       ! no interchange, use 1-by-1 pivot block
+                       kp = k
+                    else if( cabs1( a( imax, imax ) )>=alpha*rowmax ) then
+                       ! interchange rows and columns k and imax, use 1-by-1
+                       ! pivot block
+                       kp = imax
+                    else
+                       ! interchange rows and columns k+1 and imax, use 2-by-2
+                       ! pivot block
+                       kp = imax
+                       kstep = 2
+                    end if
+                 end if
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    call stdlib_zswap( kp-kk-1, a( kk+1, kk ), 1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                       ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                       ! a := a - l(k)*d(k)*l(k)**t = a - w(k)*(1/d(k))*w(k)**t
+                       r1 = cone / a( k, k )
+                       call stdlib_zsyr( uplo, n-k, -r1, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                 
+                       ! store l(k) in column k
+                       call stdlib_zscal( n-k, r1, a( k+1, k ), 1 )
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k)
+                    if( k<n-1 ) then
+                       ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                       ! a := a - ( l(k) l(k+1) )*d(k)*( l(k) l(k+1) )**t
+                          ! = a - ( w(k) w(k+1) )*inv(d(k))*( w(k) w(k+1) )**t
+                       ! where l(k) and l(k+1) are the k-th and (k+1)-th
+                       ! columns of l
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       d21 = t / d21
+                       do j = k + 2, n
+                          wk = d21*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = d21*( d22*a( j, k+1 )-a( j, k ) )
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - a( i, k )*wk -a( i, k+1 )*wkp1
+                          end do
+                          a( j, k ) = wk
+                          a( j, k+1 ) = wkp1
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -kp
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_zsytf2
+
+     !> ZSYTF2_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_zsytf2_rk( uplo, n, a, lda, e, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(dp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2_RK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! initialize the first entry of array e, where superdiagonal
+              ! elements of d are stored
+              e( 1 ) = czero
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 34
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k>1 )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_zswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_zswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_zswap( n-k, a( k, k+1 ), lda, a( p, k+1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_zswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert upper triangle of a into u form by applying
+                    ! the interchanges in columns k+1:n.
+                    if( k<n )call stdlib_zswap( n-k, a( kk, k+1 ), lda, a( kp, k+1 ),lda )
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_zsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_zscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                       ! store the superdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                    ! copy superdiagonal elements of d(k) to e(k) and
+                    ! zero out superdiagonal entry of a
+                    e( k ) = a( k-1, k )
+                    e( k-1 ) = czero
+                    a( k-1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+              34 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! initialize the unused last entry of the subdiagonal array e.
+              e( n ) = czero
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 64
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+                 ! set e( k ) to zero
+                 if( k<n )e( k ) = czero
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! abs( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1( a( imax, imax ) )<alpha*rowmax ))then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_zswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_zswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_zswap( k-1, a( k, 1 ), lda, a( p, 1 ), lda )
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_zswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                    ! convert lower triangle of a into l form by applying
+                    ! the interchanges in columns 1:k-1.
+                    if ( k>1 )call stdlib_zswap( k-1, a( kk, 1 ), lda, a( kp, 1 ), lda )
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_zsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_zscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                       ! store the subdiagonal element of d in array e
+                       e( k ) = czero
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                    ! copy subdiagonal elements of d(k) to e(k) and
+                    ! zero out subdiagonal entry of a
+                    e( k ) = a( k+1, k )
+                    e( k+1 ) = czero
+                    a( k+1, k ) = czero
+                 end if
+                 ! end column k is nonsingular
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+              64 continue
+           end if
+           return
+     end subroutine stdlib_zsytf2_rk
+
+     !> ZSYTF2_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method:
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, U**T is the transpose of U, and D is symmetric and
+     !> block diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the unblocked version of the algorithm, calling Level 2 BLAS.
+
+     pure subroutine stdlib_zsytf2_rook( uplo, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: sevten = 17.0e+0_dp
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper, done
+           integer(ilp) :: i, imax, j, jmax, itemp, k, kk, kp, kstep, p, ii
+           real(dp) :: absakk, alpha, colmax, rowmax, dtemp, sfmin
+           complex(dp) :: d11, d12, d21, d22, t, wk, wkm1, wkp1, z
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt,aimag,real
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( z ) = abs( real( z,KIND=dp) ) + abs( aimag( z ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTF2_ROOK', -info )
+              return
+           end if
+           ! initialize alpha for use in choosing pivot block size.
+           alpha = ( one+sqrt( sevten ) ) / eight
+           ! compute machine safe minimum
+           sfmin = stdlib_dlamch( 'S' )
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k>1 ) then
+                 imax = stdlib_izamax( k-1, a( 1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( (max( absakk, colmax )==zero) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange,
+                    ! use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    12 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = imax + stdlib_izamax( k-imax, a( imax, imax+1 ),lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax>1 ) then
+                          itemp = stdlib_izamax( imax-1, a( 1, imax ), 1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 12
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the leading
+                    ! submatrix a(1:k,1:k) if we have a 2-by-2 pivot
+                    if( p>1 )call stdlib_zswap( p-1, a( 1, k ), 1, a( 1, p ), 1 )
+                    if( p<(k-1) )call stdlib_zswap( k-p-1, a( p+1, k ), 1, a( p, p+1 ),lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k - kstep + 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the leading
+                    ! submatrix a(1:k,1:k)
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, kk ), 1, a( 1, kp ), 1 )
+                    if( ( kk>1 ) .and. ( kp<(kk-1) ) )call stdlib_zswap( kk-kp-1, a( kp+1, kk ), &
+                              1, a( kp, kp+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k-1, k )
+                       a( k-1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the leading submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = u(k)*d(k)
+                    ! where u(k) is the k-th column of u
+                    if( k>1 ) then
+                       ! perform a rank-1 update of a(1:k-1,1:k-1) and
+                       ! store u(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(1:k-1,1:k-1) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*1/d(k)*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_zsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                          ! store u(k) in column k
+                          call stdlib_zscal( k-1, d11, a( 1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = 1, k - 1
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - u(k)*d(k)*u(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zsyr( uplo, k-1, -d11, a( 1, k ), 1, a, lda )
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k-1 now hold
+                    ! ( w(k-1) w(k) ) = ( u(k-1) u(k) )*d(k)
+                    ! where u(k) and u(k-1) are the k-th and (k-1)-th columns
+                    ! of u
+                    ! perform a rank-2 update of a(1:k-2,1:k-2) as
+                    ! a := a - ( u(k-1) u(k) )*d(k)*( u(k-1) u(k) )**t
+                       ! = a - ( ( a(k-1)a(k) )*inv(d(k)) ) * ( a(k-1)a(k) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k>2 ) then
+                       d12 = a( k-1, k )
+                       d22 = a( k-1, k-1 ) / d12
+                       d11 = a( k, k ) / d12
+                       t = cone / ( d11*d22-cone )
+                       do j = k - 2, 1, -1
+                          wkm1 = t*( d11*a( j, k-1 )-a( j, k ) )
+                          wk = t*( d22*a( j, k )-a( j, k-1 ) )
+                          do i = j, 1, -1
+                             a( i, j ) = a( i, j ) - (a( i, k ) / d12 )*wk -( a( i, k-1 ) / d12 )&
+                                       *wkm1
+                          end do
+                          ! store u(k) and u(k-1) in cols k and k-1 for row j
+                          a( j, k ) = wk / d12
+                          a( j, k-1 ) = wkm1 / d12
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k-1 ) = -kp
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kstep
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2
+              k = 1
+              40 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 70
+              kstep = 1
+              p = k
+              ! determine rows and columns to be interchanged and whether
+              ! a 1-by-1 or 2-by-2 pivot block will be used
+              absakk = cabs1( a( k, k ) )
+              ! imax is the row-index of the largest off-diagonal element in
+              ! column k, and colmax is its absolute value.
+              ! determine both colmax and imax.
+              if( k<n ) then
+                 imax = k + stdlib_izamax( n-k, a( k+1, k ), 1 )
+                 colmax = cabs1( a( imax, k ) )
+              else
+                 colmax = zero
+              end if
+              if( ( max( absakk, colmax )==zero ) ) then
+                 ! column k is zero or underflow: set info and continue
+                 if( info==0 )info = k
+                 kp = k
+              else
+                 ! test for interchange
+                 ! equivalent to testing for (used to handle nan and inf)
+                 ! absakk>=alpha*colmax
+                 if( .not.( absakk<alpha*colmax ) ) then
+                    ! no interchange, use 1-by-1 pivot block
+                    kp = k
+                 else
+                    done = .false.
+                    ! loop until pivot found
+                    42 continue
+                       ! begin pivot search loop body
+                       ! jmax is the column-index of the largest off-diagonal
+                       ! element in row imax, and rowmax is its absolute value.
+                       ! determine both rowmax and jmax.
+                       if( imax/=k ) then
+                          jmax = k - 1 + stdlib_izamax( imax-k, a( imax, k ), lda )
+                          rowmax = cabs1( a( imax, jmax ) )
+                       else
+                          rowmax = zero
+                       end if
+                       if( imax<n ) then
+                          itemp = imax + stdlib_izamax( n-imax, a( imax+1, imax ),1 )
+                          dtemp = cabs1( a( itemp, imax ) )
+                          if( dtemp>rowmax ) then
+                             rowmax = dtemp
+                             jmax = itemp
+                          end if
+                       end if
+                       ! equivalent to testing for (used to handle nan and inf)
+                       ! cabs1( a( imax, imax ) )>=alpha*rowmax
+                       if( .not.( cabs1(a( imax, imax ))<alpha*rowmax ) )then
+                          ! interchange rows and columns k and imax,
+                          ! use 1-by-1 pivot block
+                          kp = imax
+                          done = .true.
+                       ! equivalent to testing for rowmax == colmax,
+                       ! used to handle nan and inf
+                       else if( ( p==jmax ).or.( rowmax<=colmax ) ) then
+                          ! interchange rows and columns k+1 and imax,
+                          ! use 2-by-2 pivot block
+                          kp = imax
+                          kstep = 2
+                          done = .true.
+                       else
+                          ! pivot not found, set variables and repeat
+                          p = imax
+                          colmax = rowmax
+                          imax = jmax
+                       end if
+                       ! end pivot search loop body
+                    if( .not. done ) goto 42
+                 end if
+                 ! swap two rows and two columns
+                 ! first swap
+                 if( ( kstep==2 ) .and. ( p/=k ) ) then
+                    ! interchange rows and column k and p in the trailing
+                    ! submatrix a(k:n,k:n) if we have a 2-by-2 pivot
+                    if( p<n )call stdlib_zswap( n-p, a( p+1, k ), 1, a( p+1, p ), 1 )
+                    if( p>(k+1) )call stdlib_zswap( p-k-1, a( k+1, k ), 1, a( p, k+1 ), lda )
+                              
+                    t = a( k, k )
+                    a( k, k ) = a( p, p )
+                    a( p, p ) = t
+                 end if
+                 ! second swap
+                 kk = k + kstep - 1
+                 if( kp/=kk ) then
+                    ! interchange rows and columns kk and kp in the trailing
+                    ! submatrix a(k:n,k:n)
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, kk ), 1, a( kp+1, kp ), 1 )
+                              
+                    if( ( kk<n ) .and. ( kp>(kk+1) ) )call stdlib_zswap( kp-kk-1, a( kk+1, kk ), &
+                              1, a( kp, kk+1 ),lda )
+                    t = a( kk, kk )
+                    a( kk, kk ) = a( kp, kp )
+                    a( kp, kp ) = t
+                    if( kstep==2 ) then
+                       t = a( k+1, k )
+                       a( k+1, k ) = a( kp, k )
+                       a( kp, k ) = t
+                    end if
+                 end if
+                 ! update the trailing submatrix
+                 if( kstep==1 ) then
+                    ! 1-by-1 pivot block d(k): column k now holds
+                    ! w(k) = l(k)*d(k)
+                    ! where l(k) is the k-th column of l
+                    if( k<n ) then
+                    ! perform a rank-1 update of a(k+1:n,k+1:n) and
+                    ! store l(k) in column k
+                       if( cabs1( a( k, k ) )>=sfmin ) then
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                          d11 = cone / a( k, k )
+                          call stdlib_zsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                          ! store l(k) in column k
+                          call stdlib_zscal( n-k, d11, a( k+1, k ), 1 )
+                       else
+                          ! store l(k) in column k
+                          d11 = a( k, k )
+                          do ii = k + 1, n
+                             a( ii, k ) = a( ii, k ) / d11
+                          end do
+                          ! perform a rank-1 update of a(k+1:n,k+1:n) as
+                          ! a := a - l(k)*d(k)*l(k)**t
+                             ! = a - w(k)*(1/d(k))*w(k)**t
+                             ! = a - (w(k)/d(k))*(d(k))*(w(k)/d(k))**t
+                          call stdlib_zsyr( uplo, n-k, -d11, a( k+1, k ), 1,a( k+1, k+1 ), lda )
+                                    
+                       end if
+                    end if
+                 else
+                    ! 2-by-2 pivot block d(k): columns k and k+1 now hold
+                    ! ( w(k) w(k+1) ) = ( l(k) l(k+1) )*d(k)
+                    ! where l(k) and l(k+1) are the k-th and (k+1)-th columns
+                    ! of l
+                    ! perform a rank-2 update of a(k+2:n,k+2:n) as
+                    ! a := a - ( l(k) l(k+1) ) * d(k) * ( l(k) l(k+1) )**t
+                       ! = a - ( ( a(k)a(k+1) )*inv(d(k) ) * ( a(k)a(k+1) )**t
+                    ! and store l(k) and l(k+1) in columns k and k+1
+                    if( k<n-1 ) then
+                       d21 = a( k+1, k )
+                       d11 = a( k+1, k+1 ) / d21
+                       d22 = a( k, k ) / d21
+                       t = cone / ( d11*d22-cone )
+                       do j = k + 2, n
+                          ! compute  d21 * ( w(k)w(k+1) ) * inv(d(k)) for row j
+                          wk = t*( d11*a( j, k )-a( j, k+1 ) )
+                          wkp1 = t*( d22*a( j, k+1 )-a( j, k ) )
+                          ! perform a rank-2 update of a(k+2:n,k+2:n)
+                          do i = j, n
+                             a( i, j ) = a( i, j ) - ( a( i, k ) / d21 )*wk -( a( i, k+1 ) / d21 )&
+                                       *wkp1
+                          end do
+                          ! store l(k) and l(k+1) in cols k and k+1 for row j
+                          a( j, k ) = wk / d21
+                          a( j, k+1 ) = wkp1 / d21
+                       end do
+                    end if
+                 end if
+              end if
+              ! store details of the interchanges in ipiv
+              if( kstep==1 ) then
+                 ipiv( k ) = kp
+              else
+                 ipiv( k ) = -p
+                 ipiv( k+1 ) = -kp
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kstep
+              go to 40
+           end if
+           70 continue
+           return
+     end subroutine stdlib_zsytf2_rook
+
+     !> ZSYTRF: computes the factorization of a complex symmetric matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlasyf( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zsytf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlasyf( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zsytf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsytrf
+
+     !> ZSYTRF_RK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_zsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlasyf_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zsytf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_zswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlasyf_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zsytf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_zswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsytrf_rk
+
+     !> ZSYTRF_ROOK: computes the factorization of a complex symmetric matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZSYTRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZSYTRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlasyf_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zsytf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlasyf_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlasyf_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zsytf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsytrf_rook
+
+     !> ZSYTRI: computes the inverse of a complex symmetric indefinite matrix
+     !> A using the factorization A = U*D*U**T or A = L*D*L**T computed by
+     !> ZSYTRF.
+
+     pure subroutine stdlib_zsytri( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_zdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_zcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_zdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                 call stdlib_zswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+              end if
+              k = k + kstep
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_zdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_zcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k, -cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+&
+                              1, k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_zdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              kp = abs( ipiv( k ) )
+              if( kp/=k ) then
+                 ! interchange rows and columns k and kp in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                 call stdlib_zswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                 temp = a( k, k )
+                 a( k, k ) = a( kp, kp )
+                 a( kp, kp ) = temp
+                 if( kstep==2 ) then
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+              end if
+              k = k - kstep
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_zsytri
+
+     !> ZSYTRI_ROOK: computes the inverse of a complex symmetric
+     !> matrix A using the factorization A = U*D*U**T or A = L*D*L**T
+     !> computed by ZSYTRF_ROOK.
+
+     pure subroutine stdlib_zsytri_rook( uplo, n, a, lda, ipiv, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kp, kstep
+           complex(dp) :: ak, akkp1, akp1, d, t, temp
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRI_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do info = n, 1, -1
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do info = 1, n
+                 if( ipiv( info )>0 .and. a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           if( upper ) then
+              ! compute inv(a) from the factorization a = u*d*u**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              30 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 40
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( k-1, work, 1, a( 1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k+1 )
+                 ak = a( k, k ) / t
+                 akp1 = a( k+1, k+1 ) / t
+                 akkp1 = a( k, k+1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k, k ) = akp1 / d
+                 a( k+1, k+1 ) = ak / d
+                 a( k, k+1 ) = -akkp1 / d
+                 ! compute columns k and k+1 of the inverse.
+                 if( k>1 ) then
+                    call stdlib_zcopy( k-1, a( 1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k ), 1 )
+                              
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( k-1, work, 1, a( 1, k ),1 )
+                    a( k, k+1 ) = a( k, k+1 ) -stdlib_zdotu( k-1, a( 1, k ), 1, a( 1, k+1 ), 1 )
+                              
+                    call stdlib_zcopy( k-1, a( 1, k+1 ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, k-1, -cone, a, lda, work, 1, czero,a( 1, k+1 ), 1 )
+                              
+                    a( k+1, k+1 ) = a( k+1, k+1 ) -stdlib_zdotu( k-1, work, 1, a( 1, k+1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the leading
+                 ! submatrix a(1:k+1,1:k+1)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_zswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k+1 with -ipiv(k) and
+                 ! -ipiv(k+1)in the leading submatrix a(1:k+1,1:k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_zswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k+1 )
+                    a( k, k+1 ) = a( kp, k+1 )
+                    a( kp, k+1 ) = temp
+                 end if
+                 k = k + 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp>1 )call stdlib_zswap( kp-1, a( 1, k ), 1, a( 1, kp ), 1 )
+                    call stdlib_zswap( k-kp-1, a( kp+1, k ), 1, a( kp, kp+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k + 1
+              go to 30
+              40 continue
+           else
+              ! compute inv(a) from the factorization a = l*d*l**t.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              50 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 60
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! invert the diagonal block.
+                 a( k, k ) = cone / a( k, k )
+                 ! compute column k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( n-k, work, 1, a( k+1, k ),1 )
+                 end if
+                 kstep = 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! invert the diagonal block.
+                 t = a( k, k-1 )
+                 ak = a( k-1, k-1 ) / t
+                 akp1 = a( k, k ) / t
+                 akkp1 = a( k, k-1 ) / t
+                 d = t*( ak*akp1-cone )
+                 a( k-1, k-1 ) = akp1 / d
+                 a( k, k ) = ak / d
+                 a( k, k-1 ) = -akkp1 / d
+                 ! compute columns k-1 and k of the inverse.
+                 if( k<n ) then
+                    call stdlib_zcopy( n-k, a( k+1, k ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k ), 1 )
+                    a( k, k ) = a( k, k ) - stdlib_zdotu( n-k, work, 1, a( k+1, k ),1 )
+                    a( k, k-1 ) = a( k, k-1 ) -stdlib_zdotu( n-k, a( k+1, k ), 1, a( k+1, k-1 ),1 &
+                              )
+                    call stdlib_zcopy( n-k, a( k+1, k-1 ), 1, work, 1 )
+                    call stdlib_zsymv( uplo, n-k,-cone, a( k+1, k+1 ), lda, work, 1,czero, a( k+1,&
+                               k-1 ), 1 )
+                    a( k-1, k-1 ) = a( k-1, k-1 ) -stdlib_zdotu( n-k, work, 1, a( k+1, k-1 ), 1 )
+                              
+                 end if
+                 kstep = 2
+              end if
+              if( kstep==1 ) then
+                 ! interchange rows and columns k and ipiv(k) in the trailing
+                 ! submatrix a(k-1:n,k-1:n)
+                 kp = ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_zswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              else
+                 ! interchange rows and columns k and k-1 with -ipiv(k) and
+                 ! -ipiv(k-1) in the trailing submatrix a(k-1:n,k-1:n)
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_zswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                    temp = a( k, k-1 )
+                    a( k, k-1 ) = a( kp, k-1 )
+                    a( kp, k-1 ) = temp
+                 end if
+                 k = k - 1
+                 kp = -ipiv( k )
+                 if( kp/=k ) then
+                    if( kp<n )call stdlib_zswap( n-kp, a( kp+1, k ), 1, a( kp+1, kp ), 1 )
+                    call stdlib_zswap( kp-k-1, a( k+1, k ), 1, a( kp, k+1 ), lda )
+                    temp = a( k, k )
+                    a( k, k ) = a( kp, kp )
+                    a( kp, kp ) = temp
+                 end if
+              end if
+              k = k - 1
+              go to 50
+              60 continue
+           end if
+           return
+     end subroutine stdlib_zsytri_rook
+
+     !> ZSYTRS: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF.
+
+     pure subroutine stdlib_zsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb, a( 1, k ),1, cone, b( &
+                           k, 1 ), ldb )
+                 call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b, ldb,a( 1, k+1 ), 1, cone, b(&
+                            k+1, 1 ), ldb )
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zsytrs
+
+     !> ZSYTRS2: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF and converted by ZSYCONV.
+
+     pure subroutine stdlib_zsytrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_zsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_ztrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_zscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / akm1k
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_ztrsm('L','U','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_ztrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   call stdlib_zscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / akm1k
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / akm1k
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+             call stdlib_ztrsm('L','L','T','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_zsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_zsytrs2
+
+     !> ZSYTRS_3: solves a system of linear equations A * X = B with a complex
+     !> symmetric matrix A using the factorization computed
+     !> by ZSYTRF_RK or ZSYTRF_BK:
+     !> A = P*U*D*(U**T)*(P**T) or A = P*L*D*(L**T)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This algorithm is using Level 3 BLAS.
+
+     pure subroutine stdlib_zsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), e(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j, k, kp
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! begin upper
+              ! solve a*x = b, where a = u*d*u**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+              call stdlib_ztrsm( 'L', 'U', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i = n
+              do while ( i>=1 )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_zscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if ( i>1 ) then
+                    akm1k = e( i )
+                    akm1 = a( i-1, i-1 ) / akm1k
+                    ak = a( i, i ) / akm1k
+                    denom = akm1*ak - cone
+                    do j = 1, nrhs
+                       bkm1 = b( i-1, j ) / akm1k
+                       bk = b( i, j ) / akm1k
+                       b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                       b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i - 1
+                 end if
+                 i = i - 1
+              end do
+              ! compute (u**t \ b) -> b   [ u**t \ (d \ (u \p**t * b) ) ]
+              call stdlib_ztrsm( 'L', 'U', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (u**t \ (d \ (u \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for upper case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+           else
+              ! begin lower
+              ! solve a*x = b, where a = l*d*l**t.
+              ! p**t * b
+              ! interchange rows k and ipiv(k) of matrix b in the same order
+              ! that the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = 1, n, 1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+              call stdlib_ztrsm( 'L', 'L', 'N', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i = 1
+              do while ( i<=n )
+                 if( ipiv( i )>0 ) then
+                    call stdlib_zscal( nrhs, cone / a( i, i ), b( i, 1 ), ldb )
+                 else if( i<n ) then
+                    akm1k = e( i )
+                    akm1 = a( i, i ) / akm1k
+                    ak = a( i+1, i+1 ) / akm1k
+                    denom = akm1*ak - cone
+                    do  j = 1, nrhs
+                       bkm1 = b( i, j ) / akm1k
+                       bk = b( i+1, j ) / akm1k
+                       b( i, j ) = ( ak*bkm1-bk ) / denom
+                       b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                    end do
+                    i = i + 1
+                 end if
+                 i = i + 1
+              end do
+              ! compute (l**t \ b) -> b   [ l**t \ (d \ (l \p**t * b) ) ]
+              call stdlib_ztrsm('L', 'L', 'T', 'U', n, nrhs, cone, a, lda, b, ldb )
+              ! p * b  [ p * (l**t \ (d \ (l \p**t * b) )) ]
+              ! interchange rows k and ipiv(k) of matrix b in reverse order
+              ! from the formation order of ipiv(i) vector for lower case.
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv(i) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              do k = n, 1, -1
+                 kp = abs( ipiv( k ) )
+                 if( kp/=k ) then
+                    call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end if
+              end do
+              ! end lower
+           end if
+           return
+     end subroutine stdlib_zsytrs_3
+
+     !> ZSYTRS_AA: solves a system of linear equations A*X = B with a complex
+     !> symmetric matrix A using the factorization A = U**T*T*U or
+     !> A = L*T*L**T computed by ZSYTRF_AA.
+
+     pure subroutine stdlib_zsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**t*t*u.
+              ! 1) forward substitution with u**t
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**t \ b -> b    [ (u**t \p**t * b) ]
+                 call stdlib_ztrsm( 'L', 'U', 'T', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**t \p**t * b) ]
+              call stdlib_zlacpy( 'F', 1, n, a( 1, 1 ), lda+1, work( n ), 1)
+              if( n>1 ) then
+                 call stdlib_zlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                 call stdlib_zlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_zgtsv( n, nrhs, work( 1 ), work( n ), work( 2*n ), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**t \p**t * b) ) ]
+                 call stdlib_ztrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (u \ (t \ (u**t \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**t.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_ztrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_zlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                 call stdlib_zlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1 )
+                 call stdlib_zlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1 )
+              end if
+              call stdlib_zgtsv( n, nrhs, work( 1 ), work(n), work( 2*n ), b, ldb,info)
+              ! 3) backward substitution with l**t
+              if( n>1 ) then
+                 ! compute (l**t \ b) -> b   [ l**t \ (t \ (l \p**t * b) ) ]
+                 call stdlib_ztrsm( 'L', 'L', 'T', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b -> b  [ p * (l**t \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zsytrs_aa
+
+     !> ZSYTRS_ROOK: solves a system of linear equations A*X = B with
+     !> a complex symmetric matrix A using the factorization A = U*D*U**T or
+     !> A = L*D*L**T computed by ZSYTRF_ROOK.
+
+     pure subroutine stdlib_zsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**t.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k>2 ) then
+                    call stdlib_zgeru( k-2, nrhs,-cone, a( 1, k ), 1, b( k, 1 ),ldb, b( 1, 1 ), &
+                              ldb )
+                    call stdlib_zgeru( k-2, nrhs,-cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 )&
+                              , ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / akm1k
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**t *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**t(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 )call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, &
+                           cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**t(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), 1, cone, &
+                              b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 ), 1, cone,&
+                               b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**t.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 call stdlib_zscal( nrhs, cone / a( k, k ), b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k) then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs,-cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs,-cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( k+&
+                              2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / akm1k
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / akm1k
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**t *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**t(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+&
+                           1, k ), 1, cone, b( k, 1 ), ldb )
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**t(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k )&
+                              , 1, cone, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'TRANSPOSE', n-k, nrhs, -cone, b( k+1, 1 ),ldb, a( k+1, k-&
+                              1 ), 1, cone, b( k-1, 1 ),ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k) then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zsytrs_rook
+
+     !> ZTBRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular band
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTBTRS or some other
+     !> means before entering this routine.  ZTBRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ztbrfs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, x, ldx, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*), b(ldb,*), x(ldx,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = kd + 2
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ztbmv( uplo, trans, diag, n, kd, ab, ldab, work, 1 )
+              call stdlib_zaxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ab( kd+1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             rwork( i ) = rwork( i ) +cabs1( ab( 1+i-k, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = max( 1, k-kd ), k
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = max( 1, k-kd ), k - 1
+                             s = s + cabs1( ab( kd+1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, min( n, k+kd )
+                             s = s + cabs1( ab( 1+i-k, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ztbsv( uplo, transt, diag, n, kd, ab, ldab, work,1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ztbsv( uplo, transn, diag, n, kd, ab, ldab, work,1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ztbrfs
+
+     !> ZTBTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular band matrix of order N, and B is an
+     !> N-by-NRHS matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_ztbtrs( uplo, trans, diag, n, kd, nrhs, ab, ldab, b,ldb, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 do info = 1, n
+                    if( ab( kd+1, info )==czero )return
+                 end do
+              else
+                 do info = 1, n
+                    if( ab( 1, info )==czero )return
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_ztbsv( uplo, trans, diag, n, kd, ab, ldab, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_ztbtrs
+
+     !> Level 3 BLAS like routine for A in RFP Format.
+     !> ZTFSM:  solves the matrix equation
+     !> op( A )*X = alpha*B  or  X*op( A ) = alpha*B
+     !> where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+     !> non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+     !> op( A ) = A   or   op( A ) = A**H.
+     !> A is in Rectangular Full Packed (RFP) Format.
+     !> The matrix X is overwritten on B.
+
+     pure subroutine stdlib_ztfsm( transr, side, uplo, trans, diag, m, n, alpha, a,b, ldb )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, diag, side, trans, uplo
+           integer(ilp), intent(in) :: ldb, m, n
+           complex(dp), intent(in) :: alpha
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(0:*)
+           complex(dp), intent(inout) :: b(0:ldb-1,0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lside, misodd, nisodd, normaltransr, notrans
+           integer(ilp) :: m1, m2, n1, n2, k, info, i, j
+           ! Intrinsic Functions 
+           intrinsic :: max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lside = stdlib_lsame( side, 'L' )
+           lower = stdlib_lsame( uplo, 'L' )
+           notrans = stdlib_lsame( trans, 'N' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lside .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -3
+           else if( .not.notrans .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -4
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -5
+           else if( m<0 ) then
+              info = -6
+           else if( n<0 ) then
+              info = -7
+           else if( ldb<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFSM ', -info )
+              return
+           end if
+           ! quick return when ( (n==0).or.(m==0) )
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! quick return when alpha==(0e+0_dp,0e+0_dp)
+           if( alpha==czero ) then
+              do j = 0, n - 1
+                 do i = 0, m - 1
+                    b( i, j ) = czero
+                 end do
+              end do
+              return
+           end if
+           if( lside ) then
+              ! side = 'l'
+              ! a is m-by-m.
+              ! if m is odd, set nisodd = .true., and m1 and m2.
+              ! if m is even, nisodd = .false., and m.
+              if( mod( m, 2 )==0 ) then
+                 misodd = .false.
+                 k = m / 2
+              else
+                 misodd = .true.
+                 if( lower ) then
+                    m2 = m / 2
+                    m1 = m - m2
+                 else
+                    m1 = m / 2
+                    m2 = m - m1
+                 end if
+              end if
+              if( misodd ) then
+                 ! side = 'l' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'l', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_ztrsm( 'L', 'L', 'N', diag, m1, n, alpha,a, m, b, ldb )
+                                       
+                          else
+                             call stdlib_ztrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                             call stdlib_zgemm( 'N', 'N', m2, n, m1, -cone, a( m1 ),m, b, ldb, &
+                                       alpha, b( m1, 0 ), ldb )
+                             call stdlib_ztrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m ), m, b( m1,&
+                                        0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_ztrsm( 'L', 'L', 'C', diag, m1, n, alpha,a( 0 ), m, b, &
+                                       ldb )
+                          else
+                             call stdlib_ztrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m ), m, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_zgemm( 'C', 'N', m1, n, m2, -cone, a( m1 ),m, b( m1, 0 ),&
+                                        ldb, alpha, b, ldb )
+                             call stdlib_ztrsm( 'L', 'L', 'C', diag, m1, n, cone,a( 0 ), m, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, m1, n, alpha,a( m2 ), m, b, ldb &
+                                    )
+                          call stdlib_zgemm( 'C', 'N', m2, n, m1, -cone, a( 0 ), m,b, ldb, alpha, &
+                                    b( m1, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, m2, n, cone,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, m2, n, alpha,a( m1 ), m, b( m1, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m1, n, m2, -cone, a( 0 ), m,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, m1, n, cone,a( m2 ), m, b, ldb )
+                                    
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          if( m==1 ) then
+                             call stdlib_ztrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_ztrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                             call stdlib_zgemm( 'C', 'N', m2, n, m1, -cone,a( m1*m1 ), m1, b, ldb,&
+                                        alpha,b( m1, 0 ), ldb )
+                             call stdlib_ztrsm( 'L', 'L', 'N', diag, m2, n, cone,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                          end if
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          if( m==1 ) then
+                             call stdlib_ztrsm( 'L', 'U', 'N', diag, m1, n, alpha,a( 0 ), m1, b, &
+                                       ldb )
+                          else
+                             call stdlib_ztrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( 1 ), m1, b( &
+                                       m1, 0 ), ldb )
+                             call stdlib_zgemm( 'N', 'N', m1, n, m2, -cone,a( m1*m1 ), m1, b( m1, &
+                                       0 ), ldb,alpha, b, ldb )
+                             call stdlib_ztrsm( 'L', 'U', 'N', diag, m1, n, cone,a( 0 ), m1, b, &
+                                       ldb )
+                          end if
+                       end if
+                    else
+                       ! side  ='l', n is odd, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, m1, n, alpha,a( m2*m2 ), m2, b, &
+                                    ldb )
+                          call stdlib_zgemm( 'N', 'N', m2, n, m1, -cone, a( 0 ), m2,b, ldb, alpha,&
+                                     b( m1, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, m2, n, cone,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                       else
+                          ! side  ='l', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, m2, n, alpha,a( m1*m2 ), m2, b( &
+                                    m1, 0 ), ldb )
+                          call stdlib_zgemm( 'C', 'N', m1, n, m2, -cone, a( 0 ), m2,b( m1, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, m1, n, cone,a( m2*m2 ), m2, b, &
+                                    ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'l' and n is even
+                 if( normaltransr ) then
+                    ! side = 'l', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, k, n, alpha,a( 1 ), m+1, b, ldb &
+                                    )
+                          call stdlib_zgemm( 'N', 'N', k, n, k, -cone, a( k+1 ),m+1, b, ldb, &
+                                    alpha, b( k, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, k, n, cone,a( 0 ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, k, n, alpha,a( 0 ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'C', 'N', k, n, k, -cone, a( k+1 ),m+1, b( k, 0 ), &
+                                    ldb, alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, k, n, cone,a( 1 ), m+1, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'n', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, k, n, alpha,a( k+1 ), m+1, b, &
+                                    ldb )
+                          call stdlib_zgemm( 'C', 'N', k, n, k, -cone, a( 0 ), m+1,b, ldb, alpha, &
+                                    b( k, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, k, n, cone,a( k ), m+1, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, k, n, alpha,a( k ), m+1, b( k, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', k, n, k, -cone, a( 0 ), m+1,b( k, 0 ), ldb,&
+                                     alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, k, n, cone,a( k+1 ), m+1, b, &
+                                    ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'l', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k ), k, b, ldb )
+                                    
+                          call stdlib_zgemm( 'C', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b, ldb, &
+                                    alpha,b( k, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, k, n, cone,a( 0 ), k, b( k, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, k, n, alpha,a( 0 ), k, b( k, 0 )&
+                                    , ldb )
+                          call stdlib_zgemm( 'N', 'N', k, n, k, -cone,a( k*( k+1 ) ), k, b( k, 0 )&
+                                    , ldb,alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, k, n, cone,a( k ), k, b, ldb )
+                                    
+                       end if
+                    else
+                       ! side  ='l', n is even, transr = 'c', and uplo = 'u'
+                       if( .not.notrans ) then
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'L', 'U', 'C', diag, k, n, alpha,a( k*( k+1 ) ), k, &
+                                    b, ldb )
+                          call stdlib_zgemm( 'N', 'N', k, n, k, -cone, a( 0 ), k, b,ldb, alpha, b(&
+                                     k, 0 ), ldb )
+                          call stdlib_ztrsm( 'L', 'L', 'N', diag, k, n, cone,a( k*k ), k, b( k, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='l', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'L', 'L', 'C', diag, k, n, alpha,a( k*k ), k, b( k, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'C', 'N', k, n, k, -cone, a( 0 ), k,b( k, 0 ), ldb, &
+                                    alpha, b, ldb )
+                          call stdlib_ztrsm( 'L', 'U', 'N', diag, k, n, cone,a( k*( k+1 ) ), k, b,&
+                                     ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           else
+              ! side = 'r'
+              ! a is n-by-n.
+              ! if n is odd, set nisodd = .true., and n1 and n2.
+              ! if n is even, nisodd = .false., and k.
+              if( mod( n, 2 )==0 ) then
+                 nisodd = .false.
+                 k = n / 2
+              else
+                 nisodd = .true.
+                 if( lower ) then
+                    n2 = n / 2
+                    n1 = n - n2
+                 else
+                    n1 = n / 2
+                    n2 = n - n1
+                 end if
+              end if
+              if( nisodd ) then
+                 ! side = 'r' and n is odd
+                 if( normaltransr ) then
+                    ! side = 'r', n is odd, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, n1, cone,a( 0 ), n, b( 0, 0 )&
+                                    , ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( 0 ), n, b( 0, 0 &
+                                    ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1 ), &
+                                    n, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n ), n, b( 0, n1 &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, n1, alpha,a( n2 ), n, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), n,&
+                                     alpha, b( 0, n1 ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, n2, cone,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'n', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, n2, alpha,a( n1 ), n, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n, alpha, b( 0, 0 ), ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, n1, cone,a( n2 ), n, b( 0, 0 &
+                                    ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is odd, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, n1, n2, -cone, b( 0, n1 ),ldb, a( n1*n1 &
+                                    ), n1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, n1, cone,a( 0 ), n1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'l', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( 0 ), n1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, n2, n1, -cone, b( 0, 0 ),ldb, a( n1*n1 )&
+                                    , n1, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, n2, cone,a( 1 ), n1, b( 0, &
+                                    n1 ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is odd, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'n'
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, n1, alpha,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, n2, n1, -cone, b( 0, 0 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, n1 ),ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, n2, cone,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                       else
+                          ! side  ='r', n is odd, transr = 'c', uplo = 'u', and
+                          ! trans = 'c'
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, n2, alpha,a( n1*n2 ), n2, b( &
+                                    0, n1 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, n1, n2, -cone, b( 0, n1 ),ldb, a( 0 ), &
+                                    n2, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, n1, cone,a( n2*n2 ), n2, b( &
+                                    0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              else
+                 ! side = 'r' and n is even
+                 if( normaltransr ) then
+                    ! side = 'r', n is even, and transr = 'n'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, k, alpha,a( 0 ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, 0 ),ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, k, cone,a( 1 ), n+1, b( 0, 0 &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, k, alpha,a( 1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( k+1 ), n+&
+                                    1, alpha, b( 0, k ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, k, cone,a( 0 ), n+1, b( 0, k &
+                                    ), ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'n', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, k, alpha,a( k+1 ), n+1, b( 0,&
+                                     0 ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, k ),ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, k, cone,a( k ), n+1, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'n', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, k, alpha,a( k ), n+1, b( 0, &
+                                    k ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), n+1,&
+                                     alpha, b( 0, 0 ),ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, k, cone,a( k+1 ), n+1, b( 0, &
+                                    0 ), ldb )
+                       end if
+                    end if
+                 else
+                    ! side = 'r', n is even, and transr = 'c'
+                    if( lower ) then
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'l'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, k, alpha,a( 0 ), k, b( 0, k )&
+                                    , ldb )
+                          call stdlib_zgemm( 'N', 'C', m, k, k, -cone, b( 0, k ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, 0 ), ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, k, cone,a( k ), k, b( 0, 0 ),&
+                                     ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'l',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, k, alpha,a( k ), k, b( 0, 0 )&
+                                    , ldb )
+                          call stdlib_zgemm( 'N', 'N', m, k, k, -cone, b( 0, 0 ),ldb, a( ( k+1 )&
+                                    *k ), k, alpha,b( 0, k ), ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, k, cone,a( 0 ), k, b( 0, k ),&
+                                     ldb )
+                       end if
+                    else
+                       ! side  ='r', n is even, transr = 'c', and uplo = 'u'
+                       if( notrans ) then
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'n'
+                          call stdlib_ztrsm( 'R', 'U', 'N', diag, m, k, alpha,a( ( k+1 )*k ), k, &
+                                    b( 0, 0 ), ldb )
+                          call stdlib_zgemm( 'N', 'C', m, k, k, -cone, b( 0, 0 ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, k ), ldb )
+                          call stdlib_ztrsm( 'R', 'L', 'C', diag, m, k, cone,a( k*k ), k, b( 0, k &
+                                    ), ldb )
+                       else
+                          ! side  ='r', n is even, transr = 'c', uplo = 'u',
+                          ! and trans = 'c'
+                          call stdlib_ztrsm( 'R', 'L', 'N', diag, m, k, alpha,a( k*k ), k, b( 0, &
+                                    k ), ldb )
+                          call stdlib_zgemm( 'N', 'N', m, k, k, -cone, b( 0, k ),ldb, a( 0 ), k, &
+                                    alpha, b( 0, 0 ), ldb )
+                          call stdlib_ztrsm( 'R', 'U', 'C', diag, m, k, cone,a( ( k+1 )*k ), k, b(&
+                                     0, 0 ), ldb )
+                       end if
+                    end if
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztfsm
+
+     !> ZTFTTP: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard packed format (TP).
+
+     pure subroutine stdlib_ztfttp( transr, uplo, n, arf, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(out) :: ap(0:*)
+           complex(dp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Intrinsic Functions 
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTTP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 ap( 0 ) = arf( 0 )
+              else
+                 ap( 0 ) = conjg( arf( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          ap( ijp ) = arf( ij )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          ap( ijp ) = conjg( arf( ij ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztfttp
+
+     !> ZTFTTR: copies a triangular matrix A from rectangular full packed
+     !> format (TF) to standard full format (TR).
+
+     pure subroutine stdlib_ztfttr( transr, uplo, n, arf, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(dp), intent(out) :: a(0:lda-1,0:*)
+           complex(dp), intent(in) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt, nx2, np1x2
+           integer(ilp) :: i, j, l, ij
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    a( 0, 0 ) = arf( 0 )
+                 else
+                    a( 0, 0 ) = conjg( arf( 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          a( n2+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          a( j-n1, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          a( i, n1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          a( n2+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          a( k+j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          a( j-k, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          a( i, k+1+j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          a( j, i ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          a( i, j ) = arf( ij )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          a( k+1+j, l ) = conjg( arf( ij ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       a( i, j ) = arf( ij )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztfttr
+
+     !> ZTGEVC: computes some or all of the right and/or left eigenvectors of
+     !> a pair of complex matrices (S,P), where S and P are upper triangular.
+     !> Matrix pairs of this type are produced by the generalized Schur
+     !> factorization of a complex matrix pair (A,B):
+     !> A = Q*S*Z**H,  B = Q*P*Z**H
+     !> as computed by ZGGHRD + ZHGEQZ.
+     !> The right eigenvector x and the left eigenvector y of (S,P)
+     !> corresponding to an eigenvalue w are defined by:
+     !> S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
+     !> where y**H denotes the conjugate tranpose of y.
+     !> The eigenvalues are not input to this routine, but are computed
+     !> directly from the diagonal elements of S and P.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of (S,P), or the products Z*X and/or Q*Y,
+     !> where Z and Q are input matrices.
+     !> If Q and Z are the unitary factors from the generalized Schur
+     !> factorization of a matrix pair (A,B), then Z*X and Q*Y
+     !> are the matrices of right and left eigenvectors of (A,B).
+
+     pure subroutine stdlib_ztgevc( side, howmny, select, n, s, lds, p, ldp, vl,ldvl, vr, ldvr, &
+               mm, m, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldp, lds, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: p(ldp,*), s(lds,*)
+           complex(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: compl, compr, ilall, ilback, ilbbad, ilcomp, lsa, lsb
+           integer(ilp) :: i, ibeg, ieig, iend, ihwmny, im, iside, isrc, j, je, jr
+           real(dp) :: acoefa, acoeff, anorm, ascale, bcoefa, big, bignum, bnorm, bscale, dmin, &
+                     safmin, sbeta, scale, small, temp, ulp, xmax
+           complex(dp) :: bcoeff, ca, cb, d, salpha, sum, suma, sumb, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=dp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           if( stdlib_lsame( howmny, 'A' ) ) then
+              ihwmny = 1
+              ilall = .true.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'S' ) ) then
+              ihwmny = 2
+              ilall = .false.
+              ilback = .false.
+           else if( stdlib_lsame( howmny, 'B' ) ) then
+              ihwmny = 3
+              ilall = .true.
+              ilback = .true.
+           else
+              ihwmny = -1
+           end if
+           if( stdlib_lsame( side, 'R' ) ) then
+              iside = 1
+              compl = .false.
+              compr = .true.
+           else if( stdlib_lsame( side, 'L' ) ) then
+              iside = 2
+              compl = .true.
+              compr = .false.
+           else if( stdlib_lsame( side, 'B' ) ) then
+              iside = 3
+              compl = .true.
+              compr = .true.
+           else
+              iside = -1
+           end if
+           info = 0
+           if( iside<0 ) then
+              info = -1
+           else if( ihwmny<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lds<max( 1, n ) ) then
+              info = -6
+           else if( ldp<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEVC', -info )
+              return
+           end if
+           ! count the number of eigenvectors
+           if( .not.ilall ) then
+              im = 0
+              do j = 1, n
+                 if( select( j ) )im = im + 1
+              end do
+           else
+              im = n
+           end if
+           ! check diagonal of b
+           ilbbad = .false.
+           do j = 1, n
+              if( aimag( p( j, j ) )/=zero )ilbbad = .true.
+           end do
+           if( ilbbad ) then
+              info = -7
+           else if( compl .and. ldvl<n .or. ldvl<1 ) then
+              info = -10
+           else if( compr .and. ldvr<n .or. ldvr<1 ) then
+              info = -12
+           else if( mm<im ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEVC', -info )
+              return
+           end if
+           ! quick return if possible
+           m = im
+           if( n==0 )return
+           ! machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           big = one / safmin
+           call stdlib_dlabad( safmin, big )
+           ulp = stdlib_dlamch( 'EPSILON' )*stdlib_dlamch( 'BASE' )
+           small = safmin*n / ulp
+           big = one / small
+           bignum = one / ( safmin*n )
+           ! compute the 1-norm of each column of the strictly upper triangular
+           ! part of a and b to check for possible overflow in the triangular
+           ! solver.
+           anorm = abs1( s( 1, 1 ) )
+           bnorm = abs1( p( 1, 1 ) )
+           rwork( 1 ) = zero
+           rwork( n+1 ) = zero
+           do j = 2, n
+              rwork( j ) = zero
+              rwork( n+j ) = zero
+              do i = 1, j - 1
+                 rwork( j ) = rwork( j ) + abs1( s( i, j ) )
+                 rwork( n+j ) = rwork( n+j ) + abs1( p( i, j ) )
+              end do
+              anorm = max( anorm, rwork( j )+abs1( s( j, j ) ) )
+              bnorm = max( bnorm, rwork( n+j )+abs1( p( j, j ) ) )
+           end do
+           ascale = one / max( anorm, safmin )
+           bscale = one / max( bnorm, safmin )
+           ! left eigenvectors
+           if( compl ) then
+              ieig = 0
+              ! main loop over eigenvalues
+              loop_140: do je = 1, n
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig + 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=dp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vl( jr, ieig ) = czero
+                       end do
+                       vl( ieig, ieig ) = cone
+                       cycle loop_140
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                         ! h
+                       ! y  ( a a - b b ) = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=dp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=dp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                                                    ! h
+                    ! triangular solve of  (a a - b b)  y = 0
+                                            ! h
+                    ! (rowwise in  (a a - b b) , or columnwise in a a - b b)
+                    loop_100: do j = je + 1, n
+                       ! compute
+                             ! j-1
+                       ! sum = sum  conjg( a*s(k,j) - b*p(k,j) )*x(k)
+                             ! k=je
+                       ! (scale if necessary)
+                       temp = one / xmax
+                       if( acoefa*rwork( j )+bcoefa*rwork( n+j )>bignum*temp ) then
+                          do jr = je, j - 1
+                             work( jr ) = temp*work( jr )
+                          end do
+                          xmax = one
+                       end if
+                       suma = czero
+                       sumb = czero
+                       do jr = je, j - 1
+                          suma = suma + conjg( s( jr, j ) )*work( jr )
+                          sumb = sumb + conjg( p( jr, j ) )*work( jr )
+                       end do
+                       sum = acoeff*suma - conjg( bcoeff )*sumb
+                       ! form x(j) = - sum / conjg( a*s(j,j) - b*p(j,j) )
+                       ! with scaling and perturbation of the denominator
+                       d = conjg( acoeff*s( j, j )-bcoeff*p( j, j ) )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=dp)
+                       if( abs1( d )<one ) then
+                          if( abs1( sum )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( sum )
+                             do jr = je, j - 1
+                                work( jr ) = temp*work( jr )
+                             end do
+                             xmax = temp*xmax
+                             sum = temp*sum
+                          end if
+                       end if
+                       work( j ) = stdlib_zladiv( -sum, d )
+                       xmax = max( xmax, abs1( work( j ) ) )
+                    end do loop_100
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_zgemv( 'N', n, n+1-je, cone, vl( 1, je ), ldvl,work( je ), 1, &
+                                 czero, work( n+1 ), 1 )
+                       isrc = 2
+                       ibeg = 1
+                    else
+                       isrc = 1
+                       ibeg = je
+                    end if
+                    ! copy and scale eigenvector into column of vl
+                    xmax = zero
+                    do jr = ibeg, n
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = ibeg, n
+                          vl( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       ibeg = n + 1
+                    end if
+                    do jr = 1, ibeg - 1
+                       vl( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_140
+           end if
+           ! right eigenvectors
+           if( compr ) then
+              ieig = im + 1
+              ! main loop over eigenvalues
+              loop_250: do je = n, 1, -1
+                 if( ilall ) then
+                    ilcomp = .true.
+                 else
+                    ilcomp = select( je )
+                 end if
+                 if( ilcomp ) then
+                    ieig = ieig - 1
+                    if( abs1( s( je, je ) )<=safmin .and.abs( real( p( je, je ),KIND=dp) )&
+                              <=safmin ) then
+                       ! singular matrix pencil -- return unit eigenvector
+                       do jr = 1, n
+                          vr( jr, ieig ) = czero
+                       end do
+                       vr( ieig, ieig ) = cone
+                       cycle loop_250
+                    end if
+                    ! non-singular eigenvalue:
+                    ! compute coefficients  a  and  b  in
+                    ! ( a a - b b ) x  = 0
+                    temp = one / max( abs1( s( je, je ) )*ascale,abs( real( p( je, je ),KIND=dp) )&
+                              *bscale, safmin )
+                    salpha = ( temp*s( je, je ) )*ascale
+                    sbeta = ( temp*real( p( je, je ),KIND=dp) )*bscale
+                    acoeff = sbeta*ascale
+                    bcoeff = salpha*bscale
+                    ! scale to avoid underflow
+                    lsa = abs( sbeta )>=safmin .and. abs( acoeff )<small
+                    lsb = abs1( salpha )>=safmin .and. abs1( bcoeff )<small
+                    scale = one
+                    if( lsa )scale = ( small / abs( sbeta ) )*min( anorm, big )
+                    if( lsb )scale = max( scale, ( small / abs1( salpha ) )*min( bnorm, big ) )
+                              
+                    if( lsa .or. lsb ) then
+                       scale = min( scale, one /( safmin*max( one, abs( acoeff ),abs1( bcoeff ) ) &
+                                 ) )
+                       if( lsa ) then
+                          acoeff = ascale*( scale*sbeta )
+                       else
+                          acoeff = scale*acoeff
+                       end if
+                       if( lsb ) then
+                          bcoeff = bscale*( scale*salpha )
+                       else
+                          bcoeff = scale*bcoeff
+                       end if
+                    end if
+                    acoefa = abs( acoeff )
+                    bcoefa = abs1( bcoeff )
+                    xmax = one
+                    do jr = 1, n
+                       work( jr ) = czero
+                    end do
+                    work( je ) = cone
+                    dmin = max( ulp*acoefa*anorm, ulp*bcoefa*bnorm, safmin )
+                    ! triangular solve of  (a a - b b) x = 0  (columnwise)
+                    ! work(1:j-1) contains sums w,
+                    ! work(j+1:je) contains x
+                    do jr = 1, je - 1
+                       work( jr ) = acoeff*s( jr, je ) - bcoeff*p( jr, je )
+                    end do
+                    work( je ) = cone
+                    loop_210: do j = je - 1, 1, -1
+                       ! form x(j) := - w(j) / d
+                       ! with scaling and perturbation of the denominator
+                       d = acoeff*s( j, j ) - bcoeff*p( j, j )
+                       if( abs1( d )<=dmin )d = cmplx( dmin,KIND=dp)
+                       if( abs1( d )<one ) then
+                          if( abs1( work( j ) )>=bignum*abs1( d ) ) then
+                             temp = one / abs1( work( j ) )
+                             do jr = 1, je
+                                work( jr ) = temp*work( jr )
+                             end do
+                          end if
+                       end if
+                       work( j ) = stdlib_zladiv( -work( j ), d )
+                       if( j>1 ) then
+                          ! w = w + x(j)*(a s(*,j) - b p(*,j) ) with scaling
+                          if( abs1( work( j ) )>one ) then
+                             temp = one / abs1( work( j ) )
+                             if( acoefa*rwork( j )+bcoefa*rwork( n+j )>=bignum*temp ) then
+                                do jr = 1, je
+                                   work( jr ) = temp*work( jr )
+                                end do
+                             end if
+                          end if
+                          ca = acoeff*work( j )
+                          cb = bcoeff*work( j )
+                          do jr = 1, j - 1
+                             work( jr ) = work( jr ) + ca*s( jr, j ) -cb*p( jr, j )
+                          end do
+                       end if
+                    end do loop_210
+                    ! back transform eigenvector if howmny='b'.
+                    if( ilback ) then
+                       call stdlib_zgemv( 'N', n, je, cone, vr, ldvr, work, 1,czero, work( n+1 ), &
+                                 1 )
+                       isrc = 2
+                       iend = n
+                    else
+                       isrc = 1
+                       iend = je
+                    end if
+                    ! copy and scale eigenvector into column of vr
+                    xmax = zero
+                    do jr = 1, iend
+                       xmax = max( xmax, abs1( work( ( isrc-1 )*n+jr ) ) )
+                    end do
+                    if( xmax>safmin ) then
+                       temp = one / xmax
+                       do jr = 1, iend
+                          vr( jr, ieig ) = temp*work( ( isrc-1 )*n+jr )
+                       end do
+                    else
+                       iend = 0
+                    end if
+                    do jr = iend + 1, n
+                       vr( jr, ieig ) = czero
+                    end do
+                 end if
+              end do loop_250
+           end if
+           return
+     end subroutine stdlib_ztgevc
+
+     !> ZTGEX2: swaps adjacent diagonal 1 by 1 blocks (A11,B11) and (A22,B22)
+     !> in an upper triangular matrix pair (A, B) by an unitary equivalence
+     !> transformation.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_ztgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, j1, info )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: j1, lda, ldb, ldq, ldz, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: twenty = 2.0e+1_dp
+           integer(ilp), parameter :: ldst = 2
+           logical(lk), parameter :: wands = .true.
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: strong, weak
+           integer(ilp) :: i, m
+           real(dp) :: cq, cz, eps, sa, sb, scale, smlnum, sum, thresha, threshb
+           complex(dp) :: cdum, f, g, sq, sz
+           ! Local Arrays 
+           complex(dp) :: s(ldst,ldst), t(ldst,ldst), work(8)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,sqrt
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n<=1 )return
+           m = ldst
+           weak = .false.
+           strong = .false.
+           ! make a local copy of selected block in (a, b)
+           call stdlib_zlacpy( 'FULL', m, m, a( j1, j1 ), lda, s, ldst )
+           call stdlib_zlacpy( 'FULL', m, m, b( j1, j1 ), ldb, t, ldst )
+           ! compute the threshold for testing the acceptance of swapping.
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           scale = real( czero,KIND=dp)
+           sum = real( cone,KIND=dp)
+           call stdlib_zlacpy( 'FULL', m, m, s, ldst, work, m )
+           call stdlib_zlacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+           call stdlib_zlassq( m*m, work, 1, scale, sum )
+           sa = scale*sqrt( sum )
+           scale = real( czero,KIND=dp)
+           sum = real( cone,KIND=dp)
+           call stdlib_zlassq( m*m, work(m*m+1), 1, scale, sum )
+           sb = scale*sqrt( sum )
+           ! thres has been changed from
+              ! thresh = max( ten*eps*sa, smlnum )
+           ! to
+              ! thresh = max( twenty*eps*sa, smlnum )
+           ! on 04/01/10.
+           ! "bug" reported by ondra kamenik, confirmed by julie langou, fixed by
+           ! jim demmel and guillaume revy. see forum post 1783.
+           thresha = max( twenty*eps*sa, smlnum )
+           threshb = max( twenty*eps*sb, smlnum )
+           ! compute unitary ql and rq that swap 1-by-1 and 1-by-1 blocks
+           ! using givens rotations and perform the swap tentatively.
+           f = s( 2, 2 )*t( 1, 1 ) - t( 2, 2 )*s( 1, 1 )
+           g = s( 2, 2 )*t( 1, 2 ) - t( 2, 2 )*s( 1, 2 )
+           sa = abs( s( 2, 2 ) ) * abs( t( 1, 1 ) )
+           sb = abs( s( 1, 1 ) ) * abs( t( 2, 2 ) )
+           call stdlib_zlartg( g, f, cz, sz, cdum )
+           sz = -sz
+           call stdlib_zrot( 2, s( 1, 1 ), 1, s( 1, 2 ), 1, cz, conjg( sz ) )
+           call stdlib_zrot( 2, t( 1, 1 ), 1, t( 1, 2 ), 1, cz, conjg( sz ) )
+           if( sa>=sb ) then
+              call stdlib_zlartg( s( 1, 1 ), s( 2, 1 ), cq, sq, cdum )
+           else
+              call stdlib_zlartg( t( 1, 1 ), t( 2, 1 ), cq, sq, cdum )
+           end if
+           call stdlib_zrot( 2, s( 1, 1 ), ldst, s( 2, 1 ), ldst, cq, sq )
+           call stdlib_zrot( 2, t( 1, 1 ), ldst, t( 2, 1 ), ldst, cq, sq )
+           ! weak stability test: |s21| <= o(eps f-norm((a)))
+                                ! and  |t21| <= o(eps f-norm((b)))
+           weak = abs( s( 2, 1 ) )<=thresha .and.abs( t( 2, 1 ) )<=threshb
+           if( .not.weak )go to 20
+           if( wands ) then
+              ! strong stability test:
+                 ! f-norm((a-ql**h*s*qr)) <= o(eps*f-norm((a)))
+                 ! and
+                 ! f-norm((b-ql**h*t*qr)) <= o(eps*f-norm((b)))
+              call stdlib_zlacpy( 'FULL', m, m, s, ldst, work, m )
+              call stdlib_zlacpy( 'FULL', m, m, t, ldst, work( m*m+1 ), m )
+              call stdlib_zrot( 2, work, 1, work( 3 ), 1, cz, -conjg( sz ) )
+              call stdlib_zrot( 2, work( 5 ), 1, work( 7 ), 1, cz, -conjg( sz ) )
+              call stdlib_zrot( 2, work, 2, work( 2 ), 2, cq, -sq )
+              call stdlib_zrot( 2, work( 5 ), 2, work( 6 ), 2, cq, -sq )
+              do i = 1, 2
+                 work( i ) = work( i ) - a( j1+i-1, j1 )
+                 work( i+2 ) = work( i+2 ) - a( j1+i-1, j1+1 )
+                 work( i+4 ) = work( i+4 ) - b( j1+i-1, j1 )
+                 work( i+6 ) = work( i+6 ) - b( j1+i-1, j1+1 )
+              end do
+              scale = real( czero,KIND=dp)
+              sum = real( cone,KIND=dp)
+              call stdlib_zlassq( m*m, work, 1, scale, sum )
+              sa = scale*sqrt( sum )
+              scale = real( czero,KIND=dp)
+              sum = real( cone,KIND=dp)
+              call stdlib_zlassq( m*m, work(m*m+1), 1, scale, sum )
+              sb = scale*sqrt( sum )
+              strong = sa<=thresha .and. sb<=threshb
+              if( .not.strong )go to 20
+           end if
+           ! if the swap is accepted ("weakly" and "strongly"), apply the
+           ! equivalence transformations to the original matrix pair (a,b)
+           call stdlib_zrot( j1+1, a( 1, j1 ), 1, a( 1, j1+1 ), 1, cz,conjg( sz ) )
+           call stdlib_zrot( j1+1, b( 1, j1 ), 1, b( 1, j1+1 ), 1, cz,conjg( sz ) )
+           call stdlib_zrot( n-j1+1, a( j1, j1 ), lda, a( j1+1, j1 ), lda, cq, sq )
+           call stdlib_zrot( n-j1+1, b( j1, j1 ), ldb, b( j1+1, j1 ), ldb, cq, sq )
+           ! set  n1 by n2 (2,1) blocks to 0
+           a( j1+1, j1 ) = czero
+           b( j1+1, j1 ) = czero
+           ! accumulate transformations into q and z if requested.
+           if( wantz )call stdlib_zrot( n, z( 1, j1 ), 1, z( 1, j1+1 ), 1, cz,conjg( sz ) )
+                     
+           if( wantq )call stdlib_zrot( n, q( 1, j1 ), 1, q( 1, j1+1 ), 1, cq,conjg( sq ) )
+                     
+           ! exit with info = 0 if swap was successfully performed.
+           return
+           ! exit with info = 1 if swap was rejected.
+           20 continue
+           info = 1
+           return
+     end subroutine stdlib_ztgex2
+
+     !> ZTGEXC: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A,B), using an unitary equivalence transformation
+     !> (A, B) := Q * (A, B) * Z**H, so that the diagonal block of (A, B) with
+     !> row index IFST is moved to row ILST.
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+     !> Optionally, the matrices Q and Z of generalized Schur vectors are
+     !> updated.
+     !> Q(in) * A(in) * Z(in)**H = Q(out) * A(out) * Z(out)**H
+     !> Q(in) * B(in) * Z(in)**H = Q(out) * B(out) * Z(out)**H
+
+     pure subroutine stdlib_ztgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, ifst, ilst, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ifst, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(inout) :: ilst
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: here
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! decode and test input arguments.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldq<1 .or. wantq .and. ( ldq<max( 1, n ) ) ) then
+              info = -9
+           else if( ldz<1 .or. wantz .and. ( ldz<max( 1, n ) ) ) then
+              info = -11
+           else if( ifst<1 .or. ifst>n ) then
+              info = -12
+           else if( ilst<1 .or. ilst>n ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGEXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 )return
+           if( ifst==ilst )return
+           if( ifst<ilst ) then
+              here = ifst
+              10 continue
+              ! swap with next one below
+              call stdlib_ztgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here + 1
+              if( here<ilst )go to 10
+              here = here - 1
+           else
+              here = ifst - 1
+              20 continue
+              ! swap with next one above
+              call stdlib_ztgex2( wantq, wantz, n, a, lda, b, ldb, q, ldq, z, ldz,here, info )
+                        
+              if( info/=0 ) then
+                 ilst = here
+                 return
+              end if
+              here = here - 1
+              if( here>=ilst )go to 20
+              here = here + 1
+           end if
+           ilst = here
+           return
+     end subroutine stdlib_ztgexc
+
+     !> ZTPLQT2: computes a LQ a factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_ztplqt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPLQT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, m
+              ! generate elementary reflector h(i) to annihilate b(i,:)
+              p = n-l+min( l, i )
+              call stdlib_zlarfg( p+1, a( i, i ), b( i, 1 ), ldb, t( 1, i ) )
+              t(1,i)=conjg(t(1,i))
+              if( i<m ) then
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+                 ! w(m-i:1) := c(i+1:m,i:n) * c(i,i:n) [use w = t(m,:)]
+                 do j = 1, m-i
+                    t( m, j ) = (a( i+j, i ))
+                 end do
+                 call stdlib_zgemv( 'N', m-i, p, cone, b( i+1, 1 ), ldb,b( i, 1 ), ldb, cone, t( &
+                           m, 1 ), ldt )
+                 ! c(i+1:m,i:n) = c(i+1:m,i:n) + alpha * c(i,i:n)*w(m-1:1)^h
+                 alpha = -(t( 1, i ))
+                 do j = 1, m-i
+                    a( i+j, i ) = a( i+j, i ) + alpha*(t( m, j ))
+                 end do
+                 call stdlib_zgerc( m-i, p, (alpha),  t( m, 1 ), ldt,b( i, 1 ), ldb, b( i+1, 1 ), &
+                           ldb )
+                 do j = 1, p
+                    b( i, j ) = conjg(b(i,j))
+                 end do
+              end if
+           end do
+           do i = 2, m
+              ! t(i,1:i-1) := c(i:i-1,1:n)**h * (alpha * c(i,i:n))
+              alpha = -(t( 1, i ))
+              do j = 1, i-1
+                 t( i, j ) = czero
+              end do
+              p = min( i-1, l )
+              np = min( n-l+1, n )
+              mp = min( p+1, m )
+              do j = 1, n-l+p
+                b(i,j)=conjg(b(i,j))
+              end do
+              ! triangular part of b2
+              do j = 1, p
+                 t( i, j ) = (alpha*b( i, n-l+j ))
+              end do
+              call stdlib_ztrmv( 'L', 'N', 'N', p, b( 1, np ), ldb,t( i, 1 ), ldt )
+              ! rectangular part of b2
+              call stdlib_zgemv( 'N', i-1-p, l,  alpha, b( mp, np ), ldb,b( i, np ), ldb, czero, &
+                        t( i,mp ), ldt )
+              ! b1
+              call stdlib_zgemv( 'N', i-1, n-l, alpha, b, ldb, b( i, 1 ), ldb,cone, t( i, 1 ), &
+                        ldt )
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(i,1:i-1)
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              call stdlib_ztrmv( 'L', 'C', 'N', i-1, t, ldt, t( i, 1 ), ldt )
+              do j = 1, i-1
+                 t(i,j)=conjg(t(i,j))
+              end do
+              do j = 1, n-l+p
+                 b(i,j)=conjg(b(i,j))
+              end do
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( 1, i )
+              t( 1, i ) = czero
+           end do
+           do i=1,m
+              do j= i+1,m
+                 t(i,j)=(t(j,i))
+                 t(j,i)=czero
+              end do
+           end do
+     end subroutine stdlib_ztplqt2
+
+     !> ZTPQRT2: computes a QR factorization of a complex "triangular-pentagonal"
+     !> matrix C, which is composed of a triangular block A and pentagonal block B,
+     !> using the compact WY representation for Q.
+
+     pure subroutine stdlib_ztpqrt2( m, n, l, a, lda, b, ldb, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, p, mp, np
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. l>min(m,n) ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPQRT2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. m==0 ) return
+           do i = 1, n
+              ! generate elementary reflector h(i) to annihilate b(:,i)
+              p = m-l+min( l, i )
+              call stdlib_zlarfg( p+1, a( i, i ), b( 1, i ), 1, t( i, 1 ) )
+              if( i<n ) then
+                 ! w(1:n-i) := c(i:m,i+1:n)**h * c(i:m,i) [use w = t(:,n)]
+                 do j = 1, n-i
+                    t( j, n ) = conjg(a( i, i+j ))
+                 end do
+                 call stdlib_zgemv( 'C', p, n-i, cone, b( 1, i+1 ), ldb,b( 1, i ), 1, cone, t( 1, &
+                           n ), 1 )
+                 ! c(i:m,i+1:n) = c(i:m,i+1:n) + alpha*c(i:m,i)*w(1:n-1)**h
+                 alpha = -conjg(t( i, 1 ))
+                 do j = 1, n-i
+                    a( i, i+j ) = a( i, i+j ) + alpha*conjg(t( j, n ))
+                 end do
+                 call stdlib_zgerc( p, n-i, alpha, b( 1, i ), 1,t( 1, n ), 1, b( 1, i+1 ), ldb )
+                           
+              end if
+           end do
+           do i = 2, n
+              ! t(1:i-1,i) := c(i:m,1:i-1)**h * (alpha * c(i:m,i))
+              alpha = -t( i, 1 )
+              do j = 1, i-1
+                 t( j, i ) = czero
+              end do
+              p = min( i-1, l )
+              mp = min( m-l+1, m )
+              np = min( p+1, n )
+              ! triangular part of b2
+              do j = 1, p
+                 t( j, i ) = alpha*b( m-l+j, i )
+              end do
+              call stdlib_ztrmv( 'U', 'C', 'N', p, b( mp, 1 ), ldb,t( 1, i ), 1 )
+              ! rectangular part of b2
+              call stdlib_zgemv( 'C', l, i-1-p, alpha, b( mp, np ), ldb,b( mp, i ), 1, czero, t( &
+                        np, i ), 1 )
+              ! b1
+              call stdlib_zgemv( 'C', m-l, i-1, alpha, b, ldb, b( 1, i ), 1,cone, t( 1, i ), 1 )
+                        
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_ztrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+              ! t(i,i) = tau(i)
+              t( i, i ) = t( i, 1 )
+              t( i, 1 ) = czero
+           end do
+     end subroutine stdlib_ztpqrt2
+
+     !> ZTPRFB: applies a complex "triangular-pentagonal" block reflector H or its
+     !> conjugate transpose H**H to a complex matrix C, which is composed of two
+     !> blocks A and B, either from the left or right.
+
+     pure subroutine stdlib_ztprfb( side, trans, direct, storev, m, n, k, l,v, ldv, t, ldt, a, &
+               lda, b, ldb, work, ldwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: direct, side, storev, trans
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldt, ldv, ldwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(in) :: t(ldt,*), v(ldv,*)
+           complex(dp), intent(out) :: work(ldwork,*)
+        ! ==========================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, mp, np, kp
+           logical(lk) :: left, forward, column, right, backward, row
+           ! Intrinsic Functions 
+           intrinsic :: conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 .or. k<=0 .or. l<0 ) return
+           if( stdlib_lsame( storev, 'C' ) ) then
+              column = .true.
+              row = .false.
+           else if ( stdlib_lsame( storev, 'R' ) ) then
+              column = .false.
+              row = .true.
+           else
+              column = .false.
+              row = .false.
+           end if
+           if( stdlib_lsame( side, 'L' ) ) then
+              left = .true.
+              right = .false.
+           else if( stdlib_lsame( side, 'R' ) ) then
+              left = .false.
+              right = .true.
+           else
+              left = .false.
+              right = .false.
+           end if
+           if( stdlib_lsame( direct, 'F' ) ) then
+              forward = .true.
+              backward = .false.
+           else if( stdlib_lsame( direct, 'B' ) ) then
+              forward = .false.
+              backward = .true.
+           else
+              forward = .false.
+              backward = .false.
+           end if
+       ! ---------------------------------------------------------------------------
+           if( column .and. forward .and. left  ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (m-by-k)
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'U', 'C', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_zgemm( 'C', 'N', l, n, m-l, cone, v, ldv, b, ldb,cone, work, ldwork )
+                        
+              call stdlib_zgemm( 'C', 'N', k-l, n, m, cone, v( 1, kp ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_zgemm( 'N', 'N', l, n, k-l, -cone, v( mp, kp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ),  ldb )
+              call stdlib_ztrmm( 'L', 'U', 'N', 'N', l, n, cone, v( mp, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i ]    (k-by-k)
+                        ! [ v ]    (n-by-k)
+              ! form  c h or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              call stdlib_zgemm( 'N', 'N', m, l, n-l, cone, b, ldb,v, ldv, cone, work, ldwork )
+                        
+              call stdlib_zgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v( 1, kp ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_zgemm( 'N', 'C', m, l, k-l, -cone, work( 1, kp ), ldwork,v( np, kp ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_ztrmm( 'R', 'U', 'C', 'N', m, l, cone, v( np, 1 ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (m-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a -   t (a + v**h b)  or  a = a -   t**h (a + v**h b)
+              ! b = b - v t (a + v**h b)  or  b = b - v t**h (a + v**h b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_zgemm( 'C', 'N', l, n, m-l, cone, v( mp, kp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_zgemm( 'C', 'N', k-l, n, m, cone, v, ldv,b, ldb, czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'L', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'N', m-l, n, k, -cone, v( mp, 1 ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_zgemm( 'N', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b,  ldb )
+                        
+              call stdlib_ztrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, kp ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( column .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v ]    (n-by-k)
+                        ! [ i ]    (k-by-k)
+              ! form  c h  or  c h**h  where  c = [ b a ] (b is m-by-n, a is m-by-k)
+              ! h = i - w t w**h          or  h**h = i - w t**h w**h
+              ! a = a - (a + b v) t      or  a = a - (a + b v) t**h
+              ! b = b - (a + b v) t v**h  or  b = b - (a + b v) t**h v**h
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_zgemm( 'N', 'N', m, l, n-l, cone, b( 1, np ), ldb,v( np, kp ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_zgemm( 'N', 'N', m, k-l, n, cone, b, ldb,v, ldv, czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'C', m, n-l, k, -cone, work, ldwork,v( np, 1 ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_zgemm( 'N', 'C', m, l, k-l, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_ztrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, kp ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ a ]  (k-by-n)
+                                                ! [ b ]  (m-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( m-l+1, m )
+              kp = min( l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( i, j ) = b( m-l+i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'L', 'N', 'N', l, n, cone, v( 1, mp ), ldv,work, ldb )
+                        
+              call stdlib_zgemm( 'N', 'N', l, n, m-l, cone, v, ldv,b, ldb,cone, work, ldwork )
+                        
+              call stdlib_zgemm( 'N', 'N', k-l, n, m, cone, v( kp, 1 ), ldv,b, ldb, czero, work( &
+                        kp, 1 ), ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'U', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'C', 'N', m-l, n, k, -cone, v, ldv, work, ldwork,cone, b, ldb )
+                        
+              call stdlib_zgemm( 'C', 'N', l, n, k-l, -cone, v( kp, mp ), ldv,work( kp, 1 ), &
+                        ldwork, cone, b( mp, 1 ), ldb )
+              call stdlib_ztrmm( 'L', 'L', 'C', 'N', l, n, cone, v( 1, mp ), ldv,work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, l
+                    b( m-l+i, j ) = b( m-l+i, j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. forward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ i v ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ a b ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( n-l+1, n )
+              kp = min( l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, j ) = b( i, n-l+j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'L', 'C', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              call stdlib_zgemm( 'N', 'C', m, l, n-l, cone, b, ldb, v, ldv,cone, work, ldwork )
+                        
+              call stdlib_zgemm( 'N', 'C', m, k-l, n, cone, b, ldb,v( kp, 1 ), ldv, czero, work( &
+                        1, kp ), ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_zgemm( 'N', 'N', m, l, k-l, -cone, work( 1, kp ), ldwork,v( kp, np ), &
+                        ldv, cone, b( 1, np ), ldb )
+              call stdlib_ztrmm( 'R', 'L', 'N', 'N', m, l, cone, v( 1, np ), ldv,work, ldwork )
+                        
+              do j = 1, l
+                 do i = 1, m
+                    b( i, n-l+j ) = b( i, n-l+j ) - work( i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. left ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-m )
+              ! form  h c  or  h**h c  where  c = [ b ]  (m-by-n)
+                                                ! [ a ]  (k-by-n)
+              ! h = i - w**h t w          or  h**h = i - w**h t**h w
+              ! a = a -     t (a + v b)  or  a = a -     t**h (a + v b)
+              ! b = b - v**h t (a + v b)  or  b = b - v**h t**h (a + v b)
+       ! ---------------------------------------------------------------------------
+              mp = min( l+1, m )
+              kp = min( k-l+1, k )
+              do j = 1, n
+                 do i = 1, l
+                    work( k-l+i, j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'U', 'N', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              call stdlib_zgemm( 'N', 'N', l, n, m-l, cone, v( kp, mp ), ldv,b( mp, 1 ), ldb, &
+                        cone, work( kp, 1 ), ldwork )
+              call stdlib_zgemm( 'N', 'N', k-l, n, m, cone, v, ldv, b, ldb,czero, work, ldwork )
+                        
+              do j = 1, n
+                 do i = 1, k
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'L ', trans, 'N', k, n, cone, t, ldt,work, ldwork )
+              do j = 1, n
+                 do i = 1, k
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'C', 'N', m-l, n, k, -cone, v( 1, mp ), ldv,work, ldwork, cone, &
+                        b( mp, 1 ), ldb )
+              call stdlib_zgemm( 'C', 'N', l, n, k-l, -cone, v, ldv,work, ldwork, cone, b, ldb )
+                        
+              call stdlib_ztrmm( 'L', 'U', 'C', 'N', l, n, cone, v( kp, 1 ), ldv,work( kp, 1 ), &
+                        ldwork )
+              do j = 1, n
+                 do i = 1, l
+                    b( i, j ) = b( i, j ) - work( k-l+i, j )
+                 end do
+              end do
+       ! ---------------------------------------------------------------------------
+           else if( row .and. backward .and. right ) then
+       ! ---------------------------------------------------------------------------
+              ! let  w =  [ v i ] ( i is k-by-k, v is k-by-n )
+              ! form  c h  or  c h**h  where  c = [ b a ] (a is m-by-k, b is m-by-n)
+              ! h = i - w**h t w            or  h**h = i - w**h t**h w
+              ! a = a - (a + b v**h) t      or  a = a - (a + b v**h) t**h
+              ! b = b - (a + b v**h) t v    or  b = b - (a + b v**h) t**h v
+       ! ---------------------------------------------------------------------------
+              np = min( l+1, n )
+              kp = min( k-l+1, k )
+              do j = 1, l
+                 do i = 1, m
+                    work( i, k-l+j ) = b( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', 'C', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              call stdlib_zgemm( 'N', 'C', m, l, n-l, cone, b( 1, np ), ldb,v( kp, np ), ldv, &
+                        cone, work( 1, kp ), ldwork )
+              call stdlib_zgemm( 'N', 'C', m, k-l, n, cone, b, ldb, v, ldv,czero, work, ldwork )
+                        
+              do j = 1, k
+                 do i = 1, m
+                    work( i, j ) = work( i, j ) + a( i, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'L', trans, 'N', m, k, cone, t, ldt,work, ldwork )
+              do j = 1, k
+                 do i = 1, m
+                    a( i, j ) = a( i, j ) - work( i, j )
+                 end do
+              end do
+              call stdlib_zgemm( 'N', 'N', m, n-l, k, -cone, work, ldwork,v( 1, np ), ldv, cone, &
+                        b( 1, np ), ldb )
+              call stdlib_zgemm( 'N', 'N', m, l, k-l , -cone, work, ldwork,v, ldv, cone, b, ldb )
+                        
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', m, l, cone, v( kp, 1 ), ldv,work( 1, kp ), &
+                        ldwork )
+              do j = 1, l
+                 do i = 1, m
+                    b( i, j ) = b( i, j ) - work( i, k-l+j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ztprfb
+
+     !> ZTPRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular packed
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTPTRS or some other
+     !> means before entering this routine.  ZTPRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ztprfs( uplo, trans, diag, n, nrhs, ap, b, ldb, x, ldx,ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: ap(*), b(ldb,*), x(ldx,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, kc, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ztpmv( uplo, trans, diag, n, ap, work, 1 )
+              call stdlib_zaxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-1 ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) +cabs1( ap( kc+i-k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( ap( kc+i-1 ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + k
+                       end do
+                    end if
+                 else
+                    kc = 1
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( ap( kc+i-k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                          kc = kc + n - k + 1
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ztpsv( uplo, transt, diag, n, ap, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ztpsv( uplo, transn, diag, n, ap, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ztprfs
+
+     !> ZTPTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A stored in packed format.
+
+     pure subroutine stdlib_ztptri( uplo, diag, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc, jclast, jj
+           complex(dp) :: ajj
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTRI', -info )
+              return
+           end if
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              if( upper ) then
+                 jj = 0
+                 do info = 1, n
+                    jj = jj + info
+                    if( ap( jj )==czero )return
+                 end do
+              else
+                 jj = 1
+                 do info = 1, n
+                    if( ap( jj )==czero )return
+                    jj = jj + n - info + 1
+                 end do
+              end if
+              info = 0
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              jc = 1
+              do j = 1, n
+                 if( nounit ) then
+                    ap( jc+j-1 ) = cone / ap( jc+j-1 )
+                    ajj = -ap( jc+j-1 )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_ztpmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, ap,ap( jc ), 1 )
+                 call stdlib_zscal( j-1, ajj, ap( jc ), 1 )
+                 jc = jc + j
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              jc = n*( n+1 ) / 2
+              do j = n, 1, -1
+                 if( nounit ) then
+                    ap( jc ) = cone / ap( jc )
+                    ajj = -ap( jc )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_ztpmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,ap( jclast ), ap( jc+1 )&
+                              , 1 )
+                    call stdlib_zscal( n-j, ajj, ap( jc+1 ), 1 )
+                 end if
+                 jclast = jc
+                 jc = jc - n + j - 2
+              end do
+           end if
+           return
+     end subroutine stdlib_ztptri
+
+     !> ZTPTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N stored in packed format,
+     !> and B is an N-by-NRHS matrix.  A check is made to verify that A is
+     !> nonsingular.
+
+     pure subroutine stdlib_ztptrs( uplo, trans, diag, n, nrhs, ap, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jc
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              if( upper ) then
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc+info-1 )==czero )return
+                    jc = jc + info
+                 end do
+              else
+                 jc = 1
+                 do info = 1, n
+                    if( ap( jc )==czero )return
+                    jc = jc + n - info + 1
+                 end do
+              end if
+           end if
+           info = 0
+           ! solve  a * x = b,  a**t * x = b,  or  a**h * x = b.
+           do j = 1, nrhs
+              call stdlib_ztpsv( uplo, trans, diag, n, ap, b( 1, j ), 1 )
+           end do
+           return
+     end subroutine stdlib_ztptrs
+
+     !> ZTPTTF: copies a triangular matrix A from standard packed format (TP)
+     !> to rectangular full packed format (TF).
+
+     pure subroutine stdlib_ztpttf( transr, uplo, n, ap, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(0:*)
+           complex(dp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k, nt
+           integer(ilp) :: i, j, ij
+           integer(ilp) :: ijp, jp, lda, js
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( normaltransr ) then
+                 arf( 0 ) = ap( 0 )
+              else
+                 arf( 0 ) = conjg( ap( 0 ) )
+              end if
+              return
+           end if
+           ! size of array arf(0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           ! set lda of arf^c; arf^c is (0:(n+1)/2-1,0:n-noe)
+           ! where noe = 0 if n is even, noe = 1 if n is odd
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              lda = n + 1
+           else
+              nisodd = .true.
+              lda = n
+           end if
+           ! arf^c has lda rows and n+1-noe cols
+           if( .not.normaltransr )lda = ( n+1 ) / 2
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda = n
+                    ijp = 0
+                    jp = 0
+                    do j = 0, n2
+                       do i = j, n - 1
+                          ij = i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, n2 - 1
+                       do j = 1 + i, n2
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    ijp = 0
+                    do j = 0, n1 - 1
+                       ij = n2 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = n1, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ijp = 0
+                    do i = 0, n2
+                       do ij = i*( lda+1 ), n*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 1
+                    do j = 0, n2 - 1
+                       do ij = js, js + n2 - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    ijp = 0
+                    js = n2*lda
+                    do j = 0, n1 - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, n1
+                       do ij = i, i + ( n1+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    ijp = 0
+                    jp = 0
+                    do j = 0, k - 1
+                       do i = j, n - 1
+                          ij = 1 + i + jp
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       jp = jp + lda
+                    end do
+                    do i = 0, k - 1
+                       do j = i, k - 1
+                          ij = i + j*lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    ijp = 0
+                    do j = 0, k - 1
+                       ij = k + 1 + j
+                       do i = 0, j
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                          ij = ij + lda
+                       end do
+                    end do
+                    js = 0
+                    do j = k, n - 1
+                       ij = js
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    ijp = 0
+                    do i = 0, k - 1
+                       do ij = i + ( i+1 )*lda, ( n+1 )*lda - 1, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                    js = 0
+                    do j = 0, k - 1
+                       do ij = js, js + k - j - 1
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda + 1
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    ijp = 0
+                    js = ( k+1 )*lda
+                    do j = 0, k - 1
+                       do ij = js, js + j
+                          arf( ij ) = ap( ijp )
+                          ijp = ijp + 1
+                       end do
+                       js = js + lda
+                    end do
+                    do i = 0, k - 1
+                       do ij = i, i + ( k+i )*lda, lda
+                          arf( ij ) = conjg( ap( ijp ) )
+                          ijp = ijp + 1
+                       end do
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztpttf
+
+     !> ZTPTTR: copies a triangular matrix A from standard packed format (TP)
+     !> to standard full format (TR).
+
+     pure subroutine stdlib_ztpttr( uplo, n, ap, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(dp), intent(out) :: a(lda,*)
+           complex(dp), intent(in) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPTTR', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    a( i, j ) = ap( k )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ztpttr
+
+     !> ZTREVC: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by ZHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix.  If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+
+     pure subroutine stdlib_ztrevc( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           complex(dp), parameter :: cmzero = (0.0e+0_dp,0.0e+0_dp)
+           complex(dp), parameter :: cmone = (1.0e+0_dp,0.0e+0_dp)
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki
+           real(dp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           allv = stdlib_lsame( howmny, 'A' )
+           over = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREVC', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set the constants to control overflow.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_dlabad( unfl, ovfl )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i+n ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_dzasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! compute right eigenvectors.
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( 1 ) = cmone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k ) = -t( k, ki )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(1:ki-1,1:ki-1) - t(ki,ki))*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_zlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 ), scale, rwork,info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    call stdlib_zcopy( ki, work( 1 ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_izamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_zdscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki>1 )call stdlib_zgemv( 'N', n, ki-1, cmone, vr, ldvr, work( 1 ),1, &
+                              cmplx( scale,KIND=dp), vr( 1, ki ), 1 )
+                    ii = stdlib_izamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_zdscal( n, remax, vr( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! compute left eigenvectors.
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 work( n ) = cmone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve the triangular system:
+                    ! (t(ki+1:n,ki+1:n) - t(ki,ki))**h * x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_zlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 ), scale, rwork, info )
+                    work( ki ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    call stdlib_zcopy( n-ki+1, work( ki ), 1, vl( ki, is ), 1 )
+                    ii = stdlib_izamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_zdscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = cmzero
+                    end do
+                 else
+                    if( ki<n )call stdlib_zgemv( 'N', n, n-ki, cmone, vl( 1, ki+1 ), ldvl,work( &
+                              ki+1 ), 1, cmplx( scale,KIND=dp),vl( 1, ki ), 1 )
+                    ii = stdlib_izamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_zdscal( n, remax, vl( 1, ki ), 1 )
+                 end if
+                 ! set back the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k+n )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_ztrevc
+
+     !> ZTREVC3: computes some or all of the right and/or left eigenvectors of
+     !> a complex upper triangular matrix T.
+     !> Matrices of this type are produced by the Schur factorization of
+     !> a complex general matrix:  A = Q*T*Q**H, as computed by ZHSEQR.
+     !> The right eigenvector x and the left eigenvector y of T corresponding
+     !> to an eigenvalue w are defined by:
+     !> T*x = w*x,     (y**H)*T = w*(y**H)
+     !> where y**H denotes the conjugate transpose of the vector y.
+     !> The eigenvalues are not input to this routine, but are read directly
+     !> from the diagonal of T.
+     !> This routine returns the matrices X and/or Y of right and left
+     !> eigenvectors of T, or the products Q*X and/or Q*Y, where Q is an
+     !> input matrix. If Q is the unitary factor that reduces a matrix A to
+     !> Schur form T, then Q*X and Q*Y are the matrices of right and left
+     !> eigenvectors of A.
+     !> This uses a Level 3 BLAS version of the back transformation.
+
+     pure subroutine stdlib_ztrevc3( side, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, mm, m, &
+               work, lwork, rwork, lrwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, lwork, lrwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmin = 8
+           integer(ilp), parameter :: nbmax = 128
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: allv, bothv, leftv, lquery, over, rightv, somev
+           integer(ilp) :: i, ii, is, j, k, ki, iv, maxwrk, nb
+           real(dp) :: ovfl, remax, scale, smin, smlnum, ulp, unfl
+           complex(dp) :: cdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           bothv  = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv  = stdlib_lsame( side, 'L' ) .or. bothv
+           allv  = stdlib_lsame( howmny, 'A' )
+           over  = stdlib_lsame( howmny, 'B' )
+           somev = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           if( somev ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZTREVC', side // howmny, n, -1, -1, -1 )
+           maxwrk = n + 2*n*nb
+           work(1) = maxwrk
+           rwork(1) = n
+           lquery = ( lwork==-1 .or. lrwork==-1 )
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.allv .and. .not.over .and. .not.somev ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -11
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -14
+           else if ( lrwork<max( 1, n ) .and. .not.lquery ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREVC3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! use blocked version of back-transformation if sufficient workspace.
+           ! zero-out the workspace to avoid potential nan propagation.
+           if( over .and. lwork >= n + 2*n*nbmin ) then
+              nb = (lwork - n) / (2*n)
+              nb = min( nb, nbmax )
+              call stdlib_zlaset( 'F', n, 1+2*nb, czero, czero, work, n )
+           else
+              nb = 1
+           end if
+           ! set the constants to control overflow.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ovfl = one / unfl
+           call stdlib_dlabad( unfl, ovfl )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ! store the diagonal elements of t in working array work.
+           do i = 1, n
+              work( i ) = t( i, i )
+           end do
+           ! compute 1-norm of each column of strictly upper triangular
+           ! part of t to control overflow in triangular solver.
+           rwork( 1 ) = zero
+           do j = 2, n
+              rwork( j ) = stdlib_dzasum( j-1, t( 1, j ), 1 )
+           end do
+           if( rightv ) then
+              ! ============================================================
+              ! compute right eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=nb=1;
+              ! blocked     version starts with iv=nb, goes down to 1.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = nb
+              is = m
+              loop_80: do ki = n, 1, -1
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_80
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex right eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = 1, ki - 1
+                    work( k + iv*n ) = -t( k, ki )
+                 end do
+                 ! solve upper triangular system:
+                 ! [ t(1:ki-1,1:ki-1) - t(ki,ki) ]*x = scale*work.
+                 do k = 1, ki - 1
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki>1 ) then
+                    call stdlib_zlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', 'Y',ki-1, t, ldt, &
+                              work( 1 + iv*n ), scale,rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vr and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vr and normalize.
+                    call stdlib_zcopy( ki, work( 1 + iv*n ), 1, vr( 1, is ), 1 )
+                    ii = stdlib_izamax( ki, vr( 1, is ), 1 )
+                    remax = one / cabs1( vr( ii, is ) )
+                    call stdlib_zdscal( ki, remax, vr( 1, is ), 1 )
+                    do k = ki + 1, n
+                       vr( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki>1 )call stdlib_zgemv( 'N', n, ki-1, cone, vr, ldvr,work( 1 + iv*n ), 1,&
+                               cmplx( scale,KIND=dp),vr( 1, ki ), 1 )
+                    ii = stdlib_izamax( n, vr( 1, ki ), 1 )
+                    remax = one / cabs1( vr( ii, ki ) )
+                    call stdlib_zdscal( n, remax, vr( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out below vector
+                    do k = ki + 1, n
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns iv:nb of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==1) .or. (ki==1) ) then
+                       call stdlib_zgemm( 'N', 'N', n, nb-iv+1, ki+nb-iv, cone,vr, ldvr,work( 1 + &
+                                 (iv)*n    ), n,czero,work( 1 + (nb+iv)*n ), n )
+                       ! normalize vectors
+                       do k = iv, nb
+                          ii = stdlib_izamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_zdscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_zlacpy( 'F', n, nb-iv+1,work( 1 + (nb+iv)*n ), n,vr( 1, ki ), &
+                                 ldvr )
+                       iv = nb
+                    else
+                       iv = iv - 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = 1, ki - 1
+                    t( k, k ) = work( k )
+                 end do
+                 is = is - 1
+              end do loop_80
+           end if
+           if( leftv ) then
+              ! ============================================================
+              ! compute left eigenvectors.
+              ! iv is index of column in current block.
+              ! non-blocked version always uses iv=1;
+              ! blocked     version starts with iv=1, goes up to nb.
+              ! (note the "0-th" column is used to store the original diagonal.)
+              iv = 1
+              is = 1
+              loop_130: do ki = 1, n
+                 if( somev ) then
+                    if( .not.select( ki ) )cycle loop_130
+                 end if
+                 smin = max( ulp*( cabs1( t( ki, ki ) ) ), smlnum )
+                 ! --------------------------------------------------------
+                 ! complex left eigenvector
+                 work( ki + iv*n ) = cone
+                 ! form right-hand side.
+                 do k = ki + 1, n
+                    work( k + iv*n ) = -conjg( t( ki, k ) )
+                 end do
+                 ! solve conjugate-transposed triangular system:
+                 ! [ t(ki+1:n,ki+1:n) - t(ki,ki) ]**h * x = scale*work.
+                 do k = ki + 1, n
+                    t( k, k ) = t( k, k ) - t( ki, ki )
+                    if( cabs1( t( k, k ) )<smin )t( k, k ) = smin
+                 end do
+                 if( ki<n ) then
+                    call stdlib_zlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT','Y', n-ki, t( &
+                              ki+1, ki+1 ), ldt,work( ki+1 + iv*n ), scale, rwork, info )
+                    work( ki + iv*n ) = scale
+                 end if
+                 ! copy the vector x or q*x to vl and normalize.
+                 if( .not.over ) then
+                    ! ------------------------------
+                    ! no back-transform: copy x to vl and normalize.
+                    call stdlib_zcopy( n-ki+1, work( ki + iv*n ), 1, vl(ki,is), 1 )
+                    ii = stdlib_izamax( n-ki+1, vl( ki, is ), 1 ) + ki - 1
+                    remax = one / cabs1( vl( ii, is ) )
+                    call stdlib_zdscal( n-ki+1, remax, vl( ki, is ), 1 )
+                    do k = 1, ki - 1
+                       vl( k, is ) = czero
+                    end do
+                 else if( nb==1 ) then
+                    ! ------------------------------
+                    ! version 1: back-transform each vector with gemv, q*x.
+                    if( ki<n )call stdlib_zgemv( 'N', n, n-ki, cone, vl( 1, ki+1 ), ldvl,work( ki+&
+                              1 + iv*n ), 1, cmplx( scale,KIND=dp),vl( 1, ki ), 1 )
+                    ii = stdlib_izamax( n, vl( 1, ki ), 1 )
+                    remax = one / cabs1( vl( ii, ki ) )
+                    call stdlib_zdscal( n, remax, vl( 1, ki ), 1 )
+                 else
+                    ! ------------------------------
+                    ! version 2: back-transform block of vectors with gemm
+                    ! zero out above vector
+                    ! could go from ki-nv+1 to ki-1
+                    do k = 1, ki - 1
+                       work( k + iv*n ) = czero
+                    end do
+                    ! columns 1:iv of work are valid vectors.
+                    ! when the number of vectors stored reaches nb,
+                    ! or if this was last vector, do the gemm
+                    if( (iv==nb) .or. (ki==n) ) then
+                       call stdlib_zgemm( 'N', 'N', n, iv, n-ki+iv, cone,vl( 1, ki-iv+1 ), ldvl,&
+                                 work( ki-iv+1 + (1)*n ), n,czero,work( 1 + (nb+1)*n ), n )
+                       ! normalize vectors
+                       do k = 1, iv
+                          ii = stdlib_izamax( n, work( 1 + (nb+k)*n ), 1 )
+                          remax = one / cabs1( work( ii + (nb+k)*n ) )
+                          call stdlib_zdscal( n, remax, work( 1 + (nb+k)*n ), 1 )
+                       end do
+                       call stdlib_zlacpy( 'F', n, iv,work( 1 + (nb+1)*n ), n,vl( 1, ki-iv+1 ), &
+                                 ldvl )
+                       iv = 1
+                    else
+                       iv = iv + 1
+                    end if
+                 end if
+                 ! restore the original diagonal elements of t.
+                 do k = ki + 1, n
+                    t( k, k ) = work( k )
+                 end do
+                 is = is + 1
+              end do loop_130
+           end if
+           return
+     end subroutine stdlib_ztrevc3
+
+     !> ZTREXC: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that the diagonal element of T with row index IFST
+     !> is moved to row ILST.
+     !> The Schur form T is reordered by a unitary similarity transformation
+     !> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
+     !> postmultplying it with Z.
+
+     pure subroutine stdlib_ztrexc( compq, n, t, ldt, q, ldq, ifst, ilst, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq
+           integer(ilp), intent(in) :: ifst, ilst, ldq, ldt, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: q(ldq,*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: wantq
+           integer(ilp) :: k, m1, m2, m3
+           real(dp) :: cs
+           complex(dp) :: sn, t11, t22, temp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           wantq = stdlib_lsame( compq, 'V' )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldt<max( 1, n ) ) then
+              info = -4
+           else if( ldq<1 .or. ( wantq .and. ldq<max( 1, n ) ) ) then
+              info = -6
+           else if(( ifst<1 .or. ifst>n ).and.( n>0 )) then
+              info = -7
+           else if(( ilst<1 .or. ilst>n ).and.( n>0 )) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTREXC', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 .or. ifst==ilst )return
+           if( ifst<ilst ) then
+              ! move the ifst-th diagonal element forward down the diagonal.
+              m1 = 0
+              m2 = -1
+              m3 = 1
+           else
+              ! move the ifst-th diagonal element backward up the diagonal.
+              m1 = -1
+              m2 = 0
+              m3 = -1
+           end if
+           do k = ifst + m1, ilst + m2, m3
+              ! interchange the k-th and (k+1)-th diagonal elements.
+              t11 = t( k, k )
+              t22 = t( k+1, k+1 )
+              ! determine the transformation to perform the interchange.
+              call stdlib_zlartg( t( k, k+1 ), t22-t11, cs, sn, temp )
+              ! apply transformation to the matrix t.
+              if( k+2<=n )call stdlib_zrot( n-k-1, t( k, k+2 ), ldt, t( k+1, k+2 ), ldt, cs,sn )
+                        
+              call stdlib_zrot( k-1, t( 1, k ), 1, t( 1, k+1 ), 1, cs,conjg( sn ) )
+              t( k, k ) = t22
+              t( k+1, k+1 ) = t11
+              if( wantq ) then
+                 ! accumulate transformation in the matrix q.
+                 call stdlib_zrot( n, q( 1, k ), 1, q( 1, k+1 ), 1, cs,conjg( sn ) )
+              end if
+           end do
+           return
+     end subroutine stdlib_ztrexc
+
+     !> ZTRRFS: provides error bounds and backward error estimates for the
+     !> solution to a system of linear equations with a triangular
+     !> coefficient matrix.
+     !> The solution matrix X must be computed by ZTRTRS or some other
+     !> means before entering this routine.  ZTRRFS does not do iterative
+     !> refinement because doing so cannot improve the backward error.
+
+     pure subroutine stdlib_ztrrfs( uplo, trans, diag, n, nrhs, a, lda, b, ldb, x,ldx, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*), x(ldx,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran, nounit, upper
+           character :: transn, transt
+           integer(ilp) :: i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           notran = stdlib_lsame( trans, 'N' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_250: do j = 1, nrhs
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, x( 1, j ), 1, work, 1 )
+              call stdlib_ztrmv( uplo, trans, diag, n, a, lda, work, 1 )
+              call stdlib_zaxpy( n, -cone, b( 1, j ), 1, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              if( notran ) then
+                 ! compute abs(a)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                       end do
+                    else
+                       do k = 1, n
+                          xk = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                          end do
+                          rwork( k ) = rwork( k ) + xk
+                       end do
+                    end if
+                 end if
+              else
+                 ! compute abs(a**h)*abs(x) + abs(b).
+                 if( upper ) then
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = 1, k
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = 1, k - 1
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 else
+                    if( nounit ) then
+                       do k = 1, n
+                          s = zero
+                          do i = k, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    else
+                       do k = 1, n
+                          s = cabs1( x( k, j ) )
+                          do i = k + 1, n
+                             s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                          end do
+                          rwork( k ) = rwork( k ) + s
+                       end do
+                    end if
+                 end if
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              210 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_ztrsv( uplo, transt, diag, n, a, lda, work, 1 )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_ztrsv( uplo, transn, diag, n, a, lda, work, 1 )
+                 end if
+                 go to 210
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_250
+           return
+     end subroutine stdlib_ztrrfs
+
+     !> ZTRSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or right eigenvectors of a complex upper triangular
+     !> matrix T (or of any matrix Q*T*Q**H with Q unitary).
+
+     pure subroutine stdlib_ztrsna( job, howmny, select, n, t, ldt, vl, ldvl, vr,ldvr, s, sep, mm,&
+                m, work, ldwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldt, ldvl, ldvr, ldwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           real(dp), intent(out) :: rwork(*), s(*), sep(*)
+           complex(dp), intent(in) :: t(ldt,*), vl(ldvl,*), vr(ldvr,*)
+           complex(dp), intent(out) :: work(ldwork,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: somcon, wantbh, wants, wantsp
+           character :: normin
+           integer(ilp) :: i, ierr, ix, j, k, kase, ks
+           real(dp) :: bignum, eps, est, lnrm, rnrm, scale, smlnum, xnorm
+           complex(dp) :: cdum, prod
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           complex(dp) :: dummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           ! set m to the number of eigenpairs for which condition numbers are
+           ! to be computed.
+           if( somcon ) then
+              m = 0
+              do j = 1, n
+                 if( select( j ) )m = m + 1
+              end do
+           else
+              m = n
+           end if
+           info = 0
+           if( .not.wants .and. .not.wantsp ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldvl<1 .or. ( wants .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wants .and. ldvr<n ) ) then
+              info = -10
+           else if( mm<m ) then
+              info = -13
+           else if( ldwork<1 .or. ( wantsp .and. ldwork<n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSNA', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( somcon ) then
+                 if( .not.select( 1 ) )return
+              end if
+              if( wants )s( 1 ) = one
+              if( wantsp )sep( 1 ) = abs( t( 1, 1 ) )
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ks = 1
+           loop_50: do k = 1, n
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_50
+              end if
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 prod = stdlib_zdotc( n, vr( 1, ks ), 1, vl( 1, ks ), 1 )
+                 rnrm = stdlib_dznrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_dznrm2( n, vl( 1, ks ), 1 )
+                 s( ks ) = abs( prod ) / ( rnrm*lnrm )
+              end if
+              if( wantsp ) then
+                 ! estimate the reciprocal condition number of the k-th
+                 ! eigenvector.
+                 ! copy the matrix t to the array work and swap the k-th
+                 ! diagonal element to the (1,1) position.
+                 call stdlib_zlacpy( 'FULL', n, n, t, ldt, work, ldwork )
+                 call stdlib_ztrexc( 'NO Q', n, work, ldwork, dummy, 1, k, 1, ierr )
+                 ! form  c = t22 - lambda*i in work(2:n,2:n).
+                 do i = 2, n
+                    work( i, i ) = work( i, i ) - work( 1, 1 )
+                 end do
+                 ! estimate a lower bound for the 1-norm of inv(c**h). the 1st
+                 ! and (n+1)th columns of work are used to store work vectors.
+                 sep( ks ) = zero
+                 est = zero
+                 kase = 0
+                 normin = 'N'
+                 30 continue
+                 call stdlib_zlacn2( n-1, work( 1, n+1 ), work, est, kase, isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve c**h*x = scale*b
+                       call stdlib_zlatrs( 'UPPER', 'CONJUGATE TRANSPOSE','NONUNIT', normin, n-1, &
+                                 work( 2, 2 ),ldwork, work, scale, rwork, ierr )
+                    else
+                       ! solve c*x = scale*b
+                       call stdlib_zlatrs( 'UPPER', 'NO TRANSPOSE', 'NONUNIT',normin, n-1, work( &
+                                 2, 2 ), ldwork, work,scale, rwork, ierr )
+                    end if
+                    normin = 'Y'
+                    if( scale/=one ) then
+                       ! multiply by 1/scale if doing so will not cause
+                       ! overflow.
+                       ix = stdlib_izamax( n-1, work, 1 )
+                       xnorm = cabs1( work( ix, 1 ) )
+                       if( scale<xnorm*smlnum .or. scale==zero )go to 40
+                       call stdlib_zdrscl( n, scale, work, 1 )
+                    end if
+                    go to 30
+                 end if
+                 sep( ks ) = one / max( est, smlnum )
+              end if
+              40 continue
+              ks = ks + 1
+           end do loop_50
+           return
+     end subroutine stdlib_ztrsna
+
+     !> ZTRTI2: computes the inverse of a complex upper or lower triangular
+     !> matrix.
+     !> This is the Level 2 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ztrti2( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j
+           complex(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTI2', -info )
+              return
+           end if
+           if( upper ) then
+              ! compute inverse of upper triangular matrix.
+              do j = 1, n
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 ! compute elements 1:j-1 of j-th column.
+                 call stdlib_ztrmv( 'UPPER', 'NO TRANSPOSE', diag, j-1, a, lda,a( 1, j ), 1 )
+                           
+                 call stdlib_zscal( j-1, ajj, a( 1, j ), 1 )
+              end do
+           else
+              ! compute inverse of lower triangular matrix.
+              do j = n, 1, -1
+                 if( nounit ) then
+                    a( j, j ) = cone / a( j, j )
+                    ajj = -a( j, j )
+                 else
+                    ajj = -cone
+                 end if
+                 if( j<n ) then
+                    ! compute elements j+1:n of j-th column.
+                    call stdlib_ztrmv( 'LOWER', 'NO TRANSPOSE', diag, n-j,a( j+1, j+1 ), lda, a( &
+                              j+1, j ), 1 )
+                    call stdlib_zscal( n-j, ajj, a( j+1, j ), 1 )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_ztrti2
+
+     !> ZTRTRI: computes the inverse of a complex upper or lower triangular
+     !> matrix A.
+     !> This is the Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ztrtri( uplo, diag, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, upper
+           integer(ilp) :: j, jb, nb, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity if non-unit.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+              info = 0
+           end if
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZTRTRI', uplo // diag, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_ztrti2( uplo, diag, n, a, lda, info )
+           else
+              ! use blocked code
+              if( upper ) then
+                 ! compute inverse of upper triangular matrix
+                 do j = 1, n, nb
+                    jb = min( nb, n-j+1 )
+                    ! compute rows 1:j-1 of current block column
+                    call stdlib_ztrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, cone, a, &
+                              lda, a( 1, j ), lda )
+                    call stdlib_ztrsm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', diag, j-1,jb, -cone, a( &
+                              j, j ), lda, a( 1, j ), lda )
+                    ! compute inverse of current diagonal block
+                    call stdlib_ztrti2( 'UPPER', diag, jb, a( j, j ), lda, info )
+                 end do
+              else
+                 ! compute inverse of lower triangular matrix
+                 nn = ( ( n-1 ) / nb )*nb + 1
+                 do j = nn, 1, -nb
+                    jb = min( nb, n-j+1 )
+                    if( j+jb<=n ) then
+                       ! compute rows j+jb:n of current block column
+                       call stdlib_ztrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, &
+                                 cone, a( j+jb, j+jb ), lda,a( j+jb, j ), lda )
+                       call stdlib_ztrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', diag,n-j-jb+1, jb, -&
+                                 cone, a( j, j ), lda,a( j+jb, j ), lda )
+                    end if
+                    ! compute inverse of current diagonal block
+                    call stdlib_ztrti2( 'LOWER', diag, jb, a( j, j ), lda, info )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_ztrtri
+
+     !> ZTRTRS: solves a triangular system of the form
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> where A is a triangular matrix of order N, and B is an N-by-NRHS
+     !> matrix.  A check is made to verify that A is nonsingular.
+
+     pure subroutine stdlib_ztrtrs( uplo, trans, diag, n, nrhs, a, lda, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'T' ) .and. &
+                     .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! check for singularity.
+           if( nounit ) then
+              do info = 1, n
+                 if( a( info, info )==czero )return
+              end do
+           end if
+           info = 0
+           ! solve a * x = b,  a**t * x = b,  or  a**h * x = b.
+           call stdlib_ztrsm( 'LEFT', uplo, trans, diag, n, nrhs, cone, a, lda, b,ldb )
+           return
+     end subroutine stdlib_ztrtrs
+
+     !> ZTRTTF: copies a triangular matrix A from standard full format (TR)
+     !> to rectangular full packed format (TF) .
+
+     pure subroutine stdlib_ztrttf( transr, uplo, n, a, lda, arf, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(0:lda-1,0:*)
+           complex(dp), intent(out) :: arf(0:*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: i, ij, j, k, l, n1, n2, nt, nx2, np1x2
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTTF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=1 ) then
+              if( n==1 ) then
+                 if( normaltransr ) then
+                    arf( 0 ) = a( 0, 0 )
+                 else
+                    arf( 0 ) = conjg( a( 0, 0 ) )
+                 end if
+              end if
+              return
+           end if
+           ! size of array arf(1:2,0:nt-1)
+           nt = n*( n+1 ) / 2
+           ! set n1 and n2 depending on lower: for n even n1=n2=k
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! if n is odd, set nisodd = .true., lda=n+1 and a is (n+1)--by--k2.
+           ! if n is even, set k = n/2 and nisodd = .false., lda=n and a is
+           ! n--by--(n+1)/2.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+              if( .not.lower )np1x2 = n + n + 2
+           else
+              nisodd = .true.
+              if( .not.lower )nx2 = n + n
+           end if
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1); lda=n
+                    ij = 0
+                    do j = 0, n2
+                       do i = n1, n2 + j
+                          arf( ij ) = conjg( a( n2+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0); lda=n
+                    ij = nt - n
+                    do j = n - 1, n1, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - n1, n1 - 1
+                          arf( ij ) = conjg( a( j-n1, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - nx2
+                    end do
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    ij = 0
+                    do j = 0, n2 - 1
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = n1 + j, n - 1
+                          arf( ij ) = a( i, n1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = n2, n - 1
+                       do i = 0, n1 - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda=n2
+                    ij = 0
+                    do j = 0, n1
+                       do i = n1, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, n1 - 1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = n2 + j, n - 1
+                          arf( ij ) = conjg( a( n2+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1); lda=n+1
+                    ij = 0
+                    do j = 0, k - 1
+                       do i = k, k + j
+                          arf( ij ) = conjg( a( k+j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = j, n - 1
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0); lda=n+1
+                    ij = nt - n - 1
+                    do j = n - 1, k, -1
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = j - k, k - 1
+                          arf( ij ) = conjg( a( j-k, l ) )
+                          ij = ij + 1
+                       end do
+                       ij = ij - np1x2
+                    end do
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,1) , t2 -> a(0,0) , s -> a(0,k+1) :
+                    ! t1 -> a(0+k) , t2 -> a(0+0) , s -> a(0+k*(k+1)); lda=k
+                    ij = 0
+                    j = k
+                    do i = k, n - 1
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                       do i = k + 1 + j, n - 1
+                          arf( ij ) = a( i, k+1+j )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = k - 1, n - 1
+                       do i = 0, k - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                 else
+                    ! srpa for upper, transpose and n is even (see paper, a=b)
+                    ! t1 -> a(0,k+1) , t2 -> a(0,k) , s -> a(0,0)
+                    ! t1 -> a(0+k*(k+1)) , t2 -> a(0+k*k) , s -> a(0+0)); lda=k
+                    ij = 0
+                    do j = 0, k
+                       do i = k, n - 1
+                          arf( ij ) = conjg( a( j, i ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    do j = 0, k - 2
+                       do i = 0, j
+                          arf( ij ) = a( i, j )
+                          ij = ij + 1
+                       end do
+                       do l = k + 1 + j, n - 1
+                          arf( ij ) = conjg( a( k+1+j, l ) )
+                          ij = ij + 1
+                       end do
+                    end do
+                    ! note that here j = k-1
+                    do i = 0, j
+                       arf( ij ) = a( i, j )
+                       ij = ij + 1
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztrttf
+
+     !> ZTRTTP: copies a triangular matrix A from full format (TR) to standard
+     !> packed format (TP).
+
+     pure subroutine stdlib_ztrttp( uplo, n, a, lda, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n, lda
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: ap(*)
+        ! =====================================================================
+           ! Parameters 
+           ! Local Scalars 
+           logical(lk) :: lower
+           integer(ilp) :: i, j, k
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRTTP', -info )
+              return
+           end if
+           if( lower ) then
+              k = 0
+              do j = 1, n
+                 do i = j, n
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           else
+              k = 0
+              do j = 1, n
+                 do i = 1, j
+                    k = k + 1
+                    ap( k ) = a( i, j )
+                 end do
+              end do
+           end if
+           return
+     end subroutine stdlib_ztrttp
+
+     !> ZTZRZF: reduces the M-by-N ( M<=N ) complex upper trapezoidal matrix A
+     !> to upper triangular form by means of unitary transformations.
+     !> The upper trapezoidal matrix A is factored as
+     !> A = ( R  0 ) * Z,
+     !> where Z is an N-by-N unitary matrix and R is an M-by-M upper
+     !> triangular matrix.
+
+     pure subroutine stdlib_ztzrzf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iws, ki, kk, ldwork, lwkmin, lwkopt, m1, mu, nb, nbmin, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              if( m==0 .or. m==n ) then
+                 lwkopt = 1
+                 lwkmin = 1
+              else
+                 ! determine the block size.
+                 nb = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+                 lwkmin = max( 1, m )
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTZRZF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 ) then
+              return
+           else if( m==n ) then
+              do i = 1, n
+                 tau( i ) = czero
+              end do
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<m ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<m ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<m .and. nx<m ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              m1 = min( m+1, n )
+              ki = ( ( m-nx-1 ) / nb )*nb
+              kk = min( m, ki+nb )
+              do i = m - kk + ki + 1, m - kk + 1, -nb
+                 ib = min( m-i+1, nb )
+                 ! compute the tz factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_zlatrz( ib, n-i+1, n-m, a( i, i ), lda, tau( i ),work )
+                 if( i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_zlarzt( 'BACKWARD', 'ROWWISE', n-m, ib, a( i, m1 ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(1:i-1,i:n) from the right
+                    call stdlib_zlarzb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', i-1, n-i+1,&
+                     ib, n-m, a( i, m1 ),lda, work, ldwork, a( 1, i ), lda,work( ib+1 ), ldwork )
+                               
+                 end if
+              end do
+              mu = i + nb - 1
+           else
+              mu = m
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 )call stdlib_zlatrz( mu, n, n-m, a, lda, tau, work )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_ztzrzf
+
+     !> ZUNBDB: simultaneously bidiagonalizes the blocks of an M-by-M
+     !> partitioned unitary matrix X:
+     !> [ B11 | B12 0  0 ]
+     !> [ X11 | X12 ]   [ P1 |    ] [  0  |  0 -I  0 ] [ Q1 |    ]**H
+     !> X = [-----------] = [---------] [----------------] [---------]   .
+     !> [ X21 | X22 ]   [    | P2 ] [ B21 | B22 0  0 ] [    | Q2 ]
+     !> [  0  |  0  0  I ]
+     !> X11 is P-by-Q. Q must be no larger than P, M-P, or M-Q. (If this is
+     !> not the case, then X must be transposed and/or permuted. This can be
+     !> done in constant time using the TRANS and SIGNS options. See ZUNCSD
+     !> for details.)
+     !> The unitary matrices P1, P2, Q1, and Q2 are P-by-P, (M-P)-by-
+     !> (M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. They are
+     !> represented implicitly by Householder vectors.
+     !> B11, B12, B21, and B22 are Q-by-Q bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+               ldx22, theta, phi, taup1,taup2, tauq1, tauq2, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldx11, ldx12, ldx21, ldx22, lwork, m, p, q
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), tauq2(*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ====================================================================
+           ! Parameters 
+           real(dp), parameter :: realone = 1.0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery
+           integer(ilp) :: i, lworkmin, lworkopt
+           real(dp) :: z1, z2, z3, z4
+           ! Intrinsic Functions
+           intrinsic :: atan2,cos,max,min,sin
+           intrinsic :: cmplx,conjg
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( .not. stdlib_lsame( signs, 'O' ) ) then
+              z1 = realone
+              z2 = realone
+              z3 = realone
+              z4 = realone
+           else
+              z1 = realone
+              z2 = -realone
+              z3 = realone
+              z4 = -realone
+           end if
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -3
+           else if( p < 0 .or. p > m ) then
+              info = -4
+           else if( q < 0 .or. q > p .or. q > m-p .or.q > m-q ) then
+              info = -5
+           else if( colmajor .and. ldx11 < max( 1, p ) ) then
+              info = -7
+           else if( .not.colmajor .and. ldx11 < max( 1, q ) ) then
+              info = -7
+           else if( colmajor .and. ldx12 < max( 1, p ) ) then
+              info = -9
+           else if( .not.colmajor .and. ldx12 < max( 1, m-q ) ) then
+              info = -9
+           else if( colmajor .and. ldx21 < max( 1, m-p ) ) then
+              info = -11
+           else if( .not.colmajor .and. ldx21 < max( 1, q ) ) then
+              info = -11
+           else if( colmajor .and. ldx22 < max( 1, m-p ) ) then
+              info = -13
+           else if( .not.colmajor .and. ldx22 < max( 1, m-q ) ) then
+              info = -13
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              lworkopt = m - q
+              lworkmin = m - q
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not. lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'XORBDB', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! handle column-major and row-major separately
+           if( colmajor ) then
+              ! reduce columns 1, ..., q of x11, x12, x21, and x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_zscal( p-i+1, cmplx( z1, 0.0_dp,KIND=dp), x11(i,i), 1 )
+                 else
+                    call stdlib_zscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_dp,KIND=dp),x11(i,i), &
+                              1 )
+                    call stdlib_zaxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), x12(&
+                              i,i-1), 1, x11(i,i), 1 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_zscal( m-p-i+1, cmplx( z2, 0.0_dp,KIND=dp), x21(i,i), 1 )
+                 else
+                    call stdlib_zscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_dp,KIND=dp),x21(i,i),&
+                               1 )
+                    call stdlib_zaxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), &
+                              x22(i,i-1), 1, x21(i,i), 1 )
+                 end if
+                 theta(i) = atan2( stdlib_dznrm2( m-p-i+1, x21(i,i), 1 ),stdlib_dznrm2( p-i+1, &
+                           x11(i,i), 1 ) )
+                 if( p > i ) then
+                    call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+                 else if ( p == i ) then
+                    call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i,i), 1, taup1(i) )
+                 end if
+                 x11(i,i) = cone
+                 if ( m-p > i ) then
+                    call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1,taup2(i) )
+                 else if ( m-p == i ) then
+                    call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i,i), 1,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 if ( q > i ) then
+                    call stdlib_zlarf( 'L', p-i+1, q-i, x11(i,i), 1,conjg(taup1(i)), x11(i,i+1), &
+                              ldx11, work )
+                    call stdlib_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1,conjg(taup2(i)), x21(i,i+1),&
+                               ldx21, work )
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_zlarf( 'L', p-i+1, m-q-i+1, x11(i,i), 1,conjg(taup1(i)), x12(i,i),&
+                               ldx12, work )
+                    call stdlib_zlarf( 'L', m-p-i+1, m-q-i+1, x21(i,i), 1,conjg(taup2(i)), x22(i,&
+                              i), ldx22, work )
+                 end if
+                 if( i < q ) then
+                    call stdlib_zscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_dp,KIND=dp),x11(i,i+&
+                              1), ldx11 )
+                    call stdlib_zaxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_dp,KIND=dp),x21(i,i+1)&
+                              , ldx21, x11(i,i+1), ldx11 )
+                 end if
+                 call stdlib_zscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_dp,KIND=dp),x12(i,i)&
+                           , ldx12 )
+                 call stdlib_zaxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_dp,KIND=dp),x22(i,i),&
+                            ldx22, x12(i,i), ldx12 )
+                 if( i < q )phi(i) = atan2( stdlib_dznrm2( q-i, x11(i,i+1), ldx11 ),stdlib_dznrm2(&
+                            m-q-i+1, x12(i,i), ldx12 ) )
+                 if( i < q ) then
+                    call stdlib_zlacgv( q-i, x11(i,i+1), ldx11 )
+                    if ( i == q-1 ) then
+                       call stdlib_zlarfgp( q-i, x11(i,i+1), x11(i,i+1), ldx11,tauq1(i) )
+                    else
+                       call stdlib_zlarfgp( q-i, x11(i,i+1), x11(i,i+2), ldx11,tauq1(i) )
+                    end if
+                    x11(i,i+1) = cone
+                 end if
+                 if ( m-q+1 > i ) then
+                    call stdlib_zlacgv( m-q-i+1, x12(i,i), ldx12 )
+                    if ( m-q == i ) then
+                       call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                    else
+                       call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                                 
+                    end if
+                 end if
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_zlarf( 'R', p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x11(i+1,i+1), &
+                              ldx11, work )
+                    call stdlib_zlarf( 'R', m-p-i, q-i, x11(i,i+1), ldx11, tauq1(i),x21(i+1,i+1), &
+                              ldx21, work )
+                 end if
+                 if ( p > i ) then
+                    call stdlib_zlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if ( m-p > i ) then
+                    call stdlib_zlarf( 'R', m-p-i, m-q-i+1, x12(i,i), ldx12,tauq2(i), x22(i+1,i), &
+                              ldx22, work )
+                 end if
+                 if( i < q )call stdlib_zlacgv( q-i, x11(i,i+1), ldx11 )
+                 call stdlib_zlacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_zscal( m-q-i+1, cmplx( -z1*z4, 0.0_dp,KIND=dp), x12(i,i),ldx12 )
+                           
+                 call stdlib_zlacgv( m-q-i+1, x12(i,i), ldx12 )
+                 if ( i >= m-q ) then
+                    call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i,i), ldx12,tauq2(i) )
+                 else
+                    call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i,i+1), ldx12,tauq2(i) )
+                 end if
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_zlarf( 'R', p-i, m-q-i+1, x12(i,i), ldx12, tauq2(i),x12(i+1,i), &
+                              ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_zlarf( 'R', m-p-q, m-q-i+1, x12(i,i), ldx12,tauq2(i),&
+                            x22(q+1,i), ldx22, work )
+                 call stdlib_zlacgv( m-q-i+1, x12(i,i), ldx12 )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_zscal( m-p-q-i+1, cmplx( z2*z4, 0.0_dp,KIND=dp),x22(q+i,p+i), ldx22 )
+                           
+                 call stdlib_zlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+                 call stdlib_zlarfgp( m-p-q-i+1, x22(q+i,p+i), x22(q+i,p+i+1),ldx22, tauq2(p+i) )
+                           
+                 x22(q+i,p+i) = cone
+                 call stdlib_zlarf( 'R', m-p-q-i, m-p-q-i+1, x22(q+i,p+i), ldx22,tauq2(p+i), x22(&
+                           q+i+1,p+i), ldx22, work )
+                 call stdlib_zlacgv( m-p-q-i+1, x22(q+i,p+i), ldx22 )
+              end do
+           else
+              ! reduce columns 1, ..., q of x11, x12, x21, x22
+              do i = 1, q
+                 if( i == 1 ) then
+                    call stdlib_zscal( p-i+1, cmplx( z1, 0.0_dp,KIND=dp), x11(i,i),ldx11 )
+                 else
+                    call stdlib_zscal( p-i+1, cmplx( z1*cos(phi(i-1)), 0.0_dp,KIND=dp),x11(i,i), &
+                              ldx11 )
+                    call stdlib_zaxpy( p-i+1, cmplx( -z1*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), x12(&
+                              i-1,i), ldx12, x11(i,i), ldx11 )
+                 end if
+                 if( i == 1 ) then
+                    call stdlib_zscal( m-p-i+1, cmplx( z2, 0.0_dp,KIND=dp), x21(i,i),ldx21 )
+                              
+                 else
+                    call stdlib_zscal( m-p-i+1, cmplx( z2*cos(phi(i-1)), 0.0_dp,KIND=dp),x21(i,i),&
+                               ldx21 )
+                    call stdlib_zaxpy( m-p-i+1, cmplx( -z2*z3*z4*sin(phi(i-1)),0.0_dp,KIND=dp), &
+                              x22(i-1,i), ldx22, x21(i,i), ldx21 )
+                 end if
+                 theta(i) = atan2( stdlib_dznrm2( m-p-i+1, x21(i,i), ldx21 ),stdlib_dznrm2( p-i+1,&
+                            x11(i,i), ldx11 ) )
+                 call stdlib_zlacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_zlacgv( m-p-i+1, x21(i,i), ldx21 )
+                 call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i,i+1), ldx11, taup1(i) )
+                 x11(i,i) = cone
+                 if ( i == m-p ) then
+                    call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i,i), ldx21,taup2(i) )
+                 else
+                    call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i,i+1), ldx21,taup2(i) )
+                 end if
+                 x21(i,i) = cone
+                 call stdlib_zlarf( 'R', q-i, p-i+1, x11(i,i), ldx11, taup1(i),x11(i+1,i), ldx11, &
+                           work )
+                 call stdlib_zlarf( 'R', m-q-i+1, p-i+1, x11(i,i), ldx11, taup1(i),x12(i,i), &
+                           ldx12, work )
+                 call stdlib_zlarf( 'R', q-i, m-p-i+1, x21(i,i), ldx21, taup2(i),x21(i+1,i), &
+                           ldx21, work )
+                 call stdlib_zlarf( 'R', m-q-i+1, m-p-i+1, x21(i,i), ldx21,taup2(i), x22(i,i), &
+                           ldx22, work )
+                 call stdlib_zlacgv( p-i+1, x11(i,i), ldx11 )
+                 call stdlib_zlacgv( m-p-i+1, x21(i,i), ldx21 )
+                 if( i < q ) then
+                    call stdlib_zscal( q-i, cmplx( -z1*z3*sin(theta(i)), 0.0_dp,KIND=dp),x11(i+1,&
+                              i), 1 )
+                    call stdlib_zaxpy( q-i, cmplx( z2*z3*cos(theta(i)), 0.0_dp,KIND=dp),x21(i+1,i)&
+                              , 1, x11(i+1,i), 1 )
+                 end if
+                 call stdlib_zscal( m-q-i+1, cmplx( -z1*z4*sin(theta(i)), 0.0_dp,KIND=dp),x12(i,i)&
+                           , 1 )
+                 call stdlib_zaxpy( m-q-i+1, cmplx( z2*z4*cos(theta(i)), 0.0_dp,KIND=dp),x22(i,i),&
+                            1, x12(i,i), 1 )
+                 if( i < q )phi(i) = atan2( stdlib_dznrm2( q-i, x11(i+1,i), 1 ),stdlib_dznrm2( m-&
+                           q-i+1, x12(i,i), 1 ) )
+                 if( i < q ) then
+                    call stdlib_zlarfgp( q-i, x11(i+1,i), x11(i+2,i), 1, tauq1(i) )
+                    x11(i+1,i) = cone
+                 end if
+                 call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if( i < q ) then
+                    call stdlib_zlarf( 'L', q-i, p-i, x11(i+1,i), 1,conjg(tauq1(i)), x11(i+1,i+1),&
+                               ldx11, work )
+                    call stdlib_zlarf( 'L', q-i, m-p-i, x11(i+1,i), 1,conjg(tauq1(i)), x21(i+1,i+&
+                              1), ldx21, work )
+                 end if
+                 call stdlib_zlarf( 'L', m-q-i+1, p-i, x12(i,i), 1,conjg(tauq2(i)), x12(i,i+1), &
+                           ldx12, work )
+                 if ( m-p > i ) then
+                    call stdlib_zlarf( 'L', m-q-i+1, m-p-i, x12(i,i), 1,conjg(tauq2(i)), x22(i,i+&
+                              1), ldx22, work )
+                 end if
+              end do
+              ! reduce columns q + 1, ..., p of x12, x22
+              do i = q + 1, p
+                 call stdlib_zscal( m-q-i+1, cmplx( -z1*z4, 0.0_dp,KIND=dp), x12(i,i), 1 )
+                 call stdlib_zlarfgp( m-q-i+1, x12(i,i), x12(i+1,i), 1, tauq2(i) )
+                 x12(i,i) = cone
+                 if ( p > i ) then
+                    call stdlib_zlarf( 'L', m-q-i+1, p-i, x12(i,i), 1,conjg(tauq2(i)), x12(i,i+1),&
+                               ldx12, work )
+                 end if
+                 if( m-p-q >= 1 )call stdlib_zlarf( 'L', m-q-i+1, m-p-q, x12(i,i), 1,conjg(tauq2(&
+                           i)), x22(i,q+1), ldx22, work )
+              end do
+              ! reduce columns p + 1, ..., m - q of x12, x22
+              do i = 1, m - p - q
+                 call stdlib_zscal( m-p-q-i+1, cmplx( z2*z4, 0.0_dp,KIND=dp),x22(p+i,q+i), 1 )
+                           
+                 call stdlib_zlarfgp( m-p-q-i+1, x22(p+i,q+i), x22(p+i+1,q+i), 1,tauq2(p+i) )
+                           
+                 x22(p+i,q+i) = cone
+                 if ( m-p-q /= i ) then
+                    call stdlib_zlarf( 'L', m-p-q-i+1, m-p-q-i, x22(p+i,q+i), 1,conjg(tauq2(p+i)),&
+                               x22(p+i,q+i+1), ldx22,work )
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zunbdb
+
+     !> ZUNBDB6: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then the zero vector is returned.
+
+     pure subroutine stdlib_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: alphasq = 0.01_dp
+           real(dp), parameter :: realone = 1.0_dp
+           real(dp), parameter :: realzero = 0.0_dp
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: normsq1, normsq2, scl1, scl2, ssq1, ssq2
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB6', -info )
+              return
+           end if
+           ! first, project x onto the orthogonal complement of q's column
+           ! space
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_zlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_zlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq1 = scl1**2*ssq1 + scl2**2*ssq2
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_zgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_zgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_zgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_zgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_zlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_zlassq( m2, x2, incx2, scl2, ssq2 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if projection is sufficiently large in norm, then stop.
+           ! if projection is czero, then stop.
+           ! otherwise, project again.
+           if( normsq2 >= alphasq*normsq1 ) then
+              return
+           end if
+           if( normsq2 == czero ) then
+              return
+           end if
+           normsq1 = normsq2
+           do i = 1, n
+              work(i) = czero
+           end do
+           if( m1 == 0 ) then
+              do i = 1, n
+                 work(i) = czero
+              end do
+           else
+              call stdlib_zgemv( 'C', m1, n, cone, q1, ldq1, x1, incx1, czero, work,1 )
+           end if
+           call stdlib_zgemv( 'C', m2, n, cone, q2, ldq2, x2, incx2, cone, work, 1 )
+           call stdlib_zgemv( 'N', m1, n, cnegone, q1, ldq1, work, 1, cone, x1,incx1 )
+           call stdlib_zgemv( 'N', m2, n, cnegone, q2, ldq2, work, 1, cone, x2,incx2 )
+           scl1 = realzero
+           ssq1 = realone
+           call stdlib_zlassq( m1, x1, incx1, scl1, ssq1 )
+           scl2 = realzero
+           ssq2 = realone
+           call stdlib_zlassq( m1, x1, incx1, scl1, ssq1 )
+           normsq2 = scl1**2*ssq1 + scl2**2*ssq2
+           ! if second projection is sufficiently large in norm, then do
+           ! nothing more. alternatively, if it shrunk significantly, then
+           ! truncate it to czero.
+           if( normsq2 < alphasq*normsq1 ) then
+              do i = 1, m1
+                 x1(i) = czero
+              end do
+              do i = 1, m2
+                 x2(i) = czero
+              end do
+           end if
+           return
+     end subroutine stdlib_zunbdb6
+
+     !> ZUNG2L: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the last n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF.
+
+     pure subroutine stdlib_zung2l( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNG2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns 1:n-k to columns of the unit matrix
+           do j = 1, n - k
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( m-n+j, j ) = cone
+           end do
+           do i = 1, k
+              ii = n - k + i
+              ! apply h(i) to a(1:m-k+i,1:n-k+i) from the left
+              a( m-n+ii, ii ) = cone
+              call stdlib_zlarf( 'LEFT', m-n+ii, ii-1, a( 1, ii ), 1, tau( i ), a,lda, work )
+                        
+              call stdlib_zscal( m-n+ii-1, -tau( i ), a( 1, ii ), 1 )
+              a( m-n+ii, ii ) = cone - tau( i )
+              ! set a(m-k+i+1:m,n-k+i) to czero
+              do l = m - n + ii + 1, m
+                 a( l, ii ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_zung2l
+
+     !> ZUNG2R: generates an m by n complex matrix Q with orthonormal columns,
+     !> which is defined as the first n columns of a product of k elementary
+     !> reflectors of order m
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF.
+
+     pure subroutine stdlib_zung2r( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNG2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           ! initialise columns k+1:n to columns of the unit matrix
+           do j = k + 1, n
+              do l = 1, m
+                 a( l, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do i = k, 1, -1
+              ! apply h(i) to a(i:m,i:n) from the left
+              if( i<n ) then
+                 a( i, i ) = cone
+                 call stdlib_zlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1, tau( i ),a( i, i+1 ), lda, &
+                           work )
+              end if
+              if( i<m )call stdlib_zscal( m-i, -tau( i ), a( i+1, i ), 1 )
+              a( i, i ) = cone - tau( i )
+              ! set a(1:i-1,i) to czero
+              do l = 1, i - 1
+                 a( l, i ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_zung2r
+
+     !> ZUNGL2: generates an m-by-n complex matrix Q with orthonormal rows,
+     !> which is defined as the first m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF.
+
+     pure subroutine stdlib_zungl2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGL2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows k+1:m to rows of the unit matrix
+              do j = 1, n
+                 do l = k + 1, m
+                    a( l, j ) = czero
+                 end do
+                 if( j>k .and. j<=m )a( j, j ) = cone
+              end do
+           end if
+           do i = k, 1, -1
+              ! apply h(i)**h to a(i:m,i:n) from the right
+              if( i<n ) then
+                 call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    call stdlib_zlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,conjg( tau( i ) ), a( &
+                              i+1, i ), lda, work )
+                 end if
+                 call stdlib_zscal( n-i, -tau( i ), a( i, i+1 ), lda )
+                 call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+              end if
+              a( i, i ) = cone - conjg( tau( i ) )
+              ! set a(i,1:i-1) to czero
+              do l = 1, i - 1
+                 a( i, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_zungl2
+
+     !> ZUNGLQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the first M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF.
+
+     pure subroutine stdlib_zunglq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZUNGLQ', ' ', m, n, k, -1 )
+           lwkopt = max( 1, m )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGLQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGLQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(kk+1:m,1:kk) to czero.
+              do j = 1, kk
+                 do i = kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<m )call stdlib_zungl2( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_zlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h**h to a(i+ib:m,i:n) from the right
+                    call stdlib_zlarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'FORWARD','ROWWISE', m-i-&
+                    ib+1, n-i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h**h to columns i:n of current block
+                 call stdlib_zungl2( ib, n-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set columns 1:i-1 of current block to czero
+                 do j = 1, i - 1
+                    do l = i, i + ib - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zunglq
+
+     !> ZUNGQL: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the last N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF.
+
+     pure subroutine stdlib_zungql( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGQL', ' ', m, n, k, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGQL', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGQL', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk columns are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(m-kk+1:m,1:n-kk) to czero.
+              do j = 1, n - kk
+                 do i = m - kk + 1, m
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_zung2l( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_zlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_zlarfb( 'LEFT', 'NO TRANSPOSE', 'BACKWARD','COLUMNWISE', m-k+i+ib-&
+                    1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+                 ! apply h to rows 1:m-k+i+ib-1 of current block
+                 call stdlib_zung2l( m-k+i+ib-1, ib, ib, a( 1, n-k+i ), lda,tau( i ), work, iinfo &
+                           )
+                 ! set rows m-k+i+ib:m of current block to czero
+                 do j = n - k + i, n - k + i + ib - 1
+                    do l = m - k + i + ib, m
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zungql
+
+     !> ZUNGQR: generates an M-by-N complex matrix Q with orthonormal columns,
+     !> which is defined as the first N columns of a product of K elementary
+     !> reflectors of order M
+     !> Q  =  H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF.
+
+     pure subroutine stdlib_zungqr( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, j, ki, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', m, n, k, -1 )
+           lwkopt = max( 1, n )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( k<0 .or. k>n ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGQR', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGQR', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the last block.
+              ! the first kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              ! set a(1:kk,kk+1:n) to czero.
+              do j = kk + 1, n
+                 do i = 1, kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the last or only block.
+           if( kk<n )call stdlib_zung2r( m-kk, n-kk, k-kk, a( kk+1, kk+1 ), lda,tau( kk+1 ), work,&
+                      iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = ki + 1, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_zlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h to a(i:m,i+ib:n) from the left
+                    call stdlib_zlarfb( 'LEFT', 'NO TRANSPOSE', 'FORWARD','COLUMNWISE', m-i+1, n-&
+                    i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 ), &
+                              ldwork )
+                 end if
+                 ! apply h to rows i:m of current block
+                 call stdlib_zung2r( m-i+1, ib, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 ! set rows 1:i-1 of current block to czero
+                 do j = i, i + ib - 1
+                    do l = 1, i - 1
+                       a( l, j ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zungqr
+
+     !> ZUNGR2: generates an m by n complex matrix Q with orthonormal rows,
+     !> which is defined as the last m rows of a product of k elementary
+     !> reflectors of order n
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF.
+
+     pure subroutine stdlib_zungr2( m, n, k, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, ii, j, l
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 )return
+           if( k<m ) then
+              ! initialise rows 1:m-k to rows of the unit matrix
+              do j = 1, n
+                 do l = 1, m - k
+                    a( l, j ) = czero
+                 end do
+                 if( j>n-m .and. j<=n-k )a( m-n+j, j ) = cone
+              end do
+           end if
+           do i = 1, k
+              ii = m - k + i
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i) from the right
+              call stdlib_zlacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone
+              call stdlib_zlarf( 'RIGHT', ii-1, n-m+ii, a( ii, 1 ), lda,conjg( tau( i ) ), a, lda,&
+                         work )
+              call stdlib_zscal( n-m+ii-1, -tau( i ), a( ii, 1 ), lda )
+              call stdlib_zlacgv( n-m+ii-1, a( ii, 1 ), lda )
+              a( ii, n-m+ii ) = cone - conjg( tau( i ) )
+              ! set a(m-k+i,n-k+i+1:n) to czero
+              do l = n - m + ii + 1, n
+                 a( ii, l ) = czero
+              end do
+           end do
+           return
+     end subroutine stdlib_zungr2
+
+     !> ZUNGRQ: generates an M-by-N complex matrix Q with orthonormal rows,
+     !> which is defined as the last M rows of a product of K elementary
+     !> reflectors of order N
+     !> Q  =  H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF.
+
+     pure subroutine stdlib_zungrq( m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, ii, iinfo, iws, j, kk, l, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<m ) then
+              info = -2
+           else if( k<0 .or. k>m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGRQ', ' ', m, n, k, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, m ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m<=0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZUNGRQ', ' ', m, n, k, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNGRQ', ' ', m, n, k, -1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code after the first block.
+              ! the last kk rows are handled by the block method.
+              kk = min( k, ( ( k-nx+nb-1 ) / nb )*nb )
+              ! set a(1:m-kk,n-kk+1:n) to czero.
+              do j = n - kk + 1, n
+                 do i = 1, m - kk
+                    a( i, j ) = czero
+                 end do
+              end do
+           else
+              kk = 0
+           end if
+           ! use unblocked code for the first or only block.
+           call stdlib_zungr2( m-kk, n-kk, k-kk, a, lda, tau, work, iinfo )
+           if( kk>0 ) then
+              ! use blocked code
+              do i = k - kk + 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 ii = m - k + i
+                 if( ii>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_zlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( ii, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_zlarfb( 'RIGHT', 'CONJUGATE TRANSPOSE', 'BACKWARD','ROWWISE', ii-&
+                    1, n-k+i+ib-1, ib, a( ii, 1 ),lda, work, ldwork, a, lda, work( ib+1 ),ldwork )
+                              
+                 end if
+                 ! apply h**h to columns 1:n-k+i+ib-1 of current block
+                 call stdlib_zungr2( ib, n-k+i+ib-1, ib, a( ii, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 ! set columns n-k+i+ib:n of current block to czero
+                 do l = n - k + i + ib, n
+                    do j = ii, ii + ib - 1
+                       a( j, l ) = czero
+                    end do
+                 end do
+              end do
+           end if
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zungrq
+
+     !> ZUNGTSQR_ROW: generates an M-by-N complex matrix Q_out with
+     !> orthonormal columns from the output of ZLATSQR. These N orthonormal
+     !> columns are the first N columns of a product of complex unitary
+     !> matrices Q(k)_in of order M, which are returned by ZLATSQR in
+     !> a special format.
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> The input matrices Q(k)_in are stored in row and column blocks in A.
+     !> See the documentation of ZLATSQR for more details on the format of
+     !> Q(k)_in, where each Q(k)_in is represented by block Householder
+     !> transformations. This routine calls an auxiliary routine ZLARFB_GETT,
+     !> where the computation is performed on each individual block. The
+     !> algorithm first sweeps NB-sized column blocks from the right to left
+     !> starting in the bottom row block and continues to the top row block
+     !> (hence _ROW in the routine name). This sweep is in reverse order of
+     !> the order in which ZLATSQR generates the output blocks.
+
+     pure subroutine stdlib_zungtsqr_row( m, n, mb, nb, a, lda, t, ldt, work,lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: t(ldt,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: nblocal, mb2, m_plus_one, itmp, ib_bottom, lworkopt, &
+                     num_all_row_blocks, jb_t, ib, imb, kb, kb_last, knb, mb1
+           ! Local Arrays 
+           complex(dp) :: dummy(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           nblocal = min( nb, n )
+           ! determine the workspace size.
+           if( info==0 ) then
+              lworkopt = nblocal * max( nblocal, ( n - nblocal ) )
+           end if
+           ! handle error in the input parameters and handle the workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTSQR_ROW', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           ! (0) set the upper-triangular part of the matrix a to zero and
+           ! its diagonal elements to one.
+           call stdlib_zlaset('U', m, n, czero, cone, a, lda )
+           ! kb_last is the column index of the last column block reflector
+           ! in the matrices t and v.
+           kb_last = ( ( n-1 ) / nblocal ) * nblocal + 1
+           ! (1) bottom-up loop over row blocks of a, except the top row block.
+           ! note: if mb>=m, then the loop is never executed.
+           if ( mb<m ) then
+              ! mb2 is the row blocking size for the row blocks before the
+              ! first top row block in the matrix a. ib is the row index for
+              ! the row blocks in the matrix a before the first top row block.
+              ! ib_bottom is the row index for the last bottom row block
+              ! in the matrix a. jb_t is the column index of the corresponding
+              ! column block in the matrix t.
+              ! initialize variables.
+              ! num_all_row_blocks is the number of row blocks in the matrix a
+              ! including the first row block.
+              mb2 = mb - n
+              m_plus_one = m + 1
+              itmp = ( m - mb - 1 ) / mb2
+              ib_bottom = itmp * mb2 + mb + 1
+              num_all_row_blocks = itmp + 2
+              jb_t = num_all_row_blocks * n + 1
+              do ib = ib_bottom, mb+1, -mb2
+                 ! determine the block size imb for the current row block
+                 ! in the matrix a.
+                 imb = min( m_plus_one - ib, mb2 )
+                 ! determine the column index jb_t for the current column block
+                 ! in the matrix t.
+                 jb_t = jb_t - n
+                 ! apply column blocks of h in the row block from right to left.
+                 ! kb is the column index of the current column block reflector
+                 ! in the matrices t and v.
+                 do kb = kb_last, 1, -nblocal
+                    ! determine the size of the current column block knb in
+                    ! the matrices t and v.
+                    knb = min( nblocal, n - kb + 1 )
+                    call stdlib_zlarfb_gett( 'I', imb, n-kb+1, knb,t( 1, jb_t+kb-1 ), ldt, a( kb, &
+                              kb ), lda,a( ib, kb ), lda, work, knb )
+                 end do
+              end do
+           end if
+           ! (2) top row block of a.
+           ! note: if mb>=m, then we have only one row block of a of size m
+           ! and we work on the entire matrix a.
+           mb1 = min( mb, m )
+           ! apply column blocks of h in the top row block from right to left.
+           ! kb is the column index of the current block reflector in
+           ! the matrices t and v.
+           do kb = kb_last, 1, -nblocal
+              ! determine the size of the current column block knb in
+              ! the matrices t and v.
+              knb = min( nblocal, n - kb + 1 )
+              if( mb1-kb-knb+1==0 ) then
+                 ! in stdlib_slarfb_gett parameters, when m=0, then the matrix b
+                 ! does not exist, hence we need to pass a dummy array
+                 ! reference dummy(1,1) to b with lddummy=1.
+                 call stdlib_zlarfb_gett( 'N', 0, n-kb+1, knb,t( 1, kb ), ldt, a( kb, kb ), lda,&
+                           dummy( 1, 1 ), 1, work, knb )
+              else
+                 call stdlib_zlarfb_gett( 'N', mb1-kb-knb+1, n-kb+1, knb,t( 1, kb ), ldt, a( kb, &
+                           kb ), lda,a( kb+knb, kb), lda, work, knb )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_zungtsqr_row
+
+
+     pure subroutine stdlib_zunm22( side, trans, m, n, n1, n2, q, ldq, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: m, n, n1, n2, ldq, ldc, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(in) :: q(ldq,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, ldwork, len, lwkopt, nb, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q;
+           ! nw is the minimum dimension of work.
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           nw = nq
+           if( n1==0 .or. n2==0 ) nw = 1
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( n1<0 .or. n1+n2/=nq ) then
+              info = -5
+           else if( n2<0 ) then
+              info = -6
+           else if( ldq<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              lwkopt = m*n
+              work( 1 ) = cmplx( lwkopt,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM22', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! degenerate cases (n1 = 0 or n2 = 0) are handled using stdlib_ztrmm.
+           if( n1==0 ) then
+              call stdlib_ztrmm( side, 'UPPER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           else if( n2==0 ) then
+              call stdlib_ztrmm( side, 'LOWER', trans, 'NON-UNIT', m, n, cone,q, ldq, c, ldc )
+                        
+              work( 1 ) = cone
+              return
+           end if
+           ! compute the largest chunk size available from the workspace.
+           nb = max( 1, min( lwork, lwkopt ) / nq )
+           if( left ) then
+              if( notran ) then
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q12.
+                    call stdlib_zlacpy( 'ALL', n1, len, c( n2+1, i ), ldc, work,ldwork )
+                    call stdlib_ztrmm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',n1, len, cone, &
+                              q( 1, n2+1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n1, len, n2,cone, q, ldq, &
+                              c( 1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q21.
+                    call stdlib_zlacpy( 'ALL', n2, len, c( 1, i ), ldc,work( n1+1 ), ldwork )
+                              
+                    call stdlib_ztrmm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',n2, len, cone, &
+                              q( n1+1, 1 ), ldq,work( n1+1 ), ldwork )
+                    ! multiply bottom part of c by q22.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n2, len, n1,cone, q( n1+1, &
+                              n2+1 ), ldq, c( n2+1, i ), ldc,cone, work( n1+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_zlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              else
+                 do i = 1, n, nb
+                    len = min( nb, n-i+1 )
+                    ldwork = m
+                    ! multiply bottom part of c by q21**h.
+                    call stdlib_zlacpy( 'ALL', n2, len, c( n1+1, i ), ldc, work,ldwork )
+                    call stdlib_ztrmm( 'LEFT', 'UPPER', 'CONJUGATE', 'NON-UNIT',n2, len, cone, q( &
+                              n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply top part of c by q11**h.
+                    call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', n2, len, n1,cone, q, ldq, c( &
+                              1, i ), ldc, cone, work,ldwork )
+                    ! multiply top part of c by q12**h.
+                    call stdlib_zlacpy( 'ALL', n1, len, c( 1, i ), ldc,work( n2+1 ), ldwork )
+                              
+                    call stdlib_ztrmm( 'LEFT', 'LOWER', 'CONJUGATE', 'NON-UNIT',n1, len, cone, q( &
+                              1, n2+1 ), ldq,work( n2+1 ), ldwork )
+                    ! multiply bottom part of c by q22**h.
+                    call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', n1, len, n2,cone, q( n1+1, n2+&
+                              1 ), ldq, c( n1+1, i ), ldc,cone, work( n2+1 ), ldwork )
+                    ! copy everything back.
+                    call stdlib_zlacpy( 'ALL', m, len, work, ldwork, c( 1, i ),ldc )
+                 end do
+              end if
+           else
+              if( notran ) then
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q21.
+                    call stdlib_zlacpy( 'ALL', len, n2, c( i, n1+1 ), ldc, work,ldwork )
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',len, n2, cone,&
+                               q( n1+1, 1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n2, n1,cone, c( i, 1 )&
+                              , ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q12.
+                    call stdlib_zlacpy( 'ALL', len, n1, c( i, 1 ), ldc,work( 1 + n2*ldwork ), &
+                              ldwork )
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'NON-UNIT',len, n1, cone,&
+                               q( 1, n2+1 ), ldq,work( 1 + n2*ldwork ), ldwork )
+                    ! multiply right part of c by q22.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', len, n1, n2,cone, c( i, n1+&
+                              1 ), ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n2*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_zlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              else
+                 do i = 1, m, nb
+                    len = min( nb, m-i+1 )
+                    ldwork = len
+                    ! multiply right part of c by q12**h.
+                    call stdlib_zlacpy( 'ALL', len, n1, c( i, n2+1 ), ldc, work,ldwork )
+                    call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE', 'NON-UNIT',len, n1, cone, q(&
+                               1, n2+1 ), ldq, work,ldwork )
+                    ! multiply left part of c by q11**h.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n1, n2,cone, c( i, 1 ), &
+                              ldc, q, ldq, cone, work,ldwork )
+                    ! multiply left part of c by q21**h.
+                    call stdlib_zlacpy( 'ALL', len, n2, c( i, 1 ), ldc,work( 1 + n1*ldwork ), &
+                              ldwork )
+                    call stdlib_ztrmm( 'RIGHT', 'UPPER', 'CONJUGATE', 'NON-UNIT',len, n2, cone, q(&
+                               n1+1, 1 ), ldq,work( 1 + n1*ldwork ), ldwork )
+                    ! multiply right part of c by q22**h.
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE', len, n2, n1,cone, c( i, n2+1 )&
+                              , ldc, q( n1+1, n2+1 ), ldq,cone, work( 1 + n1*ldwork ), ldwork )
+                    ! copy everything back.
+                    call stdlib_zlacpy( 'ALL', len, n, work, ldwork, c( i, 1 ),ldc )
+                 end do
+              end if
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zunm22
+
+     !> ZUNM2L: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunm2l( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(dp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM2L', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( nq-k+i, i )
+              a( nq-k+i, i ) = cone
+              call stdlib_zlarf( side, mi, ni, a( 1, i ), 1, taui, c, ldc, work )
+              a( nq-k+i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_zunm2l
+
+     !> ZUNM2R: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunm2r( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(dp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNM2R', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_zlarf( side, mi, ni, a( i, i ), 1, taui, c( ic, jc ), ldc,work )
+              a( i, i ) = aii
+           end do
+           return
+     end subroutine stdlib_zunm2r
+
+     !> ZUNML2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunml2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, jc, mi, ni, nq
+           complex(dp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNML2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. notran .or. .not.left .and. .not.notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              jc = 1
+           else
+              mi = m
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              if( i<nq )call stdlib_zlacgv( nq-i, a( i, i+1 ), lda )
+              aii = a( i, i )
+              a( i, i ) = cone
+              call stdlib_zlarf( side, mi, ni, a( i, i ), lda, taui, c( ic, jc ),ldc, work )
+                        
+              a( i, i ) = aii
+              if( i<nq )call stdlib_zlacgv( nq-i, a( i, i+1 ), lda )
+           end do
+           return
+     end subroutine stdlib_zunml2
+
+     !> ZUNMLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k)**H . . . H(2)**H H(1)**H
+     !> as returned by ZGELQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmlq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMLQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMLQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_zunml2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_zlarft( 'FORWARD', 'ROWWISE', nq-i+1, ib, a( i, i ),lda, tau( i ), &
+                           work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_zlarfb( side, transt, 'FORWARD', 'ROWWISE', mi, ni, ib,a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmlq
+
+     !> ZUNMQL: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(k) . . . H(2) H(1)
+     !> as returned by ZGEQLF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmql( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMQL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMQL', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_zunm2l( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. notran ) .or.( .not.left .and. .not.notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_zlarft( 'BACKWARD', 'COLUMNWISE', nq-k+i+ib-1, ib,a( 1, i ), lda, &
+                           tau( i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_zlarfb( side, trans, 'BACKWARD', 'COLUMNWISE', mi, ni,ib, a( 1, i ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmql
+
+     !> ZUNMQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZGEQRF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, jc, ldwork, lwkopt, mi, nb, nbmin, &
+                     ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n, k,-1 ) )
+              lwkopt = nw*nb + tsize
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMQR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMQR', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_zunm2r( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i) h(i+1) . . . h(i+ib-1)
+                 call stdlib_zlarft( 'FORWARD', 'COLUMNWISE', nq-i+1, ib, a( i, i ),lda, tau( i ),&
+                            work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_zlarfb( side, trans, 'FORWARD', 'COLUMNWISE', mi, ni,ib, a( i, i ), &
+                           lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmqr
+
+     !> ZUNMR2: overwrites the general complex m-by-n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmr2( side, trans, m, n, k, a, lda, tau, c, ldc,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, mi, ni, nq
+           complex(dp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMR2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+           else
+              mi = m
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(1:m-k+i,1:n)
+                 mi = m - k + i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,1:n-k+i)
+                 ni = n - k + i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = conjg( tau( i ) )
+              else
+                 taui = tau( i )
+              end if
+              call stdlib_zlacgv( nq-k+i-1, a( i, 1 ), lda )
+              aii = a( i, nq-k+i )
+              a( i, nq-k+i ) = cone
+              call stdlib_zlarf( side, mi, ni, a( i, 1 ), lda, taui, c, ldc, work )
+              a( i, nq-k+i ) = aii
+              call stdlib_zlacgv( nq-k+i-1, a( i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_zunmr2
+
+     !> ZUNMR3: overwrites the general complex m by n matrix C with
+     !> Q * C  if SIDE = 'L' and TRANS = 'N', or
+     !> Q**H* C  if SIDE = 'L' and TRANS = 'C', or
+     !> C * Q  if SIDE = 'R' and TRANS = 'N', or
+     !> C * Q**H if SIDE = 'R' and TRANS = 'C',
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZTZRZF. Q is of order m if SIDE = 'L' and of order n
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), tau(*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, notran
+           integer(ilp) :: i, i1, i2, i3, ic, ja, jc, mi, ni, nq
+           complex(dp) :: taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMR3', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. k==0 )return
+           if( ( left .and. .not.notran .or. .not.left .and. notran ) ) then
+              i1 = 1
+              i2 = k
+              i3 = 1
+           else
+              i1 = k
+              i2 = 1
+              i3 = -1
+           end if
+           if( left ) then
+              ni = n
+              ja = m - l + 1
+              jc = 1
+           else
+              mi = m
+              ja = n - l + 1
+              ic = 1
+           end if
+           do i = i1, i2, i3
+              if( left ) then
+                 ! h(i) or h(i)**h is applied to c(i:m,1:n)
+                 mi = m - i + 1
+                 ic = i
+              else
+                 ! h(i) or h(i)**h is applied to c(1:m,i:n)
+                 ni = n - i + 1
+                 jc = i
+              end if
+              ! apply h(i) or h(i)**h
+              if( notran ) then
+                 taui = tau( i )
+              else
+                 taui = conjg( tau( i ) )
+              end if
+              call stdlib_zlarz( side, mi, ni, l, a( i, ja ), lda, taui,c( ic, jc ), ldc, work )
+                        
+           end do
+           return
+     end subroutine stdlib_zunmr3
+
+     !> ZUNMRQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1)**H H(2)**H . . . H(k)**H
+     !> as returned by ZGERQF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmrq( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, iinfo, iwt, ldwork, lwkopt, mi, nb, nbmin, ni, nq, &
+                     nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( lda<max( 1, k ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMRQ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_zunmr2( side, trans, m, n, k, a, lda, tau, c, ldc, work,iinfo )
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_zlarft( 'BACKWARD', 'ROWWISE', nq-k+i+ib-1, ib,a( i, 1 ), lda, tau( &
+                           i ), work( iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(1:m-k+i+ib-1,1:n)
+                    mi = m - k + i + ib - 1
+                 else
+                    ! h or h**h is applied to c(1:m,1:n-k+i+ib-1)
+                    ni = n - k + i + ib - 1
+                 end if
+                 ! apply h or h**h
+                 call stdlib_zlarfb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, a( i, 1 ), &
+                           lda, work( iwt ), ldt, c, ldc,work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmrq
+
+     !> ZUNMRZ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of k
+     !> elementary reflectors
+     !> Q = H(1) H(2) . . . H(k)
+     !> as returned by ZTZRZF. Q is of order M if SIDE = 'L' and of order N
+     !> if SIDE = 'R'.
+
+     pure subroutine stdlib_zunmrz( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, l, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           ! Local Scalars 
+           logical(lk) :: left, lquery, notran
+           character :: transt
+           integer(ilp) :: i, i1, i2, i3, ib, ic, iinfo, iwt, ja, jc, ldwork, lwkopt, mi, nb, &
+                     nbmin, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>nq ) then
+              info = -5
+           else if( l<0 .or. ( left .and. ( l>m ) ) .or.( .not.left .and. ( l>n ) ) ) then
+              info = -6
+           else if( lda<max( 1, k ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<max( 1, nw ) .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              if( m==0 .or. n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n,k, -1 ) )
+                           
+                 lwkopt = nw*nb + tsize
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMRZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! determine the block size.  nb may be at most nbmax, where nbmax
+           ! is used to define the local array t.
+           nb = min( nbmax, stdlib_ilaenv( 1, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+           nbmin = 2
+           ldwork = nw
+           if( nb>1 .and. nb<k ) then
+              if( lwork<lwkopt ) then
+                 nb = (lwork-tsize) / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZUNMRQ', side // trans, m, n, k,-1 ) )
+              end if
+           end if
+           if( nb<nbmin .or. nb>=k ) then
+              ! use unblocked code
+              call stdlib_zunmr3( side, trans, m, n, k, l, a, lda, tau, c, ldc,work, iinfo )
+                        
+           else
+              ! use blocked code
+              iwt = 1 + nw*nb
+              if( ( left .and. .not.notran ) .or.( .not.left .and. notran ) ) then
+                 i1 = 1
+                 i2 = k
+                 i3 = nb
+              else
+                 i1 = ( ( k-1 ) / nb )*nb + 1
+                 i2 = 1
+                 i3 = -nb
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+                 ja = m - l + 1
+              else
+                 mi = m
+                 ic = 1
+                 ja = n - l + 1
+              end if
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              do i = i1, i2, i3
+                 ib = min( nb, k-i+1 )
+                 ! form the triangular factor of the block reflector
+                 ! h = h(i+ib-1) . . . h(i+1) h(i)
+                 call stdlib_zlarzt( 'BACKWARD', 'ROWWISE', l, ib, a( i, ja ), lda,tau( i ), work(&
+                            iwt ), ldt )
+                 if( left ) then
+                    ! h or h**h is applied to c(i:m,1:n)
+                    mi = m - i + 1
+                    ic = i
+                 else
+                    ! h or h**h is applied to c(1:m,i:n)
+                    ni = n - i + 1
+                    jc = i
+                 end if
+                 ! apply h or h**h
+                 call stdlib_zlarzb( side, transt, 'BACKWARD', 'ROWWISE', mi, ni,ib, l, a( i, ja )&
+                           , lda, work( iwt ), ldt,c( ic, jc ), ldc, work, ldwork )
+              end do
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmrz
+
+     !> ZBBCSD: computes the CS decomposition of a unitary matrix in
+     !> bidiagonal-block form,
+     !> [ B11 | B12 0  0 ]
+     !> [  0  |  0 -I  0 ]
+     !> X = [----------------]
+     !> [ B21 | B22 0  0 ]
+     !> [  0  |  0  0  I ]
+     !> [  C | -S  0  0 ]
+     !> [ U1 |    ] [  0 |  0 -I  0 ] [ V1 |    ]**H
+     !> = [---------] [---------------] [---------]   .
+     !> [    | U2 ] [  S |  C  0  0 ] [    | V2 ]
+     !> [  0 |  0  0  I ]
+     !> X is M-by-M, its top-left block is P-by-Q, and Q must be no larger
+     !> than P, M-P, or M-Q. (If Q is not the smallest index, then X must be
+     !> transposed and/or permuted. This can be done in constant time using
+     !> the TRANS and SIGNS options. See ZUNCSD for details.)
+     !> The bidiagonal matrices B11, B12, B21, and B22 are represented
+     !> implicitly by angles THETA(1:Q) and PHI(1:Q-1).
+     !> The unitary matrices U1, U2, V1T, and V2T are input/output.
+     !> The input matrices are pre- or post-multiplied by the appropriate
+     !> singular vector matrices.
+
+     pure subroutine stdlib_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, phi, u1, &
+     ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, b11d, b11e, b12d, b12e, b21d, b21e,b22d, b22e, rwork, &
+               lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, lrwork, m, p, q
+           ! Array Arguments 
+           real(dp), intent(out) :: b11d(*), b11e(*), b12d(*), b12e(*), b21d(*), b21e(*), b22d(*),&
+                      b22e(*), rwork(*)
+           real(dp), intent(inout) :: phi(*), theta(*)
+           complex(dp), intent(inout) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*)
+                     
+        ! ===================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 6
+           real(dp), parameter :: hundred = 100.0_dp
+           real(dp), parameter :: meighth = -0.125_dp
+           real(dp), parameter :: piover2 = 1.57079632679489661923132169163975144210_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: colmajor, lquery, restart11, restart12, restart21, restart22, wantu1, &
+                     wantu2, wantv1t, wantv2t
+           integer(ilp) :: i, imin, imax, iter, iu1cs, iu1sn, iu2cs, iu2sn, iv1tcs, iv1tsn, &
+                     iv2tcs, iv2tsn, j, lrworkmin, lrworkopt, maxit, mini
+           real(dp) :: b11bulge, b12bulge, b21bulge, b22bulge, dummy, eps, mu, nu, r, sigma11, &
+                     sigma21, temp, thetamax, thetamin, thresh, tol, tolmul, unfl, x1, x2, y1, y2
+           ! Intrinsic Functions 
+           intrinsic :: abs,atan2,cos,max,min,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lrwork == -1
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           if( m < 0 ) then
+              info = -6
+           else if( p < 0 .or. p > m ) then
+              info = -7
+           else if( q < 0 .or. q > m ) then
+              info = -8
+           else if( q > p .or. q > m-p .or. q > m-q ) then
+              info = -8
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -12
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -14
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -16
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -18
+           end if
+           ! quick return if q = 0
+           if( info == 0 .and. q == 0 ) then
+              lrworkmin = 1
+              rwork(1) = lrworkmin
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              iu1cs = 1
+              iu1sn = iu1cs + q
+              iu2cs = iu1sn + q
+              iu2sn = iu2cs + q
+              iv1tcs = iu2sn + q
+              iv1tsn = iv1tcs + q
+              iv2tcs = iv1tsn + q
+              iv2tsn = iv2tcs + q
+              lrworkopt = iv2tsn + q - 1
+              lrworkmin = lrworkopt
+              rwork(1) = lrworkopt
+              if( lrwork < lrworkmin .and. .not. lquery ) then
+                 info = -28
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZBBCSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'EPSILON' )
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           tolmul = max( ten, min( hundred, eps**meighth ) )
+           tol = tolmul*eps
+           thresh = max( tol, maxitr*q*q*unfl )
+           ! test for negligible sines or cosines
+           do i = 1, q
+              if( theta(i) < thresh ) then
+                 theta(i) = zero
+              else if( theta(i) > piover2-thresh ) then
+                 theta(i) = piover2
+              end if
+           end do
+           do i = 1, q-1
+              if( phi(i) < thresh ) then
+                 phi(i) = zero
+              else if( phi(i) > piover2-thresh ) then
+                 phi(i) = piover2
+              end if
+           end do
+           ! initial deflation
+           imax = q
+           do while( imax > 1 )
+              if( phi(imax-1) /= zero ) then
+                 exit
+              end if
+              imax = imax - 1
+           end do
+           imin = imax - 1
+           if  ( imin > 1 ) then
+              do while( phi(imin-1) /= zero )
+                 imin = imin - 1
+                 if  ( imin <= 1 ) exit
+              end do
+           end if
+           ! initialize iteration counter
+           maxit = maxitr*q*q
+           iter = 0
+           ! begin main iteration loop
+           do while( imax > 1 )
+              ! compute the matrix entries
+              b11d(imin) = cos( theta(imin) )
+              b21d(imin) = -sin( theta(imin) )
+              do i = imin, imax - 1
+                 b11e(i) = -sin( theta(i) ) * sin( phi(i) )
+                 b11d(i+1) = cos( theta(i+1) ) * cos( phi(i) )
+                 b12d(i) = sin( theta(i) ) * cos( phi(i) )
+                 b12e(i) = cos( theta(i+1) ) * sin( phi(i) )
+                 b21e(i) = -cos( theta(i) ) * sin( phi(i) )
+                 b21d(i+1) = -sin( theta(i+1) ) * cos( phi(i) )
+                 b22d(i) = cos( theta(i) ) * cos( phi(i) )
+                 b22e(i) = -sin( theta(i+1) ) * sin( phi(i) )
+              end do
+              b12d(imax) = sin( theta(imax) )
+              b22d(imax) = cos( theta(imax) )
+              ! abort if not converging; otherwise, increment iter
+              if( iter > maxit ) then
+                 info = 0
+                 do i = 1, q
+                    if( phi(i) /= zero )info = info + 1
+                 end do
+                 return
+              end if
+              iter = iter + imax - imin
+              ! compute shifts
+              thetamax = theta(imin)
+              thetamin = theta(imin)
+              do i = imin+1, imax
+                 if( theta(i) > thetamax )thetamax = theta(i)
+                 if( theta(i) < thetamin )thetamin = theta(i)
+              end do
+              if( thetamax > piover2 - thresh ) then
+                 ! zero on diagonals of b11 and b22; induce deflation with a
+                 ! zero shift
+                 mu = zero
+                 nu = one
+              else if( thetamin < thresh ) then
+                 ! zero on diagonals of b12 and b22; induce deflation with a
+                 ! zero shift
+                 mu = one
+                 nu = zero
+              else
+                 ! compute shifts for b11 and b21 and use the lesser
+                 call stdlib_dlas2( b11d(imax-1), b11e(imax-1), b11d(imax), sigma11,dummy )
+                           
+                 call stdlib_dlas2( b21d(imax-1), b21e(imax-1), b21d(imax), sigma21,dummy )
+                           
+                 if( sigma11 <= sigma21 ) then
+                    mu = sigma11
+                    nu = sqrt( one - mu**2 )
+                    if( mu < thresh ) then
+                       mu = zero
+                       nu = one
+                    end if
+                 else
+                    nu = sigma21
+                    mu = sqrt( 1.0_dp - nu**2 )
+                    if( nu < thresh ) then
+                       mu = one
+                       nu = zero
+                    end if
+                 end if
+              end if
+              ! rotate to produce bulges in b11 and b21
+              if( mu <= nu ) then
+                 call stdlib_dlartgs( b11d(imin), b11e(imin), mu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              else
+                 call stdlib_dlartgs( b21d(imin), b21e(imin), nu,rwork(iv1tcs+imin-1), rwork(&
+                           iv1tsn+imin-1) )
+              end if
+              temp = rwork(iv1tcs+imin-1)*b11d(imin) +rwork(iv1tsn+imin-1)*b11e(imin)
+              b11e(imin) = rwork(iv1tcs+imin-1)*b11e(imin) -rwork(iv1tsn+imin-1)*b11d(imin)
+                        
+              b11d(imin) = temp
+              b11bulge = rwork(iv1tsn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iv1tcs+imin-1)*b11d(imin+1)
+              temp = rwork(iv1tcs+imin-1)*b21d(imin) +rwork(iv1tsn+imin-1)*b21e(imin)
+              b21e(imin) = rwork(iv1tcs+imin-1)*b21e(imin) -rwork(iv1tsn+imin-1)*b21d(imin)
+                        
+              b21d(imin) = temp
+              b21bulge = rwork(iv1tsn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iv1tcs+imin-1)*b21d(imin+1)
+              ! compute theta(imin)
+              theta( imin ) = atan2( sqrt( b21d(imin)**2+b21bulge**2 ),sqrt( b11d(imin)**2+&
+                        b11bulge**2 ) )
+              ! chase the bulges in b11(imin+1,imin) and b21(imin+1,imin)
+              if( b11d(imin)**2+b11bulge**2 > thresh**2 ) then
+                 call stdlib_dlartgp( b11bulge, b11d(imin), rwork(iu1sn+imin-1),rwork(iu1cs+imin-&
+                           1), r )
+              else if( mu <= nu ) then
+                 call stdlib_dlartgs( b11e( imin ), b11d( imin + 1 ), mu,rwork(iu1cs+imin-1), &
+                           rwork(iu1sn+imin-1) )
+              else
+                 call stdlib_dlartgs( b12d( imin ), b12e( imin ), nu,rwork(iu1cs+imin-1), rwork(&
+                           iu1sn+imin-1) )
+              end if
+              if( b21d(imin)**2+b21bulge**2 > thresh**2 ) then
+                 call stdlib_dlartgp( b21bulge, b21d(imin), rwork(iu2sn+imin-1),rwork(iu2cs+imin-&
+                           1), r )
+              else if( nu < mu ) then
+                 call stdlib_dlartgs( b21e( imin ), b21d( imin + 1 ), nu,rwork(iu2cs+imin-1), &
+                           rwork(iu2sn+imin-1) )
+              else
+                 call stdlib_dlartgs( b22d(imin), b22e(imin), mu,rwork(iu2cs+imin-1), rwork(iu2sn+&
+                           imin-1) )
+              end if
+              rwork(iu2cs+imin-1) = -rwork(iu2cs+imin-1)
+              rwork(iu2sn+imin-1) = -rwork(iu2sn+imin-1)
+              temp = rwork(iu1cs+imin-1)*b11e(imin) +rwork(iu1sn+imin-1)*b11d(imin+1)
+              b11d(imin+1) = rwork(iu1cs+imin-1)*b11d(imin+1) -rwork(iu1sn+imin-1)*b11e(imin)
+                        
+              b11e(imin) = temp
+              if( imax > imin+1 ) then
+                 b11bulge = rwork(iu1sn+imin-1)*b11e(imin+1)
+                 b11e(imin+1) = rwork(iu1cs+imin-1)*b11e(imin+1)
+              end if
+              temp = rwork(iu1cs+imin-1)*b12d(imin) +rwork(iu1sn+imin-1)*b12e(imin)
+              b12e(imin) = rwork(iu1cs+imin-1)*b12e(imin) -rwork(iu1sn+imin-1)*b12d(imin)
+              b12d(imin) = temp
+              b12bulge = rwork(iu1sn+imin-1)*b12d(imin+1)
+              b12d(imin+1) = rwork(iu1cs+imin-1)*b12d(imin+1)
+              temp = rwork(iu2cs+imin-1)*b21e(imin) +rwork(iu2sn+imin-1)*b21d(imin+1)
+              b21d(imin+1) = rwork(iu2cs+imin-1)*b21d(imin+1) -rwork(iu2sn+imin-1)*b21e(imin)
+                        
+              b21e(imin) = temp
+              if( imax > imin+1 ) then
+                 b21bulge = rwork(iu2sn+imin-1)*b21e(imin+1)
+                 b21e(imin+1) = rwork(iu2cs+imin-1)*b21e(imin+1)
+              end if
+              temp = rwork(iu2cs+imin-1)*b22d(imin) +rwork(iu2sn+imin-1)*b22e(imin)
+              b22e(imin) = rwork(iu2cs+imin-1)*b22e(imin) -rwork(iu2sn+imin-1)*b22d(imin)
+              b22d(imin) = temp
+              b22bulge = rwork(iu2sn+imin-1)*b22d(imin+1)
+              b22d(imin+1) = rwork(iu2cs+imin-1)*b22d(imin+1)
+              ! inner loop: chase bulges from b11(imin,imin+2),
+              ! b12(imin,imin+1), b21(imin,imin+2), and b22(imin,imin+1) to
+              ! bottom-right
+              do i = imin+1, imax-1
+                 ! compute phi(i-1)
+                 x1 = sin(theta(i-1))*b11e(i-1) + cos(theta(i-1))*b21e(i-1)
+                 x2 = sin(theta(i-1))*b11bulge + cos(theta(i-1))*b21bulge
+                 y1 = sin(theta(i-1))*b12d(i-1) + cos(theta(i-1))*b22d(i-1)
+                 y2 = sin(theta(i-1))*b12bulge + cos(theta(i-1))*b22bulge
+                 phi(i-1) = atan2( sqrt(x1**2+x2**2), sqrt(y1**2+y2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 = b11e(i-1)**2 + b11bulge**2 <= thresh**2
+                 restart21 = b21e(i-1)**2 + b21bulge**2 <= thresh**2
+                 restart12 = b12d(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart22 = b22d(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i-1,i+1), b12(i-1,i),
+                 ! b21(i-1,i+1), and b22(i-1,i). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart21 ) then
+                    call stdlib_dlartgp( x2, x1, rwork(iv1tsn+i-1),rwork(iv1tcs+i-1), r )
+                 else if( .not. restart11 .and. restart21 ) then
+                    call stdlib_dlartgp( b11bulge, b11e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( restart11 .and. .not. restart21 ) then
+                    call stdlib_dlartgp( b21bulge, b21e(i-1), rwork(iv1tsn+i-1),rwork(iv1tcs+i-1),&
+                               r )
+                 else if( mu <= nu ) then
+                    call stdlib_dlartgs( b11d(i), b11e(i), mu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 else
+                    call stdlib_dlartgs( b21d(i), b21e(i), nu, rwork(iv1tcs+i-1),rwork(iv1tsn+i-1)&
+                               )
+                 end if
+                 rwork(iv1tcs+i-1) = -rwork(iv1tcs+i-1)
+                 rwork(iv1tsn+i-1) = -rwork(iv1tsn+i-1)
+                 if( .not. restart12 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( y2, y1, rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-1-1), r )
+                              
+                 else if( .not. restart12 .and. restart22 ) then
+                    call stdlib_dlartgp( b12bulge, b12d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( restart12 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( b22bulge, b22d(i-1), rwork(iv2tsn+i-1-1),rwork(iv2tcs+i-&
+                              1-1), r )
+                 else if( nu < mu ) then
+                    call stdlib_dlartgs( b12e(i-1), b12d(i), nu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 else
+                    call stdlib_dlartgs( b22e(i-1), b22d(i), mu,rwork(iv2tcs+i-1-1), rwork(iv2tsn+&
+                              i-1-1) )
+                 end if
+                 temp = rwork(iv1tcs+i-1)*b11d(i) + rwork(iv1tsn+i-1)*b11e(i)
+                 b11e(i) = rwork(iv1tcs+i-1)*b11e(i) -rwork(iv1tsn+i-1)*b11d(i)
+                 b11d(i) = temp
+                 b11bulge = rwork(iv1tsn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iv1tcs+i-1)*b11d(i+1)
+                 temp = rwork(iv1tcs+i-1)*b21d(i) + rwork(iv1tsn+i-1)*b21e(i)
+                 b21e(i) = rwork(iv1tcs+i-1)*b21e(i) -rwork(iv1tsn+i-1)*b21d(i)
+                 b21d(i) = temp
+                 b21bulge = rwork(iv1tsn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iv1tcs+i-1)*b21d(i+1)
+                 temp = rwork(iv2tcs+i-1-1)*b12e(i-1) +rwork(iv2tsn+i-1-1)*b12d(i)
+                 b12d(i) = rwork(iv2tcs+i-1-1)*b12d(i) -rwork(iv2tsn+i-1-1)*b12e(i-1)
+                 b12e(i-1) = temp
+                 b12bulge = rwork(iv2tsn+i-1-1)*b12e(i)
+                 b12e(i) = rwork(iv2tcs+i-1-1)*b12e(i)
+                 temp = rwork(iv2tcs+i-1-1)*b22e(i-1) +rwork(iv2tsn+i-1-1)*b22d(i)
+                 b22d(i) = rwork(iv2tcs+i-1-1)*b22d(i) -rwork(iv2tsn+i-1-1)*b22e(i-1)
+                 b22e(i-1) = temp
+                 b22bulge = rwork(iv2tsn+i-1-1)*b22e(i)
+                 b22e(i) = rwork(iv2tcs+i-1-1)*b22e(i)
+                 ! compute theta(i)
+                 x1 = cos(phi(i-1))*b11d(i) + sin(phi(i-1))*b12e(i-1)
+                 x2 = cos(phi(i-1))*b11bulge + sin(phi(i-1))*b12bulge
+                 y1 = cos(phi(i-1))*b21d(i) + sin(phi(i-1))*b22e(i-1)
+                 y2 = cos(phi(i-1))*b21bulge + sin(phi(i-1))*b22bulge
+                 theta(i) = atan2( sqrt(y1**2+y2**2), sqrt(x1**2+x2**2) )
+                 ! determine if there are bulges to chase or if a new direct
+                 ! summand has been reached
+                 restart11 =   b11d(i)**2 + b11bulge**2 <= thresh**2
+                 restart12 = b12e(i-1)**2 + b12bulge**2 <= thresh**2
+                 restart21 =   b21d(i)**2 + b21bulge**2 <= thresh**2
+                 restart22 = b22e(i-1)**2 + b22bulge**2 <= thresh**2
+                 ! if possible, chase bulges from b11(i+1,i), b12(i+1,i-1),
+                 ! b21(i+1,i), and b22(i+1,i-1). if necessary, restart bulge-
+                 ! chasing by applying the original shift again.
+                 if( .not. restart11 .and. .not. restart12 ) then
+                    call stdlib_dlartgp( x2, x1, rwork(iu1sn+i-1), rwork(iu1cs+i-1),r )
+                 else if( .not. restart11 .and. restart12 ) then
+                    call stdlib_dlartgp( b11bulge, b11d(i), rwork(iu1sn+i-1),rwork(iu1cs+i-1), r )
+                              
+                 else if( restart11 .and. .not. restart12 ) then
+                    call stdlib_dlartgp( b12bulge, b12e(i-1), rwork(iu1sn+i-1),rwork(iu1cs+i-1), &
+                              r )
+                 else if( mu <= nu ) then
+                    call stdlib_dlartgs( b11e(i), b11d(i+1), mu, rwork(iu1cs+i-1),rwork(iu1sn+i-1)&
+                               )
+                 else
+                    call stdlib_dlartgs( b12d(i), b12e(i), nu, rwork(iu1cs+i-1),rwork(iu1sn+i-1) )
+                              
+                 end if
+                 if( .not. restart21 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( y2, y1, rwork(iu2sn+i-1), rwork(iu2cs+i-1),r )
+                 else if( .not. restart21 .and. restart22 ) then
+                    call stdlib_dlartgp( b21bulge, b21d(i), rwork(iu2sn+i-1),rwork(iu2cs+i-1), r )
+                              
+                 else if( restart21 .and. .not. restart22 ) then
+                    call stdlib_dlartgp( b22bulge, b22e(i-1), rwork(iu2sn+i-1),rwork(iu2cs+i-1), &
+                              r )
+                 else if( nu < mu ) then
+                    call stdlib_dlartgs( b21e(i), b21e(i+1), nu, rwork(iu2cs+i-1),rwork(iu2sn+i-1)&
+                               )
+                 else
+                    call stdlib_dlartgs( b22d(i), b22e(i), mu, rwork(iu2cs+i-1),rwork(iu2sn+i-1) )
+                              
+                 end if
+                 rwork(iu2cs+i-1) = -rwork(iu2cs+i-1)
+                 rwork(iu2sn+i-1) = -rwork(iu2sn+i-1)
+                 temp = rwork(iu1cs+i-1)*b11e(i) + rwork(iu1sn+i-1)*b11d(i+1)
+                 b11d(i+1) = rwork(iu1cs+i-1)*b11d(i+1) -rwork(iu1sn+i-1)*b11e(i)
+                 b11e(i) = temp
+                 if( i < imax - 1 ) then
+                    b11bulge = rwork(iu1sn+i-1)*b11e(i+1)
+                    b11e(i+1) = rwork(iu1cs+i-1)*b11e(i+1)
+                 end if
+                 temp = rwork(iu2cs+i-1)*b21e(i) + rwork(iu2sn+i-1)*b21d(i+1)
+                 b21d(i+1) = rwork(iu2cs+i-1)*b21d(i+1) -rwork(iu2sn+i-1)*b21e(i)
+                 b21e(i) = temp
+                 if( i < imax - 1 ) then
+                    b21bulge = rwork(iu2sn+i-1)*b21e(i+1)
+                    b21e(i+1) = rwork(iu2cs+i-1)*b21e(i+1)
+                 end if
+                 temp = rwork(iu1cs+i-1)*b12d(i) + rwork(iu1sn+i-1)*b12e(i)
+                 b12e(i) = rwork(iu1cs+i-1)*b12e(i) -rwork(iu1sn+i-1)*b12d(i)
+                 b12d(i) = temp
+                 b12bulge = rwork(iu1sn+i-1)*b12d(i+1)
+                 b12d(i+1) = rwork(iu1cs+i-1)*b12d(i+1)
+                 temp = rwork(iu2cs+i-1)*b22d(i) + rwork(iu2sn+i-1)*b22e(i)
+                 b22e(i) = rwork(iu2cs+i-1)*b22e(i) -rwork(iu2sn+i-1)*b22d(i)
+                 b22d(i) = temp
+                 b22bulge = rwork(iu2sn+i-1)*b22d(i+1)
+                 b22d(i+1) = rwork(iu2cs+i-1)*b22d(i+1)
+              end do
+              ! compute phi(imax-1)
+              x1 = sin(theta(imax-1))*b11e(imax-1) +cos(theta(imax-1))*b21e(imax-1)
+              y1 = sin(theta(imax-1))*b12d(imax-1) +cos(theta(imax-1))*b22d(imax-1)
+              y2 = sin(theta(imax-1))*b12bulge + cos(theta(imax-1))*b22bulge
+              phi(imax-1) = atan2( abs(x1), sqrt(y1**2+y2**2) )
+              ! chase bulges from b12(imax-1,imax) and b22(imax-1,imax)
+              restart12 = b12d(imax-1)**2 + b12bulge**2 <= thresh**2
+              restart22 = b22d(imax-1)**2 + b22bulge**2 <= thresh**2
+              if( .not. restart12 .and. .not. restart22 ) then
+                 call stdlib_dlartgp( y2, y1, rwork(iv2tsn+imax-1-1),rwork(iv2tcs+imax-1-1), r )
+                           
+              else if( .not. restart12 .and. restart22 ) then
+                 call stdlib_dlartgp( b12bulge, b12d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( restart12 .and. .not. restart22 ) then
+                 call stdlib_dlartgp( b22bulge, b22d(imax-1),rwork(iv2tsn+imax-1-1),rwork(iv2tcs+&
+                           imax-1-1), r )
+              else if( nu < mu ) then
+                 call stdlib_dlartgs( b12e(imax-1), b12d(imax), nu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              else
+                 call stdlib_dlartgs( b22e(imax-1), b22d(imax), mu,rwork(iv2tcs+imax-1-1),rwork(&
+                           iv2tsn+imax-1-1) )
+              end if
+              temp = rwork(iv2tcs+imax-1-1)*b12e(imax-1) +rwork(iv2tsn+imax-1-1)*b12d(imax)
+                        
+              b12d(imax) = rwork(iv2tcs+imax-1-1)*b12d(imax) -rwork(iv2tsn+imax-1-1)*b12e(imax-1)
+                        
+              b12e(imax-1) = temp
+              temp = rwork(iv2tcs+imax-1-1)*b22e(imax-1) +rwork(iv2tsn+imax-1-1)*b22d(imax)
+                        
+              b22d(imax) = rwork(iv2tcs+imax-1-1)*b22d(imax) -rwork(iv2tsn+imax-1-1)*b22e(imax-1)
+                        
+              b22e(imax-1) = temp
+              ! update singular vectors
+              if( wantu1 ) then
+                 if( colmajor ) then
+                    call stdlib_zlasr( 'R', 'V', 'F', p, imax-imin+1,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(1,imin), ldu1 )
+                 else
+                    call stdlib_zlasr( 'L', 'V', 'F', imax-imin+1, p,rwork(iu1cs+imin-1), rwork(&
+                              iu1sn+imin-1),u1(imin,1), ldu1 )
+                 end if
+              end if
+              if( wantu2 ) then
+                 if( colmajor ) then
+                    call stdlib_zlasr( 'R', 'V', 'F', m-p, imax-imin+1,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(1,imin), ldu2 )
+                 else
+                    call stdlib_zlasr( 'L', 'V', 'F', imax-imin+1, m-p,rwork(iu2cs+imin-1), rwork(&
+                              iu2sn+imin-1),u2(imin,1), ldu2 )
+                 end if
+              end if
+              if( wantv1t ) then
+                 if( colmajor ) then
+                    call stdlib_zlasr( 'L', 'V', 'F', imax-imin+1, q,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(imin,1), ldv1t )
+                 else
+                    call stdlib_zlasr( 'R', 'V', 'F', q, imax-imin+1,rwork(iv1tcs+imin-1), rwork(&
+                              iv1tsn+imin-1),v1t(1,imin), ldv1t )
+                 end if
+              end if
+              if( wantv2t ) then
+                 if( colmajor ) then
+                    call stdlib_zlasr( 'L', 'V', 'F', imax-imin+1, m-q,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(imin,1), ldv2t )
+                 else
+                    call stdlib_zlasr( 'R', 'V', 'F', m-q, imax-imin+1,rwork(iv2tcs+imin-1), &
+                              rwork(iv2tsn+imin-1),v2t(1,imin), ldv2t )
+                 end if
+              end if
+              ! fix signs on b11(imax-1,imax) and b21(imax-1,imax)
+              if( b11e(imax-1)+b21e(imax-1) > 0 ) then
+                 b11d(imax) = -b11d(imax)
+                 b21d(imax) = -b21d(imax)
+                 if( wantv1t ) then
+                    if( colmajor ) then
+                       call stdlib_zscal( q, cnegone, v1t(imax,1), ldv1t )
+                    else
+                       call stdlib_zscal( q, cnegone, v1t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! compute theta(imax)
+              x1 = cos(phi(imax-1))*b11d(imax) +sin(phi(imax-1))*b12e(imax-1)
+              y1 = cos(phi(imax-1))*b21d(imax) +sin(phi(imax-1))*b22e(imax-1)
+              theta(imax) = atan2( abs(y1), abs(x1) )
+              ! fix signs on b11(imax,imax), b12(imax,imax-1), b21(imax,imax),
+              ! and b22(imax,imax-1)
+              if( b11d(imax)+b12e(imax-1) < 0 ) then
+                 b12d(imax) = -b12d(imax)
+                 if( wantu1 ) then
+                    if( colmajor ) then
+                       call stdlib_zscal( p, cnegone, u1(1,imax), 1 )
+                    else
+                       call stdlib_zscal( p, cnegone, u1(imax,1), ldu1 )
+                    end if
+                 end if
+              end if
+              if( b21d(imax)+b22e(imax-1) > 0 ) then
+                 b22d(imax) = -b22d(imax)
+                 if( wantu2 ) then
+                    if( colmajor ) then
+                       call stdlib_zscal( m-p, cnegone, u2(1,imax), 1 )
+                    else
+                       call stdlib_zscal( m-p, cnegone, u2(imax,1), ldu2 )
+                    end if
+                 end if
+              end if
+              ! fix signs on b12(imax,imax) and b22(imax,imax)
+              if( b12d(imax)+b22d(imax) < 0 ) then
+                 if( wantv2t ) then
+                    if( colmajor ) then
+                       call stdlib_zscal( m-q, cnegone, v2t(imax,1), ldv2t )
+                    else
+                       call stdlib_zscal( m-q, cnegone, v2t(1,imax), 1 )
+                    end if
+                 end if
+              end if
+              ! test for negligible sines or cosines
+              do i = imin, imax
+                 if( theta(i) < thresh ) then
+                    theta(i) = zero
+                 else if( theta(i) > piover2-thresh ) then
+                    theta(i) = piover2
+                 end if
+              end do
+              do i = imin, imax-1
+                 if( phi(i) < thresh ) then
+                    phi(i) = zero
+                 else if( phi(i) > piover2-thresh ) then
+                    phi(i) = piover2
+                 end if
+              end do
+              ! deflate
+              if (imax > 1) then
+                 do while( phi(imax-1) == zero )
+                    imax = imax - 1
+                    if (imax <= 1) exit
+                 end do
+              end if
+              if( imin > imax - 1 )imin = imax - 1
+              if (imin > 1) then
+                 do while (phi(imin-1) /= zero)
+                     imin = imin - 1
+                     if (imin <= 1) exit
+                 end do
+              end if
+              ! repeat main iteration loop
+           end do
+           ! postprocessing: order theta from least to greatest
+           do i = 1, q
+              mini = i
+              thetamin = theta(i)
+              do j = i+1, q
+                 if( theta(j) < thetamin ) then
+                    mini = j
+                    thetamin = theta(j)
+                 end if
+              end do
+              if( mini /= i ) then
+                 theta(mini) = theta(i)
+                 theta(i) = thetamin
+                 if( colmajor ) then
+                    if( wantu1 )call stdlib_zswap( p, u1(1,i), 1, u1(1,mini), 1 )
+                    if( wantu2 )call stdlib_zswap( m-p, u2(1,i), 1, u2(1,mini), 1 )
+                    if( wantv1t )call stdlib_zswap( q, v1t(i,1), ldv1t, v1t(mini,1), ldv1t )
+                              
+                    if( wantv2t )call stdlib_zswap( m-q, v2t(i,1), ldv2t, v2t(mini,1),ldv2t )
+                              
+                 else
+                    if( wantu1 )call stdlib_zswap( p, u1(i,1), ldu1, u1(mini,1), ldu1 )
+                    if( wantu2 )call stdlib_zswap( m-p, u2(i,1), ldu2, u2(mini,1), ldu2 )
+                    if( wantv1t )call stdlib_zswap( q, v1t(1,i), 1, v1t(1,mini), 1 )
+                    if( wantv2t )call stdlib_zswap( m-q, v2t(1,i), 1, v2t(1,mini), 1 )
+                 end if
+              end if
+           end do
+           return
+     end subroutine stdlib_zbbcsd
+
+     !> ZBDSQR: computes the singular values and, optionally, the right and/or
+     !> left singular vectors from the singular value decomposition (SVD) of
+     !> a real N-by-N (upper or lower) bidiagonal matrix B using the implicit
+     !> zero-shift QR algorithm.  The SVD of B has the form
+     !> B = Q * S * P**H
+     !> where S is the diagonal matrix of singular values, Q is an orthogonal
+     !> matrix of left singular vectors, and P is an orthogonal matrix of
+     !> right singular vectors.  If left singular vectors are requested, this
+     !> subroutine actually returns U*Q instead of Q, and, if right singular
+     !> vectors are requested, this subroutine returns P**H*VT instead of
+     !> P**H, for given complex input matrices U and VT.  When U and VT are
+     !> the unitary matrices that reduce a general matrix A to bidiagonal
+     !> form: A = U*B*VT, as computed by ZGEBRD, then
+     !> A = (U*Q) * S * (P**H*VT)
+     !> is the SVD of A.  Optionally, the subroutine may also compute Q**H*C
+     !> for a given complex input matrix C.
+     !> See "Computing  Small Singular Values of Bidiagonal Matrices With
+     !> Guaranteed High Relative Accuracy," by J. Demmel and W. Kahan,
+     !> LAPACK Working Note #3 (or SIAM J. Sci. Statist. Comput. vol. 11,
+     !> no. 5, pp. 873-912, Sept 1990) and
+     !> "Accurate singular values and differential qd algorithms," by
+     !> B. Parlett and V. Fernando, Technical Report CPAM-554, Mathematics
+     !> Department, University of California at Berkeley, July 1992
+     !> for a detailed description of the algorithm.
+
+     pure subroutine stdlib_zbdsqr( uplo, n, ncvt, nru, ncc, d, e, vt, ldvt, u,ldu, c, ldc, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, ldu, ldvt, n, ncc, ncvt, nru
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: c(ldc,*), u(ldu,*), vt(ldvt,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: hndrth = 0.01_dp
+           real(dp), parameter :: hndrd = 100.0_dp
+           real(dp), parameter :: meigth = -0.125_dp
+           integer(ilp), parameter :: maxitr = 6
+           
+           
+           
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, rotate
+           integer(ilp) :: i, idir, isub, iter, j, ll, lll, m, maxit, nm1, nm12, nm13, oldll, &
+                     oldm
+           real(dp) :: abse, abss, cosl, cosr, cs, eps, f, g, h, mu, oldcs, oldsn, r, shift, &
+           sigmn, sigmx, sinl, sinr, sll, smax, smin, sminl, sminoa, sn, thresh, tol, tolmul, &
+                     unfl
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sign,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.lower ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ncvt<0 ) then
+              info = -3
+           else if( nru<0 ) then
+              info = -4
+           else if( ncc<0 ) then
+              info = -5
+           else if( ( ncvt==0 .and. ldvt<1 ) .or.( ncvt>0 .and. ldvt<max( 1, n ) ) ) then
+              info = -9
+           else if( ldu<max( 1, nru ) ) then
+              info = -11
+           else if( ( ncc==0 .and. ldc<1 ) .or.( ncc>0 .and. ldc<max( 1, n ) ) ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZBDSQR', -info )
+              return
+           end if
+           if( n==0 )return
+           if( n==1 )go to 160
+           ! rotate is true if any singular vectors desired, false otherwise
+           rotate = ( ncvt>0 ) .or. ( nru>0 ) .or. ( ncc>0 )
+           ! if no singular vectors desired, use qd algorithm
+           if( .not.rotate ) then
+              call stdlib_dlasq1( n, d, e, rwork, info )
+           ! if info equals 2, dqds didn't finish, try to finish
+              if( info /= 2 ) return
+              info = 0
+           end if
+           nm1 = n - 1
+           nm12 = nm1 + nm1
+           nm13 = nm12 + nm1
+           idir = 0
+           ! get machine constants
+           eps = stdlib_dlamch( 'EPSILON' )
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! if matrix lower bidiagonal, rotate to be upper bidiagonal
+           ! by applying givens rotations on the left
+           if( lower ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 rwork( i ) = cs
+                 rwork( nm1+i ) = sn
+              end do
+              ! update singular vectors if desired
+              if( nru>0 )call stdlib_zlasr( 'R', 'V', 'F', nru, n, rwork( 1 ), rwork( n ),u, ldu )
+                        
+              if( ncc>0 )call stdlib_zlasr( 'L', 'V', 'F', n, ncc, rwork( 1 ), rwork( n ),c, ldc )
+                        
+           end if
+           ! compute singular values to relative accuracy tol
+           ! (by setting tol to be negative, algorithm will compute
+           ! singular values to absolute accuracy abs(tol)*norm(input matrix))
+           tolmul = max( ten, min( hndrd, eps**meigth ) )
+           tol = tolmul*eps
+           ! compute approximate maximum, minimum singular values
+           smax = zero
+           do i = 1, n
+              smax = max( smax, abs( d( i ) ) )
+           end do
+           do i = 1, n - 1
+              smax = max( smax, abs( e( i ) ) )
+           end do
+           sminl = zero
+           if( tol>=zero ) then
+              ! relative accuracy desired
+              sminoa = abs( d( 1 ) )
+              if( sminoa==zero )go to 50
+              mu = sminoa
+              do i = 2, n
+                 mu = abs( d( i ) )*( mu / ( mu+abs( e( i-1 ) ) ) )
+                 sminoa = min( sminoa, mu )
+                 if( sminoa==zero )go to 50
+              end do
+              50 continue
+              sminoa = sminoa / sqrt( real( n,KIND=dp) )
+              thresh = max( tol*sminoa, maxitr*n*n*unfl )
+           else
+              ! absolute accuracy desired
+              thresh = max( abs( tol )*smax, maxitr*n*n*unfl )
+           end if
+           ! prepare for main iteration loop for the singular values
+           ! (maxit is the maximum number of passes through the inner
+           ! loop permitted before nonconvergence signalled.)
+           maxit = maxitr*n*n
+           iter = 0
+           oldll = -1
+           oldm = -1
+           ! m points to last element of unconverged part of matrix
+           m = n
+           ! begin main iteration loop
+           60 continue
+           ! check for convergence or exceeding iteration count
+           if( m<=1 )go to 160
+           if( iter>maxit )go to 200
+           ! find diagonal block of matrix to work on
+           if( tol<zero .and. abs( d( m ) )<=thresh )d( m ) = zero
+           smax = abs( d( m ) )
+           smin = smax
+           do lll = 1, m - 1
+              ll = m - lll
+              abss = abs( d( ll ) )
+              abse = abs( e( ll ) )
+              if( tol<zero .and. abss<=thresh )d( ll ) = zero
+              if( abse<=thresh )go to 80
+              smin = min( smin, abss )
+              smax = max( smax, abss, abse )
+           end do
+           ll = 0
+           go to 90
+           80 continue
+           e( ll ) = zero
+           ! matrix splits since e(ll) = 0
+           if( ll==m-1 ) then
+              ! convergence of bottom singular value, return to top of loop
+              m = m - 1
+              go to 60
+           end if
+           90 continue
+           ll = ll + 1
+           ! e(ll) through e(m-1) are nonzero, e(ll-1) is zero
+           if( ll==m-1 ) then
+              ! 2 by 2 block, handle separately
+              call stdlib_dlasv2( d( m-1 ), e( m-1 ), d( m ), sigmn, sigmx, sinr,cosr, sinl, cosl &
+                        )
+              d( m-1 ) = sigmx
+              e( m-1 ) = zero
+              d( m ) = sigmn
+              ! compute singular vectors, if desired
+              if( ncvt>0 )call stdlib_zdrot( ncvt, vt( m-1, 1 ), ldvt, vt( m, 1 ), ldvt,cosr, &
+                        sinr )
+              if( nru>0 )call stdlib_zdrot( nru, u( 1, m-1 ), 1, u( 1, m ), 1, cosl, sinl )
+                        
+              if( ncc>0 )call stdlib_zdrot( ncc, c( m-1, 1 ), ldc, c( m, 1 ), ldc, cosl,sinl )
+                        
+              m = m - 2
+              go to 60
+           end if
+           ! if working on new submatrix, choose shift direction
+           ! (from larger end diagonal element towards smaller)
+           if( ll>oldm .or. m<oldll ) then
+              if( abs( d( ll ) )>=abs( d( m ) ) ) then
+                 ! chase bulge from top (big end) to bottom (small end)
+                 idir = 1
+              else
+                 ! chase bulge from bottom (big end) to top (small end)
+                 idir = 2
+              end if
+           end if
+           ! apply convergence tests
+           if( idir==1 ) then
+              ! run convergence test in forward direction
+              ! first apply standard test to bottom of matrix
+              if( abs( e( m-1 ) )<=abs( tol )*abs( d( m ) ) .or.( tol<zero .and. abs( e( m-1 ) )&
+                        <=thresh ) ) then
+                 e( m-1 ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion forward
+                 mu = abs( d( ll ) )
+                 sminl = mu
+                 do lll = ll, m - 1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll+1 ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           else
+              ! run convergence test in backward direction
+              ! first apply standard test to top of matrix
+              if( abs( e( ll ) )<=abs( tol )*abs( d( ll ) ) .or.( tol<zero .and. abs( e( ll ) )&
+                        <=thresh ) ) then
+                 e( ll ) = zero
+                 go to 60
+              end if
+              if( tol>=zero ) then
+                 ! if relative accuracy desired,
+                 ! apply convergence criterion backward
+                 mu = abs( d( m ) )
+                 sminl = mu
+                 do lll = m - 1, ll, -1
+                    if( abs( e( lll ) )<=tol*mu ) then
+                       e( lll ) = zero
+                       go to 60
+                    end if
+                    mu = abs( d( lll ) )*( mu / ( mu+abs( e( lll ) ) ) )
+                    sminl = min( sminl, mu )
+                 end do
+              end if
+           end if
+           oldll = ll
+           oldm = m
+           ! compute shift.  first, test if shifting would ruin relative
+           ! accuracy, and if so set the shift to zero.
+           if( tol>=zero .and. n*tol*( sminl / smax )<=max( eps, hndrth*tol ) ) then
+              ! use a zero shift to avoid loss of relative accuracy
+              shift = zero
+           else
+              ! compute the shift from 2-by-2 block at end of matrix
+              if( idir==1 ) then
+                 sll = abs( d( ll ) )
+                 call stdlib_dlas2( d( m-1 ), e( m-1 ), d( m ), shift, r )
+              else
+                 sll = abs( d( m ) )
+                 call stdlib_dlas2( d( ll ), e( ll ), d( ll+1 ), shift, r )
+              end if
+              ! test if shift negligible, and if so set to zero
+              if( sll>zero ) then
+                 if( ( shift / sll )**2<eps )shift = zero
+              end if
+           end if
+           ! increment iteration count
+           iter = iter + m - ll
+           ! if shift = 0, do simplified qr iteration
+           if( shift==zero ) then
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = ll, m - 1
+                    call stdlib_dlartg( d( i )*cs, e( i ), cs, sn, r )
+                    if( i>ll )e( i-1 ) = oldsn*r
+                    call stdlib_dlartg( oldcs*r, d( i+1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll+1 ) = cs
+                    rwork( i-ll+1+nm1 ) = sn
+                    rwork( i-ll+1+nm12 ) = oldcs
+                    rwork( i-ll+1+nm13 ) = oldsn
+                 end do
+                 h = d( m )*cs
+                 d( m ) = h*oldcs
+                 e( m-1 ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_zlasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_zlasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_zlasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 cs = one
+                 oldcs = one
+                 do i = m, ll + 1, -1
+                    call stdlib_dlartg( d( i )*cs, e( i-1 ), cs, sn, r )
+                    if( i<m )e( i ) = oldsn*r
+                    call stdlib_dlartg( oldcs*r, d( i-1 )*sn, oldcs, oldsn, d( i ) )
+                    rwork( i-ll ) = cs
+                    rwork( i-ll+nm1 ) = -sn
+                    rwork( i-ll+nm12 ) = oldcs
+                    rwork( i-ll+nm13 ) = -oldsn
+                 end do
+                 h = d( ll )*cs
+                 d( ll ) = h*oldcs
+                 e( ll ) = h*oldsn
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_zlasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_zlasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_zlasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+              end if
+           else
+              ! use nonzero shift
+              if( idir==1 ) then
+                 ! chase bulge from top to bottom
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( ll ) )-shift )*( sign( one, d( ll ) )+shift / d( ll ) )
+                 g = e( ll )
+                 do i = ll, m - 1
+                    call stdlib_dlartg( f, g, cosr, sinr, r )
+                    if( i>ll )e( i-1 ) = r
+                    f = cosr*d( i ) + sinr*e( i )
+                    e( i ) = cosr*e( i ) - sinr*d( i )
+                    g = sinr*d( i+1 )
+                    d( i+1 ) = cosr*d( i+1 )
+                    call stdlib_dlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i ) + sinl*d( i+1 )
+                    d( i+1 ) = cosl*d( i+1 ) - sinl*e( i )
+                    if( i<m-1 ) then
+                       g = sinl*e( i+1 )
+                       e( i+1 ) = cosl*e( i+1 )
+                    end if
+                    rwork( i-ll+1 ) = cosr
+                    rwork( i-ll+1+nm1 ) = sinr
+                    rwork( i-ll+1+nm12 ) = cosl
+                    rwork( i-ll+1+nm13 ) = sinl
+                 end do
+                 e( m-1 ) = f
+                 ! update singular vectors
+                 if( ncvt>0 )call stdlib_zlasr( 'L', 'V', 'F', m-ll+1, ncvt, rwork( 1 ),rwork( n )&
+                           , vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_zlasr( 'R', 'V', 'F', nru, m-ll+1, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_zlasr( 'L', 'V', 'F', m-ll+1, ncc, rwork( nm12+1 ),rwork( &
+                           nm13+1 ), c( ll, 1 ), ldc )
+                 ! test convergence
+                 if( abs( e( m-1 ) )<=thresh )e( m-1 ) = zero
+              else
+                 ! chase bulge from bottom to top
+                 ! save cosines and sines for later singular vector updates
+                 f = ( abs( d( m ) )-shift )*( sign( one, d( m ) )+shift /d( m ) )
+                 g = e( m-1 )
+                 do i = m, ll + 1, -1
+                    call stdlib_dlartg( f, g, cosr, sinr, r )
+                    if( i<m )e( i ) = r
+                    f = cosr*d( i ) + sinr*e( i-1 )
+                    e( i-1 ) = cosr*e( i-1 ) - sinr*d( i )
+                    g = sinr*d( i-1 )
+                    d( i-1 ) = cosr*d( i-1 )
+                    call stdlib_dlartg( f, g, cosl, sinl, r )
+                    d( i ) = r
+                    f = cosl*e( i-1 ) + sinl*d( i-1 )
+                    d( i-1 ) = cosl*d( i-1 ) - sinl*e( i-1 )
+                    if( i>ll+1 ) then
+                       g = sinl*e( i-2 )
+                       e( i-2 ) = cosl*e( i-2 )
+                    end if
+                    rwork( i-ll ) = cosr
+                    rwork( i-ll+nm1 ) = -sinr
+                    rwork( i-ll+nm12 ) = cosl
+                    rwork( i-ll+nm13 ) = -sinl
+                 end do
+                 e( ll ) = f
+                 ! test convergence
+                 if( abs( e( ll ) )<=thresh )e( ll ) = zero
+                 ! update singular vectors if desired
+                 if( ncvt>0 )call stdlib_zlasr( 'L', 'V', 'B', m-ll+1, ncvt, rwork( nm12+1 ),&
+                           rwork( nm13+1 ), vt( ll, 1 ), ldvt )
+                 if( nru>0 )call stdlib_zlasr( 'R', 'V', 'B', nru, m-ll+1, rwork( 1 ),rwork( n ), &
+                           u( 1, ll ), ldu )
+                 if( ncc>0 )call stdlib_zlasr( 'L', 'V', 'B', m-ll+1, ncc, rwork( 1 ),rwork( n ), &
+                           c( ll, 1 ), ldc )
+              end if
+           end if
+           ! qr iteration finished, go back and check convergence
+           go to 60
+           ! all singular values converged, so make them positive
+           160 continue
+           do i = 1, n
+              if( d( i )<zero ) then
+                 d( i ) = -d( i )
+                 ! change sign of singular vectors, if desired
+                 if( ncvt>0 )call stdlib_zdscal( ncvt, negone, vt( i, 1 ), ldvt )
+              end if
+           end do
+           ! sort the singular values into decreasing order (insertion sort on
+           ! singular values, but only one transposition per singular vector)
+           do i = 1, n - 1
+              ! scan for smallest d(i)
+              isub = 1
+              smin = d( 1 )
+              do j = 2, n + 1 - i
+                 if( d( j )<=smin ) then
+                    isub = j
+                    smin = d( j )
+                 end if
+              end do
+              if( isub/=n+1-i ) then
+                 ! swap singular values and vectors
+                 d( isub ) = d( n+1-i )
+                 d( n+1-i ) = smin
+                 if( ncvt>0 )call stdlib_zswap( ncvt, vt( isub, 1 ), ldvt, vt( n+1-i, 1 ),ldvt )
+                           
+                 if( nru>0 )call stdlib_zswap( nru, u( 1, isub ), 1, u( 1, n+1-i ), 1 )
+                 if( ncc>0 )call stdlib_zswap( ncc, c( isub, 1 ), ldc, c( n+1-i, 1 ), ldc )
+                           
+              end if
+           end do
+           go to 220
+           ! maximum number of iterations exceeded, failure to converge
+           200 continue
+           info = 0
+           do i = 1, n - 1
+              if( e( i )/=zero )info = info + 1
+           end do
+           220 continue
+           return
+     end subroutine stdlib_zbdsqr
+
+     !> ZGBCON: estimates the reciprocal of the condition number of a complex
+     !> general band matrix A, in either the 1-norm or the infinity-norm,
+     !> using the LU factorization computed by ZGBTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_zgbcon( norm, n, kl, ku, ab, ldab, ipiv, anorm, rcond,work, rwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, onenrm
+           character :: normin
+           integer(ilp) :: ix, j, jp, kase, kase1, kd, lm
+           real(dp) :: ainvnm, scale, smlnum
+           complex(dp) :: t, zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kd = kl + ku + 1
+           lnoti = kl>0
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 if( lnoti ) then
+                    do j = 1, n - 1
+                       lm = min( kl, n-j )
+                       jp = ipiv( j )
+                       t = work( jp )
+                       if( jp/=j ) then
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                       call stdlib_zaxpy( lm, -t, ab( kd+1, j ), 1, work( j+1 ), 1 )
+                    end do
+                 end if
+                 ! multiply by inv(u).
+                 call stdlib_zlatbs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,kl+ku, ab, &
+                           ldab, work, scale, rwork, info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_zlatbs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, kl+ku, &
+                           ab, ldab, work, scale, rwork,info )
+                 ! multiply by inv(l**h).
+                 if( lnoti ) then
+                    do j = n - 1, 1, -1
+                       lm = min( kl, n-j )
+                       work( j ) = work( j ) - stdlib_zdotc( lm, ab( kd+1, j ), 1,work( j+1 ), 1 )
+                                 
+                       jp = ipiv( j )
+                       if( jp/=j ) then
+                          t = work( jp )
+                          work( jp ) = work( j )
+                          work( j ) = t
+                       end if
+                    end do
+                 end if
+              end if
+              ! divide x by 1/scale if doing so will not cause overflow.
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_izamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 40
+                 call stdlib_zdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           40 continue
+           return
+     end subroutine stdlib_zgbcon
+
+     !> ZGBTRF: computes an LU factorization of a complex m-by-n band matrix A
+     !> using partial pivoting with row interchanges.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zgbtrf( m, n, kl, ku, ab, ldab, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ii, ip, j, j2, j3, jb, jj, jm, jp, ju, k2, km, kv, nb, &
+                     nw
+           complex(dp) :: temp
+           ! Local Arrays 
+           complex(dp) :: work13(ldwork,nbmax), work31(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! kv is the number of superdiagonals in the factor u, allowing for
+           ! fill-in
+           kv = ku + kl
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( ldab<kl+kv+1 ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'ZGBTRF', ' ', m, n, kl, ku )
+           ! the block size must not exceed the limit set by the size of the
+           ! local arrays work13 and work31.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kl ) then
+              ! use unblocked code
+              call stdlib_zgbtf2( m, n, kl, ku, ab, ldab, ipiv, info )
+           else
+              ! use blocked code
+              ! czero the superdiagonal elements of the work array work13
+              do j = 1, nb
+                 do i = 1, j - 1
+                    work13( i, j ) = czero
+                 end do
+              end do
+              ! czero the subdiagonal elements of the work array work31
+              do j = 1, nb
+                 do i = j + 1, nb
+                    work31( i, j ) = czero
+                 end do
+              end do
+              ! gaussian elimination with partial pivoting
+              ! set fill-in elements in columns ku+2 to kv to czero
+              do j = ku + 2, min( kv, n )
+                 do i = kv - j + 2, kl
+                    ab( i, j ) = czero
+                 end do
+              end do
+              ! ju is the index of the last column affected by the current
+              ! stage of the factorization
+              ju = 1
+              loop_180: do j = 1, min( m, n ), nb
+                 jb = min( nb, min( m, n )-j+1 )
+                 ! the active part of the matrix is partitioned
+                    ! a11   a12   a13
+                    ! a21   a22   a23
+                    ! a31   a32   a33
+                 ! here a11, a21 and a31 denote the current block of jb columns
+                 ! which is about to be factorized. the number of rows in the
+                 ! partitioning are jb, i2, i3 respectively, and the numbers
+                 ! of columns are jb, j2, j3. the superdiagonal elements of a13
+                 ! and the subdiagonal elements of a31 lie outside the band.
+                 i2 = min( kl-jb, m-j-jb+1 )
+                 i3 = min( jb, m-j-kl+1 )
+                 ! j2 and j3 are computed after ju has been updated.
+                 ! factorize the current block of jb columns
+                 loop_80: do jj = j, j + jb - 1
+                    ! set fill-in elements in column jj+kv to czero
+                    if( jj+kv<=n ) then
+                       do i = 1, kl
+                          ab( i, jj+kv ) = czero
+                       end do
+                    end if
+                    ! find pivot and test for singularity. km is the number of
+                    ! subdiagonal elements in the current column.
+                    km = min( kl, m-jj )
+                    jp = stdlib_izamax( km+1, ab( kv+1, jj ), 1 )
+                    ipiv( jj ) = jp + jj - j
+                    if( ab( kv+jp, jj )/=czero ) then
+                       ju = max( ju, min( jj+ku+jp-1, n ) )
+                       if( jp/=1 ) then
+                          ! apply interchange to columns j to j+jb-1
+                          if( jp+jj-1<j+kl ) then
+                             call stdlib_zswap( jb, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j )&
+                                       , ldab-1 )
+                          else
+                             ! the interchange affects columns j to jj-1 of a31
+                             ! which are stored in the work array work31
+                             call stdlib_zswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-&
+                                       kl, 1 ), ldwork )
+                             call stdlib_zswap( j+jb-jj, ab( kv+1, jj ), ldab-1,ab( kv+jp, jj ), &
+                                       ldab-1 )
+                          end if
+                       end if
+                       ! compute multipliers
+                       call stdlib_zscal( km, cone / ab( kv+1, jj ), ab( kv+2, jj ),1 )
+                       ! update trailing submatrix within the band and within
+                       ! the current block. jm is the index of the last column
+                       ! which needs to be updated.
+                       jm = min( ju, j+jb-1 )
+                       if( jm>jj )call stdlib_zgeru( km, jm-jj, -cone, ab( kv+2, jj ), 1,ab( kv, &
+                                 jj+1 ), ldab-1,ab( kv+1, jj+1 ), ldab-1 )
+                    else
+                       ! if pivot is czero, set info to the index of the pivot
+                       ! unless a czero pivot has already been found.
+                       if( info==0 )info = jj
+                    end if
+                    ! copy current column of a31 into the work array work31
+                    nw = min( jj-j+1, i3 )
+                    if( nw>0 )call stdlib_zcopy( nw, ab( kv+kl+1-jj+j, jj ), 1,work31( 1, jj-j+1 )&
+                              , 1 )
+                 end do loop_80
+                 if( j+jb<=n ) then
+                    ! apply the row interchanges to the other blocks.
+                    j2 = min( ju-j+1, kv ) - jb
+                    j3 = max( 0, ju-j-kv+1 )
+                    ! use stdlib_zlaswp to apply the row interchanges to a12, a22, and
+                    ! a32.
+                    call stdlib_zlaswp( j2, ab( kv+1-jb, j+jb ), ldab-1, 1, jb,ipiv( j ), 1 )
+                              
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                    ! apply the row interchanges to a13, a23, and a33
+                    ! columnwise.
+                    k2 = j - 1 + jb + j2
+                    do i = 1, j3
+                       jj = k2 + i
+                       do ii = j + i - 1, j + jb - 1
+                          ip = ipiv( ii )
+                          if( ip/=ii ) then
+                             temp = ab( kv+1+ii-jj, jj )
+                             ab( kv+1+ii-jj, jj ) = ab( kv+1+ip-jj, jj )
+                             ab( kv+1+ip-jj, jj ) = temp
+                          end if
+                       end do
+                    end do
+                    ! update the relevant part of the trailing submatrix
+                    if( j2>0 ) then
+                       ! update a12
+                       call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j2, cone, &
+                                 ab( kv+1, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1 )
+                       if( i2>0 ) then
+                          ! update a22
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j2,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+1, j+jb )&
+                                     , ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! update a32
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j2,jb, -cone, &
+                          work31, ldwork,ab( kv+1-jb, j+jb ), ldab-1, cone,ab( kv+kl+1-jb, j+jb ),&
+                                     ldab-1 )
+                       end if
+                    end if
+                    if( j3>0 ) then
+                       ! copy the lower triangle of a13 into the work array
+                       ! work13
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             work13( ii, jj ) = ab( ii-jj+1, jj+j+kv-1 )
+                          end do
+                       end do
+                       ! update a13 in the work array
+                       call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT',jb, j3, cone, &
+                                 ab( kv+1, j ), ldab-1,work13, ldwork )
+                       if( i2>0 ) then
+                          ! update a23
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i2, j3,jb, -cone, ab(&
+                           kv+1+jb, j ), ldab-1,work13, ldwork, cone, ab( 1+jb, j+kv ),ldab-1 )
+                                     
+                       end if
+                       if( i3>0 ) then
+                          ! update a33
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', i3, j3,jb, -cone, &
+                                    work31, ldwork, work13,ldwork, cone, ab( 1+kl, j+kv ), ldab-1 )
+                       end if
+                       ! copy the lower triangle of a13 back into place
+                       do jj = 1, j3
+                          do ii = jj, jb
+                             ab( ii-jj+1, jj+j+kv-1 ) = work13( ii, jj )
+                          end do
+                       end do
+                    end if
+                 else
+                    ! adjust the pivot indices.
+                    do i = j, j + jb - 1
+                       ipiv( i ) = ipiv( i ) + j - 1
+                    end do
+                 end if
+                 ! partially undo the interchanges in the current block to
+                 ! restore the upper triangular form of a31 and copy the upper
+                 ! triangle of a31 back into place
+                 do jj = j + jb - 1, j, -1
+                    jp = ipiv( jj ) - jj + 1
+                    if( jp/=1 ) then
+                       ! apply interchange to columns j to jj-1
+                       if( jp+jj-1<j+kl ) then
+                          ! the interchange does not affect a31
+                          call stdlib_zswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,ab( kv+jp+jj-j, j ),&
+                                     ldab-1 )
+                       else
+                          ! the interchange does affect a31
+                          call stdlib_zswap( jj-j, ab( kv+1+jj-j, j ), ldab-1,work31( jp+jj-j-kl, &
+                                    1 ), ldwork )
+                       end if
+                    end if
+                    ! copy the current column of a31 back into place
+                    nw = min( i3, jj-j+1 )
+                    if( nw>0 )call stdlib_zcopy( nw, work31( 1, jj-j+1 ), 1,ab( kv+kl+1-jj+j, jj )&
+                              , 1 )
+                 end do
+              end do loop_180
+           end if
+           return
+     end subroutine stdlib_zgbtrf
+
+     !> ZGBTRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general band matrix A using the LU factorization computed
+     !> by ZGBTRF.
+
+     pure subroutine stdlib_zgbtrs( trans, n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lnoti, notran
+           integer(ilp) :: i, j, kd, l, lm
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<( 2*kl+ku+1 ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           kd = ku + kl + 1
+           lnoti = kl>0
+           if( notran ) then
+              ! solve  a*x = b.
+              ! solve l*x = b, overwriting b with x.
+              ! l is represented as a product of permutations and unit lower
+              ! triangular matrices l = p(1) * l(1) * ... * p(n-1) * l(n-1),
+              ! where each transformation l(i) is a rank-cone modification of
+              ! the identity matrix.
+              if( lnoti ) then
+                 do j = 1, n - 1
+                    lm = min( kl, n-j )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_zswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                    call stdlib_zgeru( lm, nrhs, -cone, ab( kd+1, j ), 1, b( j, 1 ),ldb, b( j+1, &
+                              1 ), ldb )
+                 end do
+              end if
+              do i = 1, nrhs
+                 ! solve u*x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, kl+ku,ab, ldab, b( 1, &
+                           i ), 1 )
+              end do
+           else if( stdlib_lsame( trans, 'T' ) ) then
+              ! solve a**t * x = b.
+              do i = 1, nrhs
+                 ! solve u**t * x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'UPPER', 'TRANSPOSE', 'NON-UNIT', n, kl+ku, ab,ldab, b( 1, i )&
+                           , 1 )
+              end do
+              ! solve l**t * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_zgemv( 'TRANSPOSE', lm, nrhs, -cone, b( j+1, 1 ),ldb, ab( kd+1, j &
+                              ), 1, cone, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_zswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a**h * x = b.
+              do i = 1, nrhs
+                 ! solve u**h * x = b, overwriting b with x.
+                 call stdlib_ztbsv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT', n,kl+ku, ab, ldab,&
+                            b( 1, i ), 1 )
+              end do
+              ! solve l**h * x = b, overwriting b with x.
+              if( lnoti ) then
+                 do j = n - 1, 1, -1
+                    lm = min( kl, n-j )
+                    call stdlib_zlacgv( nrhs, b( j, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', lm, nrhs, -cone,b( j+1, 1 ), ldb, &
+                              ab( kd+1, j ), 1, cone,b( j, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( j, 1 ), ldb )
+                    l = ipiv( j )
+                    if( l/=j )call stdlib_zswap( nrhs, b( l, 1 ), ldb, b( j, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zgbtrs
+
+     !> ZGEBD2: reduces a complex general m by n matrix A to upper or lower
+     !> real bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_zgebd2( m, n, a, lda, d, e, tauq, taup, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'ZGEBD2', -info )
+              return
+           end if
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              do i = 1, n
+                 ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_zlarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=dp)
+                 a( i, i ) = cone
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 if( i<n )call stdlib_zlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tauq( i ) ), &
+                           a( i, i+1 ), lda, work )
+                 a( i, i ) = d( i )
+                 if( i<n ) then
+                    ! generate elementary reflector g(i) to annihilate
+                    ! a(i,i+2:n)
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    alpha = a( i, i+1 )
+                    call stdlib_zlarfg( n-i, alpha, a( i, min( i+2, n ) ), lda,taup( i ) )
+                    e( i ) = real( alpha,KIND=dp)
+                    a( i, i+1 ) = cone
+                    ! apply g(i) to a(i+1:m,i+1:n) from the right
+                    call stdlib_zlarf( 'RIGHT', m-i, n-i, a( i, i+1 ), lda,taup( i ), a( i+1, i+1 &
+                              ), lda, work )
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    a( i, i+1 ) = e( i )
+                 else
+                    taup( i ) = czero
+                 end if
+              end do
+           else
+              ! reduce to lower bidiagonal form
+              do i = 1, m
+                 ! generate elementary reflector g(i) to annihilate a(i,i+1:n)
+                 call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+                 alpha = a( i, i )
+                 call stdlib_zlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=dp)
+                 a( i, i ) = cone
+                 ! apply g(i) to a(i+1:m,i:n) from the right
+                 if( i<m )call stdlib_zlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda,taup( i ), a( i+&
+                           1, i ), lda, work )
+                 call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+                 a( i, i ) = d( i )
+                 if( i<m ) then
+                    ! generate elementary reflector h(i) to annihilate
+                    ! a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_zlarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=dp)
+                    a( i+1, i ) = cone
+                    ! apply h(i)**h to a(i+1:m,i+1:n) from the left
+                    call stdlib_zlarf( 'LEFT', m-i, n-i, a( i+1, i ), 1,conjg( tauq( i ) ), a( i+&
+                              1, i+1 ), lda,work )
+                    a( i+1, i ) = e( i )
+                 else
+                    tauq( i ) = czero
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zgebd2
+
+     !> ZGECON: estimates the reciprocal of the condition number of a general
+     !> complex matrix A, in either the 1-norm or the infinity-norm, using
+     !> the LU factorization computed by ZGETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     pure subroutine stdlib_zgecon( norm, n, a, lda, anorm, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, scale, sl, smlnum, su
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+           ! estimate the norm of inv(a).
+           ainvnm = zero
+           normin = 'N'
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(l).
+                 call stdlib_zlatrs( 'LOWER', 'NO TRANSPOSE', 'UNIT', normin, n, a,lda, work, sl, &
+                           rwork, info )
+                 ! multiply by inv(u).
+                 call stdlib_zlatrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', normin, n,a, lda, work, &
+                           su, rwork( n+1 ), info )
+              else
+                 ! multiply by inv(u**h).
+                 call stdlib_zlatrs( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',normin, n, a, lda,&
+                            work, su, rwork( n+1 ),info )
+                 ! multiply by inv(l**h).
+                 call stdlib_zlatrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT', normin,n, a, lda, &
+                           work, sl, rwork, info )
+              end if
+              ! divide x by 1/(sl*su) if doing so will not cause overflow.
+              scale = sl*su
+              normin = 'Y'
+              if( scale/=one ) then
+                 ix = stdlib_izamax( n, work, 1 )
+                 if( scale<cabs1( work( ix ) )*smlnum .or. scale==zero )go to 20
+                 call stdlib_zdrscl( n, scale, work, 1 )
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           20 continue
+           return
+     end subroutine stdlib_zgecon
+
+     !> ZGEHD2: reduces a complex general matrix A to upper Hessenberg form H
+     !> by a unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_zgehd2( n, ilo, ihi, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEHD2', -info )
+              return
+           end if
+           do i = ilo, ihi - 1
+              ! compute elementary reflector h(i) to annihilate a(i+2:ihi,i)
+              alpha = a( i+1, i )
+              call stdlib_zlarfg( ihi-i, alpha, a( min( i+2, n ), i ), 1, tau( i ) )
+              a( i+1, i ) = cone
+              ! apply h(i) to a(1:ihi,i+1:ihi) from the right
+              call stdlib_zlarf( 'RIGHT', ihi, ihi-i, a( i+1, i ), 1, tau( i ),a( 1, i+1 ), lda, &
+                        work )
+              ! apply h(i)**h to a(i+1:ihi,i+1:n) from the left
+              call stdlib_zlarf( 'LEFT', ihi-i, n-i, a( i+1, i ), 1,conjg( tau( i ) ), a( i+1, i+&
+                        1 ), lda, work )
+              a( i+1, i ) = alpha
+           end do
+           return
+     end subroutine stdlib_zgehd2
+
+     !> ZGELQ2: computes an LQ factorization of a complex m-by-n matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a n-by-n orthogonal matrix;
+     !> L is a lower-triangular m-by-m matrix;
+     !> 0 is a m-by-(n-m) zero matrix, if m < n.
+
+     pure subroutine stdlib_zgelq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i,i+1:n)
+              call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+              alpha = a( i, i )
+              call stdlib_zlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,tau( i ) )
+              if( i<m ) then
+                 ! apply h(i) to a(i+1:m,i:n) from the right
+                 a( i, i ) = cone
+                 call stdlib_zlarf( 'RIGHT', m-i, n-i+1, a( i, i ), lda, tau( i ),a( i+1, i ), &
+                           lda, work )
+              end if
+              a( i, i ) = alpha
+              call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+           end do
+           return
+     end subroutine stdlib_zgelq2
+
+     !> ZGELQF: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_zgelqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1, -1 )
+           lwkopt = m*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGELQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGELQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the lq factorization of the current block
+                 ! a(i:i+ib-1,i:n)
+                 call stdlib_zgelq2( ib, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=m ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_zlarft( 'FORWARD', 'ROWWISE', n-i+1, ib, a( i, i ),lda, tau( i ), &
+                              work, ldwork )
+                    ! apply h to a(i+ib:m,i:n) from the right
+                    call stdlib_zlarfb( 'RIGHT', 'NO TRANSPOSE', 'FORWARD','ROWWISE', m-i-ib+1, n-&
+                    i+1, ib, a( i, i ),lda, work, ldwork, a( i+ib, i ), lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_zgelq2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgelqf
+
+     !> ZGELQT3: recursively computes a LQ factorization of a complex M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_zgelqt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, m1, m2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( m < 0 ) then
+              info = -1
+           else if( n < m ) then
+              info = -2
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, m ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQT3', -info )
+              return
+           end if
+           if( m==1 ) then
+              ! compute householder transform when m=1
+              call stdlib_zlarfg( n, a(1,1), a( 1, min( 2, n ) ), lda, t(1,1) )
+              t(1,1)=conjg(t(1,1))
+           else
+              ! otherwise, split a into blocks...
+              m1 = m/2
+              m2 = m-m1
+              i1 = min( m1+1, m )
+              j1 = min( m+1, n )
+              ! compute a(1:m1,1:n) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_zgelqt3( m1, n, a, lda, t, ldt, iinfo )
+              ! compute a(j1:m,1:n) =  a(j1:m,1:n) q1^h [workspace: t(1:n1,j1:n)]
+              do i=1,m2
+                 do j=1,m1
+                    t(  i+m1, j ) = a( i+m1, j )
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', 'C', 'U', m2, m1, cone,a, lda, t( i1, 1 ), ldt )
+                        
+              call stdlib_zgemm( 'N', 'C', m2, m1, n-m1, cone, a( i1, i1 ), lda,a( 1, i1 ), lda, &
+                        cone, t( i1, 1 ), ldt)
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', m2, m1, cone,t, ldt, t( i1, 1 ), ldt )
+                        
+              call stdlib_zgemm( 'N', 'N', m2, n-m1, m1, -cone, t( i1, 1 ), ldt,a( 1, i1 ), lda, &
+                        cone, a( i1, i1 ), lda )
+              call stdlib_ztrmm( 'R', 'U', 'N', 'U', m2, m1 , cone,a, lda, t( i1, 1 ), ldt )
+                        
+              do i=1,m2
+                 do j=1,m1
+                    a(  i+m1, j ) = a( i+m1, j ) - t( i+m1, j )
+                    t( i+m1, j )= czero
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_zgelqt3( m2, n-m1, a( i1, i1 ), lda,t( i1, i1 ), ldt, iinfo )
+              ! compute t3 = t(j1:n1,1:n) = -t1 y1^h y2 t2
+              do i=1,m2
+                 do j=1,m1
+                    t( j, i+m1  ) = (a( j, i+m1 ))
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'U', 'C', 'U', m1, m2, cone,a( i1, i1 ), lda, t( 1, i1 ), &
+                        ldt )
+              call stdlib_zgemm( 'N', 'C', m1, m2, n-m, cone, a( 1, j1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, i1 ), ldt )
+              call stdlib_ztrmm( 'L', 'U', 'N', 'N', m1, m2, -cone, t, ldt,t( 1, i1 ), ldt )
+                        
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', m1, m2, cone,t( i1, i1 ), ldt, t( 1, i1 ), &
+                        ldt )
+              ! y = (y1,y2); l = [ l1            0  ];  t = [t1 t3]
+                               ! [ a(1:n1,j1:n)  l2 ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_zgelqt3
+
+     !> ZGEMLQT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':   Q**H C            C Q**H
+     !> where Q is a complex unitary matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by ZGELQT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_zgemlqt( side, trans, m, n, k, mb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, mb, ldt
+           ! Array Arguments 
+           complex(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( mb<1 .or. (mb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, k ) ) then
+               info = -8
+           else if( ldt<mb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_zlarfb( 'L', 'C', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_zlarfb( 'R', 'N', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_zlarfb( 'L', 'N', 'F', 'R', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 call stdlib_zlarfb( 'R', 'C', 'F', 'R', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_zgemlqt
+
+     !> ZGEMQRT: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q C            C Q
+     !> TRANS = 'C':    Q**H C            C Q**H
+     !> where Q is a complex orthogonal matrix defined as the product of K
+     !> elementary reflectors:
+     !> Q = H(1) H(2) . . . H(K) = I - V T V**H
+     !> generated using the compact WY representation as returned by ZGEQRT.
+     !> Q is of order M if SIDE = 'L' and of order N  if SIDE = 'R'.
+
+     pure subroutine stdlib_zgemqrt( side, trans, m, n, k, nb, v, ldv, t, ldt,c, ldc, work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, ldc, m, n, nb, ldt
+           ! Array Arguments 
+           complex(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, ldwork, kf, q
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( left ) then
+              ldwork = max( 1, n )
+              q = m
+           else if ( right ) then
+              ldwork = max( 1, m )
+              q = n
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 .or. k>q ) then
+              info = -5
+           else if( nb<1 .or. (nb>k .and. k>0)) then
+              info = -6
+           else if( ldv<max( 1, q ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           else if( ldc<max( 1, m ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_zlarfb( 'L', 'C', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_zlarfb( 'R', 'N', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_zlarfb( 'L', 'N', 'F', 'C', m-i+1, n, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( i, 1 ), ldc, work, ldwork )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 call stdlib_zlarfb( 'R', 'C', 'F', 'C', m, n-i+1, ib,v( i, i ), ldv, t( 1, i ), &
+                           ldt,c( 1, i ), ldc, work, ldwork )
+              end do
+           end if
+           return
+     end subroutine stdlib_zgemqrt
+
+     !> ZGEQL2: computes a QL factorization of a complex m by n matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_zgeql2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQL2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(1:m-k+i-1,n-k+i)
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_zlarfg( m-k+i, alpha, a( 1, n-k+i ), 1, tau( i ) )
+              ! apply h(i)**h to a(1:m-k+i,1:n-k+i-1) from the left
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_zlarf( 'LEFT', m-k+i, n-k+i-1, a( 1, n-k+i ), 1,conjg( tau( i ) ), a, &
+                        lda, work )
+              a( m-k+i, n-k+i ) = alpha
+           end do
+           return
+     end subroutine stdlib_zgeql2
+
+     !> ZGEQLF: computes a QL factorization of a complex M-by-N matrix A:
+     !> A = Q * L.
+
+     pure subroutine stdlib_zgeqlf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGEQLF', ' ', m, n, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, n ) .and. .not.lquery ) then
+                 info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQLF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQLF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQLF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk columns are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the ql factorization of the current block
+                 ! a(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1)
+                 call stdlib_zgeql2( m-k+i+ib-1, ib, a( 1, n-k+i ), lda, tau( i ),work, iinfo )
+                           
+                 if( n-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_zlarft( 'BACKWARD', 'COLUMNWISE', m-k+i+ib-1, ib,a( 1, n-k+i ), &
+                              lda, tau( i ), work, ldwork )
+                    ! apply h**h to a(1:m-k+i+ib-1,1:n-k+i-1) from the left
+                    call stdlib_zlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'BACKWARD','COLUMNWISE', m-&
+                    k+i+ib-1, n-k+i-1, ib,a( 1, n-k+i ), lda, work, ldwork, a, lda,work( ib+1 ), &
+                              ldwork )
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_zgeql2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgeqlf
+
+     !> ZGEQR2: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     pure subroutine stdlib_zgeqr2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQR2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_zlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_zlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_zgeqr2
+
+     !> ZGEQR2P: computes a QR factorization of a complex m-by-n matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a m-by-m orthogonal matrix;
+     !> R is an upper-triangular n-by-n matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (m-n)-by-n zero matrix, if m > n.
+
+     subroutine stdlib_zgeqr2p( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQR2P', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elementary reflector h(i) to annihilate a(i+1:m,i)
+              call stdlib_zlarfgp( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,tau( i ) )
+              if( i<n ) then
+                 ! apply h(i)**h to a(i:m,i+1:n) from the left
+                 alpha = a( i, i )
+                 a( i, i ) = cone
+                 call stdlib_zlarf( 'LEFT', m-i+1, n-i, a( i, i ), 1,conjg( tau( i ) ), a( i, i+1 &
+                           ), lda, work )
+                 a( i, i ) = alpha
+              end if
+           end do
+           return
+     end subroutine stdlib_zgeqr2p
+
+     !> ZGEQRF: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_zgeqrf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           k = min( m, n )
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( .not.lquery ) then
+              if( lwork<=0 .or. ( m>0 .and. lwork<max( 1, n ) ) )info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRF', -info )
+              return
+           else if( lquery ) then
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_zgeqr2( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_zlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_zlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_zgeqr2( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgeqrf
+
+     !> ZGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix with nonnegative diagonal
+     !> entries;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     subroutine stdlib_zgeqrfp( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRFP', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nbmin = 2
+           nx = 0
+           iws = n
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGEQRF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEQRF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially
+              do i = 1, k - nx, nb
+                 ib = min( k-i+1, nb )
+                 ! compute the qr factorization of the current block
+                 ! a(i:m,i:i+ib-1)
+                 call stdlib_zgeqr2p( m-i+1, ib, a( i, i ), lda, tau( i ), work,iinfo )
+                 if( i+ib<=n ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i) h(i+1) . . . h(i+ib-1)
+                    call stdlib_zlarft( 'FORWARD', 'COLUMNWISE', m-i+1, ib,a( i, i ), lda, tau( i &
+                              ), work, ldwork )
+                    ! apply h**h to a(i:m,i+ib:n) from the left
+                    call stdlib_zlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE', m-&
+                    i+1, n-i-ib+1, ib,a( i, i ), lda, work, ldwork, a( i, i+ib ),lda, work( ib+1 )&
+                              , ldwork )
+                 end if
+              end do
+           else
+              i = 1
+           end if
+           ! use unblocked code to factor the last or only block.
+           if( i<=k )call stdlib_zgeqr2p( m-i+1, n-i+1, a( i, i ), lda, tau( i ), work,iinfo )
+                     
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgeqrfp
+
+     !> ZGEQRT2: computes a QR factorization of a complex M-by-N matrix A,
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_zgeqrt2( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: aii, alpha
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( n<0 ) then
+              info = -2
+           else if( m<n ) then
+              info = -1
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = 1, k
+              ! generate elem. refl. h(i) to annihilate a(i+1:m,i), tau(i) -> t(i,1)
+              call stdlib_zlarfg( m-i+1, a( i, i ), a( min( i+1, m ), i ), 1,t( i, 1 ) )
+              if( i<n ) then
+                 ! apply h(i) to a(i:m,i+1:n) from the left
+                 aii = a( i, i )
+                 a( i, i ) = cone
+                 ! w(1:n-i) := a(i:m,i+1:n)^h * a(i:m,i) [w = t(:,n)]
+                 call stdlib_zgemv( 'C',m-i+1, n-i, cone, a( i, i+1 ), lda,a( i, i ), 1, czero, t(&
+                            1, n ), 1 )
+                 ! a(i:m,i+1:n) = a(i:m,i+1:n) + alpha*a(i:m,i)*w(1:n-1)^h
+                 alpha = -conjg(t( i, 1 ))
+                 call stdlib_zgerc( m-i+1, n-i, alpha, a( i, i ), 1,t( 1, n ), 1, a( i, i+1 ), &
+                           lda )
+                 a( i, i ) = aii
+              end if
+           end do
+           do i = 2, n
+              aii = a( i, i )
+              a( i, i ) = cone
+              ! t(1:i-1,i) := alpha * a(i:m,1:i-1)**h * a(i:m,i)
+              alpha = -t( i, 1 )
+              call stdlib_zgemv( 'C', m-i+1, i-1, alpha, a( i, 1 ), lda,a( i, i ), 1, czero, t( 1,&
+                         i ), 1 )
+              a( i, i ) = aii
+              ! t(1:i-1,i) := t(1:i-1,1:i-1) * t(1:i-1,i)
+              call stdlib_ztrmv( 'U', 'N', 'N', i-1, t, ldt, t( 1, i ), 1 )
+                 ! t(i,i) = tau(i)
+                 t( i, i ) = t( i, 1 )
+                 t( i, 1) = czero
+           end do
+     end subroutine stdlib_zgeqrt2
+
+     !> ZGEQRT3: recursively computes a QR factorization of a complex M-by-N
+     !> matrix A, using the compact WY representation of Q.
+     !> Based on the algorithm of Elmroth and Gustavson,
+     !> IBM J. Res. Develop. Vol 44 No. 4 July 2000.
+
+     pure recursive subroutine stdlib_zgeqrt3( m, n, a, lda, t, ldt, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, ldt
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, j, j1, n1, n2, iinfo
+           ! Executable Statements 
+           info = 0
+           if( n < 0 ) then
+              info = -2
+           else if( m < n ) then
+              info = -1
+           else if( lda < max( 1, m ) ) then
+              info = -4
+           else if( ldt < max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT3', -info )
+              return
+           end if
+           if( n==1 ) then
+              ! compute householder transform when n=1
+              call stdlib_zlarfg( m, a(1,1), a( min( 2, m ), 1 ), 1, t(1,1) )
+           else
+              ! otherwise, split a into blocks...
+              n1 = n/2
+              n2 = n-n1
+              j1 = min( n1+1, n )
+              i1 = min( n+1, m )
+              ! compute a(1:m,1:n1) <- (y1,r1,t1), where q1 = i - y1 t1 y1^h
+              call stdlib_zgeqrt3( m, n1, a, lda, t, ldt, iinfo )
+              ! compute a(1:m,j1:n) = q1^h a(1:m,j1:n) [workspace: t(1:n1,j1:n)]
+              do j=1,n2
+                 do i=1,n1
+                    t( i, j+n1 ) = a( i, j+n1 )
+                 end do
+              end do
+              call stdlib_ztrmm( 'L', 'L', 'C', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              call stdlib_zgemm( 'C', 'N', n1, n2, m-n1, cone, a( j1, 1 ), lda,a( j1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt)
+              call stdlib_ztrmm( 'L', 'U', 'C', 'N', n1, n2, cone,t, ldt, t( 1, j1 ), ldt )
+                        
+              call stdlib_zgemm( 'N', 'N', m-n1, n2, n1, -cone, a( j1, 1 ), lda,t( 1, j1 ), ldt, &
+                        cone, a( j1, j1 ), lda )
+              call stdlib_ztrmm( 'L', 'L', 'N', 'U', n1, n2, cone,a, lda, t( 1, j1 ), ldt )
+                        
+              do j=1,n2
+                 do i=1,n1
+                    a( i, j+n1 ) = a( i, j+n1 ) - t( i, j+n1 )
+                 end do
+              end do
+              ! compute a(j1:m,j1:n) <- (y2,r2,t2) where q2 = i - y2 t2 y2^h
+              call stdlib_zgeqrt3( m-n1, n2, a( j1, j1 ), lda,t( j1, j1 ), ldt, iinfo )
+              ! compute t3 = t(1:n1,j1:n) = -t1 y1^h y2 t2
+              do i=1,n1
+                 do j=1,n2
+                    t( i, j+n1 ) = conjg(a( j+n1, i ))
+                 end do
+              end do
+              call stdlib_ztrmm( 'R', 'L', 'N', 'U', n1, n2, cone,a( j1, j1 ), lda, t( 1, j1 ), &
+                        ldt )
+              call stdlib_zgemm( 'C', 'N', n1, n2, m-n, cone, a( i1, 1 ), lda,a( i1, j1 ), lda, &
+                        cone, t( 1, j1 ), ldt )
+              call stdlib_ztrmm( 'L', 'U', 'N', 'N', n1, n2, -cone, t, ldt,t( 1, j1 ), ldt )
+                        
+              call stdlib_ztrmm( 'R', 'U', 'N', 'N', n1, n2, cone,t( j1, j1 ), ldt, t( 1, j1 ), &
+                        ldt )
+              ! y = (y1,y2); r = [ r1  a(1:n1,j1:n) ];  t = [t1 t3]
+                               ! [  0        r2     ]       [ 0 t2]
+           end if
+           return
+     end subroutine stdlib_zgeqrt3
+
+     !> ZGERQ2: computes an RQ factorization of a complex m by n matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_zgerq2( m, n, a, lda, tau, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, k
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERQ2', -info )
+              return
+           end if
+           k = min( m, n )
+           do i = k, 1, -1
+              ! generate elementary reflector h(i) to annihilate
+              ! a(m-k+i,1:n-k+i-1)
+              call stdlib_zlacgv( n-k+i, a( m-k+i, 1 ), lda )
+              alpha = a( m-k+i, n-k+i )
+              call stdlib_zlarfg( n-k+i, alpha, a( m-k+i, 1 ), lda, tau( i ) )
+              ! apply h(i) to a(1:m-k+i-1,1:n-k+i) from the right
+              a( m-k+i, n-k+i ) = cone
+              call stdlib_zlarf( 'RIGHT', m-k+i-1, n-k+i, a( m-k+i, 1 ), lda,tau( i ), a, lda, &
+                        work )
+              a( m-k+i, n-k+i ) = alpha
+              call stdlib_zlacgv( n-k+i-1, a( m-k+i, 1 ), lda )
+           end do
+           return
+     end subroutine stdlib_zgerq2
+
+     !> ZGERQF: computes an RQ factorization of a complex M-by-N matrix A:
+     !> A = R * Q.
+
+     pure subroutine stdlib_zgerqf( m, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iws, k, ki, kk, ldwork, lwkopt, mu, nb, nbmin, nu, &
+                     nx
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              k = min( m, n )
+              if( k==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 lwkopt = m*nb
+              end if
+              work( 1 ) = lwkopt
+              if ( .not.lquery ) then
+                 if( lwork<=0 .or. ( n>0 .and. lwork<max( 1, m ) ) )info = -7
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( k==0 ) then
+              return
+           end if
+           nbmin = 2
+           nx = 1
+           iws = m
+           if( nb>1 .and. nb<k ) then
+              ! determine when to cross over from blocked to unblocked code.
+              nx = max( 0, stdlib_ilaenv( 3, 'ZGERQF', ' ', m, n, -1, -1 ) )
+              if( nx<k ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = m
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  reduce nb and
+                    ! determine the minimum value of nb.
+                    nb = lwork / ldwork
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGERQF', ' ', m, n, -1,-1 ) )
+                 end if
+              end if
+           end if
+           if( nb>=nbmin .and. nb<k .and. nx<k ) then
+              ! use blocked code initially.
+              ! the last kk rows are handled by the block method.
+              ki = ( ( k-nx-1 ) / nb )*nb
+              kk = min( k, ki+nb )
+              do i = k - kk + ki + 1, k - kk + 1, -nb
+                 ib = min( k-i+1, nb )
+                 ! compute the rq factorization of the current block
+                 ! a(m-k+i:m-k+i+ib-1,1:n-k+i+ib-1)
+                 call stdlib_zgerq2( ib, n-k+i+ib-1, a( m-k+i, 1 ), lda, tau( i ),work, iinfo )
+                           
+                 if( m-k+i>1 ) then
+                    ! form the triangular factor of the block reflector
+                    ! h = h(i+ib-1) . . . h(i+1) h(i)
+                    call stdlib_zlarft( 'BACKWARD', 'ROWWISE', n-k+i+ib-1, ib,a( m-k+i, 1 ), lda, &
+                              tau( i ), work, ldwork )
+                    ! apply h to a(1:m-k+i-1,1:n-k+i+ib-1) from the right
+                    call stdlib_zlarfb( 'RIGHT', 'NO TRANSPOSE', 'BACKWARD','ROWWISE', m-k+i-1, n-&
+                    k+i+ib-1, ib,a( m-k+i, 1 ), lda, work, ldwork, a, lda,work( ib+1 ), ldwork )
+                              
+                 end if
+              end do
+              mu = m - k + i + nb - 1
+              nu = n - k + i + nb - 1
+           else
+              mu = m
+              nu = n
+           end if
+           ! use unblocked code to factor the last or only block
+           if( mu>0 .and. nu>0 )call stdlib_zgerq2( mu, nu, a, lda, tau, work, iinfo )
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgerqf
+
+     !> ZGESC2: solves a system of linear equations
+     !> A * X = scale* RHS
+     !> with a general N-by-N matrix A using the LU factorization with
+     !> complete pivoting computed by ZGETC2.
+
+     pure subroutine stdlib_zgesc2( n, a, lda, rhs, ipiv, jpiv, scale )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: rhs(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: bignum, eps, smlnum
+           complex(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx
+           ! Executable Statements 
+           ! set constant to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! apply permutations ipiv to rhs
+           call stdlib_zlaswp( 1, rhs, lda, 1, n-1, ipiv, 1 )
+           ! solve for l part
+           do i = 1, n - 1
+              do j = i + 1, n
+                 rhs( j ) = rhs( j ) - a( j, i )*rhs( i )
+              end do
+           end do
+           ! solve for u part
+           scale = one
+           ! check for scaling
+           i = stdlib_izamax( n, rhs, 1 )
+           if( two*smlnum*abs( rhs( i ) )>abs( a( n, n ) ) ) then
+              temp = cmplx( one / two, zero,KIND=dp) / abs( rhs( i ) )
+              call stdlib_zscal( n, temp, rhs( 1 ), 1 )
+              scale = scale*real( temp,KIND=dp)
+           end if
+           do i = n, 1, -1
+              temp = cmplx( one, zero,KIND=dp) / a( i, i )
+              rhs( i ) = rhs( i )*temp
+              do j = i + 1, n
+                 rhs( i ) = rhs( i ) - rhs( j )*( a( i, j )*temp )
+              end do
+           end do
+           ! apply permutations jpiv to the solution (rhs)
+           call stdlib_zlaswp( 1, rhs, lda, 1, n-1, jpiv, -1 )
+           return
+     end subroutine stdlib_zgesc2
+
+     !> ZGETRF2: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the recursive version of the algorithm. It divides
+     !> the matrix into four submatrices:
+     !> [  A11 | A12  ]  where A11 is n1 by n1 and A22 is n2 by n2
+     !> A = [ -----|----- ]  with n1 = min(m,n)/2
+     !> [  A21 | A22  ]       n2 = n-n1
+     !> [ A11 ]
+     !> The subroutine calls itself to factor [ --- ],
+     !> [ A12 ]
+     !> [ A12 ]
+     !> do the swaps on [ --- ], solve A12, update A22,
+     !> [ A22 ]
+     !> then calls itself to factor A22 and do the swaps on A21.
+
+     pure recursive subroutine stdlib_zgetrf2( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: sfmin
+           complex(dp) :: temp
+           integer(ilp) :: i, iinfo, n1, n2
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRF2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if ( m==1 ) then
+              ! use unblocked code for cone row case
+              ! just need to handle ipiv and info
+              ipiv( 1 ) = 1
+              if ( a(1,1)==czero )info = 1
+           else if( n==1 ) then
+              ! use unblocked code for cone column case
+              ! compute machine safe minimum
+              sfmin = stdlib_dlamch('S')
+              ! find pivot and test for singularity
+              i = stdlib_izamax( m, a( 1, 1 ), 1 )
+              ipiv( 1 ) = i
+              if( a( i, 1 )/=czero ) then
+                 ! apply the interchange
+                 if( i/=1 ) then
+                    temp = a( 1, 1 )
+                    a( 1, 1 ) = a( i, 1 )
+                    a( i, 1 ) = temp
+                 end if
+                 ! compute elements 2:m of the column
+                 if( abs(a( 1, 1 )) >= sfmin ) then
+                    call stdlib_zscal( m-1, cone / a( 1, 1 ), a( 2, 1 ), 1 )
+                 else
+                    do i = 1, m-1
+                       a( 1+i, 1 ) = a( 1+i, 1 ) / a( 1, 1 )
+                    end do
+                 end if
+              else
+                 info = 1
+              end if
+           else
+              ! use recursive code
+              n1 = min( m, n ) / 2
+              n2 = n-n1
+                     ! [ a11 ]
+              ! factor [ --- ]
+                     ! [ a21 ]
+              call stdlib_zgetrf2( m, n1, a, lda, ipiv, iinfo )
+              if ( info==0 .and. iinfo>0 )info = iinfo
+                                    ! [ a12 ]
+              ! apply interchanges to [ --- ]
+                                    ! [ a22 ]
+              call stdlib_zlaswp( n2, a( 1, n1+1 ), lda, 1, n1, ipiv, 1 )
+              ! solve a12
+              call stdlib_ztrsm( 'L', 'L', 'N', 'U', n1, n2, cone, a, lda,a( 1, n1+1 ), lda )
+                        
+              ! update a22
+              call stdlib_zgemm( 'N', 'N', m-n1, n2, n1, -cone, a( n1+1, 1 ), lda,a( 1, n1+1 ), &
+                        lda, cone, a( n1+1, n1+1 ), lda )
+              ! factor a22
+              call stdlib_zgetrf2( m-n1, n2, a( n1+1, n1+1 ), lda, ipiv( n1+1 ),iinfo )
+              ! adjust info and the pivot indices
+              if ( info==0 .and. iinfo>0 )info = iinfo + n1
+              do i = n1+1, min( m, n )
+                 ipiv( i ) = ipiv( i ) + n1
+              end do
+              ! apply interchanges to a21
+              call stdlib_zlaswp( n1, a( 1, 1 ), lda, n1+1, min( m, n), ipiv, 1 )
+           end if
+           return
+     end subroutine stdlib_zgetrf2
+
+     !> ZGETRI: computes the inverse of a matrix using the LU factorization
+     !> computed by ZGETRF.
+     !> This method inverts U and then computes inv(A) by solving the system
+     !> inv(A)*L = inv(U) for inv(A).
+
+     pure subroutine stdlib_zgetri( n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iws, j, jb, jj, jp, ldwork, lwkopt, nb, nbmin, nn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGETRI', ' ', n, -1, -1, -1 )
+           lwkopt = n*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( lda<max( 1, n ) ) then
+              info = -3
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRI', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form inv(u).  if info > 0 from stdlib_ztrtri, then u is singular,
+           ! and the inverse is not computed.
+           call stdlib_ztrtri( 'UPPER', 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = max( ldwork*nb, 1 )
+              if( lwork<iws ) then
+                 nb = lwork / ldwork
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZGETRI', ' ', n, -1, -1, -1 ) )
+              end if
+           else
+              iws = n
+           end if
+           ! solve the equation inv(a)*l = inv(u) for inv(a).
+           if( nb<nbmin .or. nb>=n ) then
+              ! use unblocked code.
+              do j = n, 1, -1
+                 ! copy current column of l to work and replace with zeros.
+                 do i = j + 1, n
+                    work( i ) = a( i, j )
+                    a( i, j ) = czero
+                 end do
+                 ! compute current column of inv(a).
+                 if( j<n )call stdlib_zgemv( 'NO TRANSPOSE', n, n-j, -cone, a( 1, j+1 ),lda, work(&
+                            j+1 ), 1, cone, a( 1, j ), 1 )
+              end do
+           else
+              ! use blocked code.
+              nn = ( ( n-1 ) / nb )*nb + 1
+              do j = nn, 1, -nb
+                 jb = min( nb, n-j+1 )
+                 ! copy current block column of l to work and replace with
+                 ! zeros.
+                 do jj = j, j + jb - 1
+                    do i = jj + 1, n
+                       work( i+( jj-j )*ldwork ) = a( i, jj )
+                       a( i, jj ) = czero
+                    end do
+                 end do
+                 ! compute current block column of inv(a).
+                 if( j+jb<=n )call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, jb,n-j-jb+1, -&
+                           cone, a( 1, j+jb ), lda,work( j+jb ), ldwork, cone, a( 1, j ), lda )
+                 call stdlib_ztrsm( 'RIGHT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, jb,cone, work( j &
+                           ), ldwork, a( 1, j ), lda )
+              end do
+           end if
+           ! apply column interchanges.
+           do j = n - 1, 1, -1
+              jp = ipiv( j )
+              if( jp/=j )call stdlib_zswap( n, a( 1, j ), 1, a( 1, jp ), 1 )
+           end do
+           work( 1 ) = iws
+           return
+     end subroutine stdlib_zgetri
+
+     !> ZGETRS: solves a system of linear equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B
+     !> with a general N-by-N matrix A using the LU factorization computed
+     !> by ZGETRF.
+
+     pure subroutine stdlib_zgetrs( trans, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( notran ) then
+              ! solve a * x = b.
+              ! apply row interchanges to the right hand sides.
+              call stdlib_zlaswp( nrhs, b, ldb, 1, n, ipiv, 1 )
+              ! solve l*x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', n, nrhs,cone, a, lda, b,&
+                         ldb )
+              ! solve u*x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n,nrhs, cone, a, &
+                        lda, b, ldb )
+           else
+              ! solve a**t * x = b  or a**h * x = b.
+              ! solve u**t *x = b or u**h *x = b, overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'UPPER', trans, 'NON-UNIT', n, nrhs, cone,a, lda, b, ldb &
+                        )
+              ! solve l**t *x = b, or l**h *x = b overwriting b with x.
+              call stdlib_ztrsm( 'LEFT', 'LOWER', trans, 'UNIT', n, nrhs, cone, a,lda, b, ldb )
+                        
+              ! apply row interchanges to the solution vectors.
+              call stdlib_zlaswp( nrhs, b, ldb, 1, n, ipiv, -1 )
+           end if
+           return
+     end subroutine stdlib_zgetrs
+
+     !> ZGGHRD: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then ZGGHRD reduces the original
+     !> problem to generalized Hessenberg form.
+
+     pure subroutine stdlib_zgghrd( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: ilq, ilz
+           integer(ilp) :: icompq, icompz, jcol, jrow
+           real(dp) :: c
+           complex(dp) :: ctemp, s
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! decode compq
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              icompq = 0
+           end if
+           ! decode compz
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              icompz = 0
+           end if
+           ! test the input parameters.
+           info = 0
+           if( icompq<=0 ) then
+              info = -1
+           else if( icompz<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( ilq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( ilz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGHRD', -info )
+              return
+           end if
+           ! initialize q and z if desired.
+           if( icompq==3 )call stdlib_zlaset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_zlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! quick return if possible
+           if( n<=1 )return
+           ! zero out lower triangle of b
+           do jcol = 1, n - 1
+              do jrow = jcol + 1, n
+                 b( jrow, jcol ) = czero
+              end do
+           end do
+           ! reduce a and b
+           do jcol = ilo, ihi - 2
+              do jrow = ihi, jcol + 2, -1
+                 ! step 1: rotate rows jrow-1, jrow to kill a(jrow,jcol)
+                 ctemp = a( jrow-1, jcol )
+                 call stdlib_zlartg( ctemp, a( jrow, jcol ), c, s,a( jrow-1, jcol ) )
+                 a( jrow, jcol ) = czero
+                 call stdlib_zrot( n-jcol, a( jrow-1, jcol+1 ), lda,a( jrow, jcol+1 ), lda, c, s )
+                           
+                 call stdlib_zrot( n+2-jrow, b( jrow-1, jrow-1 ), ldb,b( jrow, jrow-1 ), ldb, c, &
+                           s )
+                 if( ilq )call stdlib_zrot( n, q( 1, jrow-1 ), 1, q( 1, jrow ), 1, c,conjg( s ) )
+                           
+                 ! step 2: rotate columns jrow, jrow-1 to kill b(jrow,jrow-1)
+                 ctemp = b( jrow, jrow )
+                 call stdlib_zlartg( ctemp, b( jrow, jrow-1 ), c, s,b( jrow, jrow ) )
+                 b( jrow, jrow-1 ) = czero
+                 call stdlib_zrot( ihi, a( 1, jrow ), 1, a( 1, jrow-1 ), 1, c, s )
+                 call stdlib_zrot( jrow-1, b( 1, jrow ), 1, b( 1, jrow-1 ), 1, c,s )
+                 if( ilz )call stdlib_zrot( n, z( 1, jrow ), 1, z( 1, jrow-1 ), 1, c, s )
+              end do
+           end do
+           return
+     end subroutine stdlib_zgghrd
+
+     !> ZGGQRF: computes a generalized QR factorization of an N-by-M matrix A
+     !> and an N-by-P matrix B:
+     !> A = Q*R,        B = Q*T*Z,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary matrix,
+     !> and R and T assume one of the forms:
+     !> if N >= M,  R = ( R11 ) M  ,   or if N < M,  R = ( R11  R12 ) N,
+     !> (  0  ) N-M                         N   M-N
+     !> M
+     !> where R11 is upper triangular, and
+     !> if N <= P,  T = ( 0  T12 ) N,   or if N > P,  T = ( T11 ) N-P,
+     !> P-N  N                           ( T21 ) P
+     !> P
+     !> where T12 or T21 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GQR factorization
+     !> of A and B implicitly gives the QR factorization of inv(B)*A:
+     !> inv(B)*A = Z**H * (inv(T)*R)
+     !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+     !> conjugate transpose of matrix Z.
+
+     pure subroutine stdlib_zggqrf( n, m, p, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, m, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', n, p, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, m, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( p<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, n, m, p ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGQRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! qr factorization of n-by-m matrix a: a = q*r
+           call stdlib_zgeqrf( n, m, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=dp)
+           ! update b := q**h*b.
+           call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, p, min( n, m ), a,lda, taua, b, &
+                     ldb, work, lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! rq factorization of n-by-p matrix b: b = t*z.
+           call stdlib_zgerqf( n, p, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_zggqrf
+
+     !> ZGGRQF: computes a generalized RQ factorization of an M-by-N matrix A
+     !> and a P-by-N matrix B:
+     !> A = R*Q,        B = Z*T*Q,
+     !> where Q is an N-by-N unitary matrix, Z is a P-by-P unitary
+     !> matrix, and R and T assume one of the forms:
+     !> if M <= N,  R = ( 0  R12 ) M,   or if M > N,  R = ( R11 ) M-N,
+     !> N-M  M                           ( R21 ) N
+     !> N
+     !> where R12 or R21 is upper triangular, and
+     !> if P >= N,  T = ( T11 ) N  ,   or if P < N,  T = ( T11  T12 ) P,
+     !> (  0  ) P-N                         P   N-P
+     !> N
+     !> where T11 is upper triangular.
+     !> In particular, if B is square and nonsingular, the GRQ factorization
+     !> of A and B implicitly gives the RQ factorization of A*inv(B):
+     !> A*inv(B) = (R*inv(T))*Z**H
+     !> where inv(B) denotes the inverse of the matrix B, and Z**H denotes the
+     !> conjugate transpose of the matrix Z.
+
+     pure subroutine stdlib_zggrqf( m, p, n, a, lda, taua, b, ldb, taub, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: taua(*), taub(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkopt, nb, nb1, nb2, nb3
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', p, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, p, -1 )
+           nb = max( nb1, nb2, nb3 )
+           lwkopt = max( n, m, p )*nb
+           work( 1 ) = lwkopt
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( p<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -8
+           else if( lwork<max( 1, m, p, n ) .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGRQF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! rq factorization of m-by-n matrix a: a = r*q
+           call stdlib_zgerqf( m, n, a, lda, taua, work, lwork, info )
+           lopt = real( work( 1 ),KIND=dp)
+           ! update b := b*q**h
+           call stdlib_zunmrq( 'RIGHT', 'CONJUGATE TRANSPOSE', p, n, min( m, n ),a( max( 1, m-n+1 &
+                     ), 1 ), lda, taua, b, ldb, work,lwork, info )
+           lopt = max( lopt, int( work( 1 ),KIND=ilp) )
+           ! qr factorization of p-by-n matrix b: b = z*t
+           call stdlib_zgeqrf( p, n, b, ldb, taub, work, lwork, info )
+           work( 1 ) = max( lopt, int( work( 1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_zggrqf
+
+     !> ZGTTRS: solves one of the systems of equations
+     !> A * X = B,  A**T * X = B,  or  A**H * X = B,
+     !> with a tridiagonal matrix A using the LU factorization computed
+     !> by ZGTTRF.
+
+     pure subroutine stdlib_zgttrs( trans, n, nrhs, dl, d, du, du2, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: itrans, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           notran = ( trans=='N' .or. trans=='N' )
+           if( .not.notran .and. .not.( trans=='T' .or. trans=='T' ) .and. .not.( trans=='C' .or. &
+                     trans=='C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( n, 1 ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! decode trans
+           if( notran ) then
+              itrans = 0
+           else if( trans=='T' .or. trans=='T' ) then
+              itrans = 1
+           else
+              itrans = 2
+           end if
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'ZGTTRS', trans, n, nrhs, -1, -1 ) )
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_zgtts2( itrans, n, nrhs, dl, d, du, du2, ipiv, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_zgtts2( itrans, n, jb, dl, d, du, du2, ipiv, b( 1, j ),ldb )
+              end do
+           end if
+     end subroutine stdlib_zgttrs
+
+     !> ZHB2ST_KERNELS: is an internal routine used by the ZHETRD_HB2ST
+     !> subroutine.
+
+     pure subroutine  stdlib_zhb2st_kernels( uplo, wantz, ttype,st, ed, sweep, n, nb, ib,a, lda, &
+               v, tau, ldvt, work)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: wantz
+           integer(ilp), intent(in) :: ttype, st, ed, sweep, n, nb, ib, lda, ldvt
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: v(*), tau(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, j1, j2, lm, ln, vpos, taupos, dpos, ofdpos, ajeter
+           complex(dp) :: ctmp
+           ! Intrinsic Functions 
+           intrinsic :: conjg,mod
+           ! Executable Statements 
+           ajeter = ib + ldvt
+           upper = stdlib_lsame( uplo, 'U' )
+           if( upper ) then
+               dpos    = 2 * nb + 1
+               ofdpos  = 2 * nb
+           else
+               dpos    = 1
+               ofdpos  = 2
+           endif
+           ! upper case
+           if( upper ) then
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = conjg( a( ofdpos-i, st+i ) )
+                       a( ofdpos-i, st+i ) = czero
+                   end do
+                   ctmp = conjg( a( ofdpos, st ) )
+                   call stdlib_zlarfg( lm, ctmp, v( vpos+1 ), 1,tau( taupos ) )
+                   a( ofdpos, st ) = ctmp
+                   lm = ed - st + 1
+                   call stdlib_zlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_zlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_zlarfx( 'LEFT', ln, lm, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos-nb, j1 ), lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )          =conjg( a( dpos-nb-i, j1+i ) )
+                           a( dpos-nb-i, j1+i ) = czero
+                       end do
+                       ctmp = conjg( a( dpos-nb, j1 ) )
+                       call stdlib_zlarfg( lm, ctmp, v( vpos+1 ), 1, tau( taupos ) )
+                       a( dpos-nb, j1 ) = ctmp
+                       call stdlib_zlarfx( 'RIGHT', ln-1, lm, v( vpos ),tau( taupos ),a( dpos-nb+&
+                                 1, j1 ), lda-1, work)
+                   endif
+               endif
+           ! lower case
+           else
+               if( wantz ) then
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               else
+                   vpos   = mod( sweep-1, 2 ) * n + st
+                   taupos = mod( sweep-1, 2 ) * n + st
+               endif
+               if( ttype==1 ) then
+                   lm = ed - st + 1
+                   v( vpos ) = cone
+                   do i = 1, lm-1
+                       v( vpos+i )         = a( ofdpos+i, st-1 )
+                       a( ofdpos+i, st-1 ) = czero
+                   end do
+                   call stdlib_zlarfg( lm, a( ofdpos, st-1 ), v( vpos+1 ), 1,tau( taupos ) )
+                             
+                   lm = ed - st + 1
+                   call stdlib_zlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==3 ) then
+                   lm = ed - st + 1
+                   call stdlib_zlarfy( uplo, lm, v( vpos ), 1,conjg( tau( taupos ) ),a( dpos, st )&
+                             , lda-1, work)
+               endif
+               if( ttype==2 ) then
+                   j1 = ed+1
+                   j2 = min( ed+nb, n )
+                   ln = ed-st+1
+                   lm = j2-j1+1
+                   if( lm>0) then
+                       call stdlib_zlarfx( 'RIGHT', lm, ln, v( vpos ),tau( taupos ), a( dpos+nb, &
+                                 st ),lda-1, work)
+                       if( wantz ) then
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       else
+                           vpos   = mod( sweep-1, 2 ) * n + j1
+                           taupos = mod( sweep-1, 2 ) * n + j1
+                       endif
+                       v( vpos ) = cone
+                       do i = 1, lm-1
+                           v( vpos+i )        = a( dpos+nb+i, st )
+                           a( dpos+nb+i, st ) = czero
+                       end do
+                       call stdlib_zlarfg( lm, a( dpos+nb, st ), v( vpos+1 ), 1,tau( taupos ) )
+                                 
+                       call stdlib_zlarfx( 'LEFT', lm, ln-1, v( vpos ),conjg( tau( taupos ) ),a( &
+                                 dpos+nb-1, st+1 ), lda-1, work)
+                   endif
+               endif
+           endif
+           return
+     end subroutine stdlib_zhb2st_kernels
+
+     !> ZHEEQUB: computes row and column scalings intended to equilibrate a
+     !> Hermitian matrix A (with respect to the Euclidean norm) and reduce
+     !> its condition number. The scale factors S are computed by the BIN
+     !> algorithm (see references) so that the scaled matrix B with elements
+     !> B(i,j) = S(i)*A(i,j)*S(j) has a condition number within a factor N of
+     !> the smallest possible condition number over all possible diagonal
+     !> scalings.
+
+     pure subroutine stdlib_zheequb( uplo, n, a, lda, s, scond, amax, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: amax, scond
+           character, intent(in) :: uplo
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(out) :: s(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: max_iter = 100
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, iter
+           real(dp) :: avg, std, tol, c0, c1, c2, t, u, si, d, base, smin, smax, smlnum, bignum, &
+                     scale, sumsq
+           logical(lk) :: up
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,int,log,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if ( .not. ( stdlib_lsame( uplo, 'U' ) .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -1
+           else if ( n < 0 ) then
+              info = -2
+           else if ( lda < max( 1, n ) ) then
+              info = -4
+           end if
+           if ( info /= 0 ) then
+              call stdlib_xerbla( 'ZHEEQUB', -info )
+              return
+           end if
+           up = stdlib_lsame( uplo, 'U' )
+           amax = zero
+           ! quick return if possible.
+           if ( n == 0 ) then
+              scond = one
+              return
+           end if
+           do i = 1, n
+              s( i ) = zero
+           end do
+           amax = zero
+           if ( up ) then
+              do j = 1, n
+                 do i = 1, j-1
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+              end do
+           else
+              do j = 1, n
+                 s( j ) = max( s( j ), cabs1( a( j, j ) ) )
+                 amax = max( amax, cabs1( a( j, j ) ) )
+                 do i = j+1, n
+                    s( i ) = max( s( i ), cabs1( a( i, j ) ) )
+                    s( j ) = max( s( j ), cabs1( a( i, j ) ) )
+                    amax = max( amax, cabs1( a( i, j ) ) )
+                 end do
+              end do
+           end if
+           do j = 1, n
+              s( j ) = 1.0_dp / s( j )
+           end do
+           tol = one / sqrt( 2.0_dp * n )
+           do iter = 1, max_iter
+              scale = zero
+              sumsq = zero
+              ! beta = |a|s
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              if ( up ) then
+                 do j = 1, n
+                    do i = 1, j-1
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                 end do
+              else
+                 do j = 1, n
+                    work( j ) = work( j ) + cabs1( a( j, j ) ) * s( j )
+                    do i = j+1, n
+                       work( i ) = work( i ) + cabs1( a( i, j ) ) * s( j )
+                       work( j ) = work( j ) + cabs1( a( i, j ) ) * s( i )
+                    end do
+                 end do
+              end if
+              ! avg = s^t beta / n
+              avg = zero
+              do i = 1, n
+                 avg = avg + real( s( i )*work( i ),KIND=dp)
+              end do
+              avg = avg / n
+              std = zero
+              do i = n+1, 2*n
+                 work( i ) = s( i-n ) * work( i-n ) - avg
+              end do
+              call stdlib_zlassq( n, work( n+1 ), 1, scale, sumsq )
+              std = scale * sqrt( sumsq / n )
+              if ( std < tol * avg ) goto 999
+              do i = 1, n
+                 t = cabs1( a( i, i ) )
+                 si = s( i )
+                 c2 = ( n-1 ) * t
+                 c1 = ( n-2 ) * ( real( work( i ),KIND=dp) - t*si )
+                 c0 = -(t*si)*si + 2 * real( work( i ),KIND=dp) * si - n*avg
+                 d = c1*c1 - 4*c0*c2
+                 if ( d <= 0 ) then
+                    info = -1
+                    return
+                 end if
+                 si = -2*c0 / ( c1 + sqrt( d ) )
+                 d = si - s( i )
+                 u = zero
+                 if ( up ) then
+                    do j = 1, i
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 else
+                    do j = 1, i
+                       t = cabs1( a( i, j ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                    do j = i+1,n
+                       t = cabs1( a( j, i ) )
+                       u = u + s( j )*t
+                       work( j ) = work( j ) + d*t
+                    end do
+                 end if
+                 avg = avg + ( u + real( work( i ),KIND=dp) ) * d / n
+                 s( i ) = si
+              end do
+           end do
+           999 continue
+           smlnum = stdlib_dlamch( 'SAFEMIN' )
+           bignum = one / smlnum
+           smin = bignum
+           smax = zero
+           t = one / sqrt( avg )
+           base = stdlib_dlamch( 'B' )
+           u = one / log( base )
+           do i = 1, n
+              s( i ) = base ** int( u * log( s( i ) * t ),KIND=ilp)
+              smin = min( smin, s( i ) )
+              smax = max( smax, s( i ) )
+           end do
+           scond = max( smin, smlnum ) / min( smax, bignum )
+     end subroutine stdlib_zheequb
+
+     !> ZHEGS2: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H *A*L.
+     !> B must have been previously factorized as U**H *U or L*L**H by ZPOTRF.
+
+     pure subroutine stdlib_zhegs2( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k
+           real(dp) :: akk, bkk
+           complex(dp) :: ct
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGS2', -info )
+              return
+           end if
+           if( itype==1 ) then
+              if( upper ) then
+                 ! compute inv(u**h)*a*inv(u)
+                 do k = 1, n
+                    ! update the upper triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=dp)
+                    bkk = real( b( k, k ),KIND=dp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_zdscal( n-k, one / bkk, a( k, k+1 ), lda )
+                       ct = -half*akk
+                       call stdlib_zlacgv( n-k, a( k, k+1 ), lda )
+                       call stdlib_zlacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_zaxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_zher2( uplo, n-k, -cone, a( k, k+1 ), lda,b( k, k+1 ), ldb, a( &
+                                 k+1, k+1 ), lda )
+                       call stdlib_zaxpy( n-k, ct, b( k, k+1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_zlacgv( n-k, b( k, k+1 ), ldb )
+                       call stdlib_ztrsv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-k, b( k+1, k+&
+                                 1 ), ldb, a( k, k+1 ),lda )
+                       call stdlib_zlacgv( n-k, a( k, k+1 ), lda )
+                    end if
+                 end do
+              else
+                 ! compute inv(l)*a*inv(l**h)
+                 do k = 1, n
+                    ! update the lower triangle of a(k:n,k:n)
+                    akk = real( a( k, k ),KIND=dp)
+                    bkk = real( b( k, k ),KIND=dp)
+                    akk = akk / bkk**2
+                    a( k, k ) = akk
+                    if( k<n ) then
+                       call stdlib_zdscal( n-k, one / bkk, a( k+1, k ), 1 )
+                       ct = -half*akk
+                       call stdlib_zaxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_zher2( uplo, n-k, -cone, a( k+1, k ), 1,b( k+1, k ), 1, a( k+1,&
+                                  k+1 ), lda )
+                       call stdlib_zaxpy( n-k, ct, b( k+1, k ), 1, a( k+1, k ), 1 )
+                       call stdlib_ztrsv( uplo, 'NO TRANSPOSE', 'NON-UNIT', n-k,b( k+1, k+1 ), &
+                                 ldb, a( k+1, k ), 1 )
+                    end if
+                 end do
+              end if
+           else
+              if( upper ) then
+                 ! compute u*a*u**h
+                 do k = 1, n
+                    ! update the upper triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=dp)
+                    bkk = real( b( k, k ),KIND=dp)
+                    call stdlib_ztrmv( uplo, 'NO TRANSPOSE', 'NON-UNIT', k-1, b,ldb, a( 1, k ), 1 &
+                              )
+                    ct = half*akk
+                    call stdlib_zaxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_zher2( uplo, k-1, cone, a( 1, k ), 1, b( 1, k ), 1,a, lda )
+                              
+                    call stdlib_zaxpy( k-1, ct, b( 1, k ), 1, a( 1, k ), 1 )
+                    call stdlib_zdscal( k-1, bkk, a( 1, k ), 1 )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              else
+                 ! compute l**h *a*l
+                 do k = 1, n
+                    ! update the lower triangle of a(1:k,1:k)
+                    akk = real( a( k, k ),KIND=dp)
+                    bkk = real( b( k, k ),KIND=dp)
+                    call stdlib_zlacgv( k-1, a( k, 1 ), lda )
+                    call stdlib_ztrmv( uplo, 'CONJUGATE TRANSPOSE', 'NON-UNIT', k-1,b, ldb, a( k, &
+                              1 ), lda )
+                    ct = half*akk
+                    call stdlib_zlacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_zaxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_zher2( uplo, k-1, cone, a( k, 1 ), lda, b( k, 1 ),ldb, a, lda )
+                              
+                    call stdlib_zaxpy( k-1, ct, b( k, 1 ), ldb, a( k, 1 ), lda )
+                    call stdlib_zlacgv( k-1, b( k, 1 ), ldb )
+                    call stdlib_zdscal( k-1, bkk, a( k, 1 ), lda )
+                    call stdlib_zlacgv( k-1, a( k, 1 ), lda )
+                    a( k, k ) = akk*bkk**2
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zhegs2
+
+     !> ZHEGST: reduces a complex Hermitian-definite generalized
+     !> eigenproblem to standard form.
+     !> If ITYPE = 1, the problem is A*x = lambda*B*x,
+     !> and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H)
+     !> If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or
+     !> B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L.
+     !> B must have been previously factorized as U**H*U or L*L**H by ZPOTRF.
+
+     pure subroutine stdlib_zhegst( itype, uplo, n, a, lda, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: k, kb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZHEGST', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code
+              call stdlib_zhegs2( itype, uplo, n, a, lda, b, ldb, info )
+           else
+              ! use blocked code
+              if( itype==1 ) then
+                 if( upper ) then
+                    ! compute inv(u**h)*a*inv(u)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(k:n,k:n)
+                       call stdlib_zhegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_ztrsm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, &
+                                    n-k-kb+1, cone,b( k, k ), ldb, a( k, k+kb ), lda )
+                          call stdlib_zhemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_zher2k( uplo, 'CONJUGATE TRANSPOSE', n-k-kb+1,kb, -cone, a( &
+                                    k, k+kb ), lda,b( k, k+kb ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_zhemm( 'LEFT', uplo, kb, n-k-kb+1, -chalf,a( k, k ), lda, b(&
+                                     k, k+kb ), ldb,cone, a( k, k+kb ), lda )
+                          call stdlib_ztrsm( 'RIGHT', uplo, 'NO TRANSPOSE','NON-UNIT', kb, n-k-kb+&
+                                    1, cone,b( k+kb, k+kb ), ldb, a( k, k+kb ),lda )
+                       end if
+                    end do
+                 else
+                    ! compute inv(l)*a*inv(l**h)
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(k:n,k:n)
+                       call stdlib_zhegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                       if( k+kb<=n ) then
+                          call stdlib_ztrsm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', n-k-&
+                                    kb+1, kb, cone,b( k, k ), ldb, a( k+kb, k ), lda )
+                          call stdlib_zhemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_zher2k( uplo, 'NO TRANSPOSE', n-k-kb+1, kb,-cone, a( k+kb, &
+                                    k ), lda,b( k+kb, k ), ldb, one,a( k+kb, k+kb ), lda )
+                          call stdlib_zhemm( 'RIGHT', uplo, n-k-kb+1, kb, -chalf,a( k, k ), lda, &
+                                    b( k+kb, k ), ldb,cone, a( k+kb, k ), lda )
+                          call stdlib_ztrsm( 'LEFT', uplo, 'NO TRANSPOSE','NON-UNIT', n-k-kb+1, &
+                                    kb, cone,b( k+kb, k+kb ), ldb, a( k+kb, k ),lda )
+                       end if
+                    end do
+                 end if
+              else
+                 if( upper ) then
+                    ! compute u*a*u**h
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the upper triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_ztrmm( 'LEFT', uplo, 'NO TRANSPOSE', 'NON-UNIT',k-1, kb, cone, &
+                                 b, ldb, a( 1, k ), lda )
+                       call stdlib_zhemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_zher2k( uplo, 'NO TRANSPOSE', k-1, kb, cone,a( 1, k ), lda, b( &
+                                 1, k ), ldb, one, a,lda )
+                       call stdlib_zhemm( 'RIGHT', uplo, k-1, kb, chalf, a( k, k ),lda, b( 1, k ),&
+                                  ldb, cone, a( 1, k ),lda )
+                       call stdlib_ztrmm( 'RIGHT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', k-1, &
+                                 kb, cone, b( k, k ), ldb,a( 1, k ), lda )
+                       call stdlib_zhegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 else
+                    ! compute l**h*a*l
+                    do k = 1, n, nb
+                       kb = min( n-k+1, nb )
+                       ! update the lower triangle of a(1:k+kb-1,1:k+kb-1)
+                       call stdlib_ztrmm( 'RIGHT', uplo, 'NO TRANSPOSE', 'NON-UNIT',kb, k-1, cone,&
+                                  b, ldb, a( k, 1 ), lda )
+                       call stdlib_zhemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_zher2k( uplo, 'CONJUGATE TRANSPOSE', k-1, kb,cone, a( k, 1 ), &
+                                 lda, b( k, 1 ), ldb,one, a, lda )
+                       call stdlib_zhemm( 'LEFT', uplo, kb, k-1, chalf, a( k, k ),lda, b( k, 1 ), &
+                                 ldb, cone, a( k, 1 ),lda )
+                       call stdlib_ztrmm( 'LEFT', uplo, 'CONJUGATE TRANSPOSE','NON-UNIT', kb, k-1,&
+                                  cone, b( k, k ), ldb,a( k, 1 ), lda )
+                       call stdlib_zhegs2( itype, uplo, kb, a( k, k ), lda,b( k, k ), ldb, info )
+                                 
+                    end do
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zhegst
+
+     !> ZHETD2: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_zhetd2( uplo, n, a, lda, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i
+           complex(dp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U')
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETD2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a
+              a( n, n ) = real( a( n, n ),KIND=dp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = a( i, i+1 )
+                 call stdlib_zlarfg( i, alpha, a( 1, i+1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=dp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    a( i, i+1 ) = cone
+                    ! compute  x := tau * a * v  storing x in tau(1:i)
+                    call stdlib_zhemv( uplo, i, taui, a, lda, a( 1, i+1 ), 1, czero,tau, 1 )
+                              
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_zdotc( i, tau, 1, a( 1, i+1 ), 1 )
+                    call stdlib_zaxpy( i, alpha, a( 1, i+1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_zher2( uplo, i, -cone, a( 1, i+1 ), 1, tau, 1, a,lda )
+                 else
+                    a( i, i ) = real( a( i, i ),KIND=dp)
+                 end if
+                 a( i, i+1 ) = e( i )
+                 d( i+1 ) = real( a( i+1, i+1 ),KIND=dp)
+                 tau( i ) = taui
+              end do
+              d( 1 ) = real( a( 1, 1 ),KIND=dp)
+           else
+              ! reduce the lower triangle of a
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=dp)
+              do i = 1, n - 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = a( i+1, i )
+                 call stdlib_zlarfg( n-i, alpha, a( min( i+2, n ), i ), 1, taui )
+                 e( i ) = real( alpha,KIND=dp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    a( i+1, i ) = cone
+                    ! compute  x := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_zhemv( uplo, n-i, taui, a( i+1, i+1 ), lda,a( i+1, i ), 1, czero, &
+                              tau( i ), 1 )
+                    ! compute  w := x - 1/2 * tau * (x**h * v) * v
+                    alpha = -chalf*taui*stdlib_zdotc( n-i, tau( i ), 1, a( i+1, i ),1 )
+                    call stdlib_zaxpy( n-i, alpha, a( i+1, i ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_zher2( uplo, n-i, -cone, a( i+1, i ), 1, tau( i ), 1,a( i+1, i+1 )&
+                              , lda )
+                 else
+                    a( i+1, i+1 ) = real( a( i+1, i+1 ),KIND=dp)
+                 end if
+                 a( i+1, i ) = e( i )
+                 d( i ) = real( a( i, i ),KIND=dp)
+                 tau( i ) = taui
+              end do
+              d( n ) = real( a( n, n ),KIND=dp)
+           end if
+           return
+     end subroutine stdlib_zhetd2
+
+     !> ZHETRD: reduces a complex Hermitian matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_zhetrd( uplo, n, a, lda, d, e, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, iws, j, kk, ldwork, lwkopt, nb, nbmin, nx
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              ! determine the block size.
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           nx = n
+           iws = 1
+           if( nb>1 .and. nb<n ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code).
+              nx = max( nb, stdlib_ilaenv( 3, 'ZHETRD', uplo, n, -1, -1, -1 ) )
+              if( nx<n ) then
+                 ! determine if workspace is large enough for blocked code.
+                 ldwork = n
+                 iws = ldwork*nb
+                 if( lwork<iws ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code by setting nx = n.
+                    nb = max( lwork / ldwork, 1 )
+                    nbmin = stdlib_ilaenv( 2, 'ZHETRD', uplo, n, -1, -1, -1 )
+                    if( nb<nbmin )nx = n
+                 end if
+              else
+                 nx = n
+              end if
+           else
+              nb = 1
+           end if
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! columns 1:kk are handled by the unblocked method.
+              kk = n - ( ( n-nx+nb-1 ) / nb )*nb
+              do i = n - nb + 1, kk + 1, -nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_zlatrd( uplo, i+nb-1, nb, a, lda, e, tau, work,ldwork )
+                 ! update the unreduced submatrix a(1:i-1,1:i-1), using an
+                 ! update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_zher2k( uplo, 'NO TRANSPOSE', i-1, nb, -cone,a( 1, i ), lda, work, &
+                           ldwork, one, a, lda )
+                 ! copy superdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j-1, j ) = e( j-1 )
+                    d( j ) = real( a( j, j ),KIND=dp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_zhetd2( uplo, kk, a, lda, d, e, tau, iinfo )
+           else
+              ! reduce the lower triangle of a
+              do i = 1, n - nx, nb
+                 ! reduce columns i:i+nb-1 to tridiagonal form and form the
+                 ! matrix w which is needed to update the unreduced part of
+                 ! the matrix
+                 call stdlib_zlatrd( uplo, n-i+1, nb, a( i, i ), lda, e( i ),tau( i ), work, &
+                           ldwork )
+                 ! update the unreduced submatrix a(i+nb:n,i+nb:n), using
+                 ! an update of the form:  a := a - v*w**h - w*v**h
+                 call stdlib_zher2k( uplo, 'NO TRANSPOSE', n-i-nb+1, nb, -cone,a( i+nb, i ), lda, &
+                           work( nb+1 ), ldwork, one,a( i+nb, i+nb ), lda )
+                 ! copy subdiagonal elements back into a, and diagonal
+                 ! elements into d
+                 do j = i, i + nb - 1
+                    a( j+1, j ) = e( j )
+                    d( j ) = real( a( j, j ),KIND=dp)
+                 end do
+              end do
+              ! use unblocked code to reduce the last or only block
+              call stdlib_zhetd2( uplo, n-i+1, a( i, i ), lda, d( i ), e( i ),tau( i ), iinfo )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhetrd
+
+     !> ZHETRD_HB2ST: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_zhetrd_hb2st( stage1, vect, uplo, n, kd, ab, ldab,d, e, hous, lhous, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: stage1, uplo, vect
+           integer(ilp), intent(in) :: n, kd, ldab, lhous, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+           complex(dp), intent(out) :: hous(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq, upper, afters1
+           integer(ilp) :: i, m, k, ib, sweepid, myid, shift, stt, st, ed, stind, edind, &
+           blklastind, colpt, thed, stepercol, grsiz, thgrsiz, thgrnb, thgrid, nbtiles, ttype, &
+           tid, nthreads, debug, abdpos, abofdpos, dpos, ofdpos, awpos, inda, indw, apos, sizea, &
+                     lda, indv, indtau, sizev, sizetau, ldv, lhmin, lwmin
+           real(dp) :: abstmp
+           complex(dp) :: tmp
+           ! Intrinsic Functions 
+           intrinsic :: min,max,ceiling,real
+           ! Executable Statements 
+           ! determine the minimal workspace size required.
+           ! test the input parameters
+           debug   = 0
+           info    = 0
+           afters1 = stdlib_lsame( stage1, 'Y' )
+           wantq   = stdlib_lsame( vect, 'V' )
+           upper   = stdlib_lsame( uplo, 'U' )
+           lquery  = ( lwork==-1 ) .or. ( lhous==-1 )
+           ! determine the block size, the workspace size and the hous size.
+           ib     = stdlib_ilaenv2stage( 2, 'ZHETRD_HB2ST', vect, n, kd, -1, -1 )
+           lhmin  = stdlib_ilaenv2stage( 3, 'ZHETRD_HB2ST', vect, n, kd, ib, -1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'ZHETRD_HB2ST', vect, n, kd, ib, -1 )
+           if( .not.afters1 .and. .not.stdlib_lsame( stage1, 'N' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -2
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<(kd+1) ) then
+              info = -7
+           else if( lhous<lhmin .and. .not.lquery ) then
+              info = -11
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              hous( 1 ) = lhmin
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD_HB2ST', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! determine pointer position
+           ldv      = kd + ib
+           sizetau  = 2 * n
+           sizev    = 2 * n
+           indtau   = 1
+           indv     = indtau + sizetau
+           lda      = 2 * kd + 1
+           sizea    = lda * n
+           inda     = 1
+           indw     = inda + sizea
+           nthreads = 1
+           tid      = 0
+           if( upper ) then
+               apos     = inda + kd
+               awpos    = inda
+               dpos     = apos + kd
+               ofdpos   = dpos - 1
+               abdpos   = kd + 1
+               abofdpos = kd
+           else
+               apos     = inda
+               awpos    = inda + kd + 1
+               dpos     = apos
+               ofdpos   = dpos + 1
+               abdpos   = 1
+               abofdpos = 2
+           endif
+           ! case kd=0:
+           ! the matrix is diagonal. we just copy it (convert to "real" for
+           ! complex because d is double and the imaginary part should be 0)
+           ! and store it in d. a sequential code here is better or
+           ! in a parallel environment it might need two cores for d and e
+           if( kd==0 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=dp)
+               end do
+               do i = 1, n-1
+                   e( i ) = rzero
+               end do
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! case kd=1:
+           ! the matrix is already tridiagonal. we have to make diagonal
+           ! and offdiagonal elements real, and store them in d and e.
+           ! for that, for real precision just copy the diag and offdiag
+           ! to d and e while for the complex case the bulge chasing is
+           ! performed to convert the hermetian tridiagonal to symmetric
+           ! tridiagonal. a simpler conversion formula might be used, but then
+           ! updating the q matrix will be required and based if q is generated
+           ! or not this might complicate the story.
+           if( kd==1 ) then
+               do i = 1, n
+                   d( i ) = real( ab( abdpos, i ),KIND=dp)
+               end do
+               ! make off-diagonal elements real and copy them to e
+               if( upper ) then
+                   do i = 1, n - 1
+                       tmp = ab( abofdpos, i+1 )
+                       abstmp = abs( tmp )
+                       ab( abofdpos, i+1 ) = abstmp
+                       e( i ) = abstmp
+                       if( abstmp/=rzero ) then
+                          tmp = tmp / abstmp
+                       else
+                          tmp = cone
+                       end if
+                       if( i<n-1 )ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
+                        ! if( wantz ) then
+                           ! call stdlib_zscal( n, conjg( tmp ), q( 1, i+1 ), 1 )
+                        ! end if
+                   end do
+               else
+                   do i = 1, n - 1
+                      tmp = ab( abofdpos, i )
+                      abstmp = abs( tmp )
+                      ab( abofdpos, i ) = abstmp
+                      e( i ) = abstmp
+                      if( abstmp/=rzero ) then
+                         tmp = tmp / abstmp
+                      else
+                         tmp = cone
+                      end if
+                      if( i<n-1 )ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
+                       ! if( wantq ) then
+                          ! call stdlib_zscal( n, tmp, q( 1, i+1 ), 1 )
+                       ! end if
+                   end do
+               endif
+               hous( 1 ) = 1
+               work( 1 ) = 1
+               return
+           end if
+           ! main code start here.
+           ! reduce the hermitian band of a to a tridiagonal matrix.
+           thgrsiz   = n
+           grsiz     = 1
+           shift     = 3
+           nbtiles   = ceiling( real(n,KIND=dp)/real(kd,KIND=dp) )
+           stepercol = ceiling( real(shift,KIND=dp)/real(grsiz,KIND=dp) )
+           thgrnb    = ceiling( real(n-1,KIND=dp)/real(thgrsiz,KIND=dp) )
+           call stdlib_zlacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
+           call stdlib_zlaset( "A", kd,   n, czero, czero, work( awpos ), lda )
+           ! openmp parallelisation start here
+           !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND ) &
+           !$OMP&         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) &
+           !$OMP&         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND ) &
+           !$OMP&         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK) &
+           !$OMP&         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA ) &
+           !$OMP&         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
+           !$OMP MASTER
+           ! main bulge chasing loop
+           loop_100: do thgrid = 1, thgrnb
+               stt  = (thgrid-1)*thgrsiz+1
+               thed = min( (stt + thgrsiz -1), (n-1))
+               loop_110: do i = stt, n-1
+                   ed = min( i, thed )
+                   if( stt>ed ) exit
+                   loop_120: do m = 1, stepercol
+                       st = stt
+                       loop_130: do sweepid = st, ed
+                           loop_140: do k = 1, grsiz
+                               myid  = (i-sweepid)*(stepercol*grsiz)+ (m-1)*grsiz + k
+                               if ( myid==1 ) then
+                                   ttype = 1
+                               else
+                                   ttype = mod( myid, 2 ) + 2
+                               endif
+                               if( ttype==2 ) then
+                                   colpt      = (myid/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   blklastind = colpt
+                               else
+                                   colpt      = ((myid+1)/2)*kd + sweepid
+                                   stind      = colpt-kd+1
+                                   edind      = min(colpt,n)
+                                   if( ( stind>=edind-1 ).and.( edind==n ) ) then
+                                       blklastind = n
+                                   else
+                                       blklastind = 0
+                                   endif
+                               endif
+                               ! call the kernel
+                               !$ if( ttype/=1 ) then
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(in:WORK(MYID-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   !$ call stdlib_zhb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   !$ sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                   !$ indtau ), ldv,work( indw + tid*kd ) )
+                                   !$OMP END TASK
+                               !$ else
+                                   !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1)) &
+                                   !$OMP&     DEPEND(out:WORK(MYID))
+                                   !$ tid      = omp_get_thread_num()
+                                   call stdlib_zhb2st_kernels( uplo, wantq, ttype,stind, edind, &
+                                   sweepid, n, kd, ib,work ( inda ), lda,hous( indv ), hous( &
+                                             indtau ), ldv,work( indw + tid*kd ) )
+                               !$OMP END TASK
+                               !$ endif
+                               if ( blklastind>=(n-1) ) then
+                                   stt = stt + 1
+                                   exit
+                               endif
+                           end do loop_140
+                       end do loop_130
+                   end do loop_120
+               end do loop_110
+           end do loop_100
+           !$OMP END MASTER
+           !$OMP END PARALLEL
+           ! copy the diagonal from a to d. note that d is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           do i = 1, n
+               d( i ) = real( work( dpos+(i-1)*lda ),KIND=dp)
+           end do
+           ! copy the off diagonal from a to e. note that e is real thus only
+           ! the real part is needed, the imaginary part should be czero.
+           if( upper ) then
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+i*lda ),KIND=dp)
+               end do
+           else
+               do i = 1, n-1
+                  e( i ) = real( work( ofdpos+(i-1)*lda ),KIND=dp)
+               end do
+           endif
+           hous( 1 ) = lhmin
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_zhetrd_hb2st
+
+     !> ZHETRD_HE2HB: reduces a complex Hermitian matrix A to complex Hermitian
+     !> band-diagonal form AB by a unitary similarity transformation:
+     !> Q**H * A * Q = AB.
+
+     pure subroutine stdlib_zhetrd_he2hb( uplo, n, kd, a, lda, ab, ldab, tau,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldab, lwork, n, kd
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: ab(ldab,*), tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: rone = 1.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, j, iinfo, lwmin, pn, pk, lk, ldt, ldw, lds2, lds1, ls2, ls1, lw, lt,&
+                      tpos, wpos, s2pos, s1pos
+           ! Intrinsic Functions 
+           intrinsic :: min,max
+           ! Executable Statements 
+           ! determine the minimal workspace size required
+           ! and test the input parameters
+           info   = 0
+           upper  = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           lwmin  = stdlib_ilaenv2stage( 4, 'ZHETRD_HE2HB', '', n, kd, -1, -1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldab<max( 1, kd+1 ) ) then
+              info = -7
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRD_HE2HB', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwmin
+              return
+           end if
+           ! quick return if possible
+           ! copy the upper/lower portion of a into ab
+           if( n<=kd+1 ) then
+               if( upper ) then
+                   do i = 1, n
+                       lk = min( kd+1, i )
+                       call stdlib_zcopy( lk, a( i-lk+1, i ), 1,ab( kd+1-lk+1, i ), 1 )
+                   end do
+               else
+                   do i = 1, n
+                       lk = min( kd+1, n-i+1 )
+                       call stdlib_zcopy( lk, a( i, i ), 1, ab( 1, i ), 1 )
+                   end do
+               endif
+               work( 1 ) = 1
+               return
+           end if
+           ! determine the pointer position for the workspace
+           ldt    = kd
+           lds1   = kd
+           lt     = ldt*kd
+           lw     = n*kd
+           ls1    = lds1*kd
+           ls2    = lwmin - lt - lw - ls1
+            ! ls2 = n*max(kd,factoptnb)
+           tpos   = 1
+           wpos   = tpos  + lt
+           s1pos  = wpos  + lw
+           s2pos  = s1pos + ls1
+           if( upper ) then
+               ldw    = kd
+               lds2   = kd
+           else
+               ldw    = n
+               lds2   = n
+           endif
+           ! set the workspace of the triangular matrix t to czero once such a
+           ! way every time t is generated the upper/lower portion will be always czero
+           call stdlib_zlaset( "A", ldt, kd, czero, czero, work( tpos ), ldt )
+           if( upper ) then
+               do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the lq factorization of the current block
+                  call stdlib_zgelqf( kd, pn, a( i, i+kd ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_zcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+                  end do
+                  call stdlib_zlaset( 'LOWER', pk, pk, czero, cone,a( i, i+kd ), lda )
+                  ! form the matrix t
+                  call stdlib_zlarft( 'FORWARD', 'ROWWISE', pn, pk,a( i, i+kd ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pn, pk,cone,  work( tpos ), &
+                            ldt,a( i, i+kd ), lda,czero, work( s2pos ), lds2 )
+                  call stdlib_zhemm( 'RIGHT', uplo, pk, pn,cone,  a( i+kd, i+kd ), lda,work( &
+                            s2pos ), lds2,czero, work( wpos ), ldw )
+                  call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE', pk, pk, pn,cone,  work( wpos ), &
+                            ldw,work( s2pos ), lds2,czero, work( s1pos ), lds1 )
+                  call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pk, pn, pk,-chalf, work( &
+                            s1pos ), lds1,a( i, i+kd ), lda,cone,   work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v'*w - w'*v
+                  call stdlib_zher2k( uplo, 'CONJUGATE', pn, pk,-cone, a( i, i+kd ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+               end do
+              ! copy the upper band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_zcopy( lk, a( j, j ), lda, ab( kd+1, j ), ldab-1 )
+              end do
+           else
+               ! reduce the lower triangle of a to lower band matrix
+               loop_40: do i = 1, n - kd, kd
+                  pn = n-i-kd+1
+                  pk = min( n-i-kd+1, kd )
+                  ! compute the qr factorization of the current block
+                  call stdlib_zgeqrf( pn, kd, a( i+kd, i ), lda,tau( i ), work( s2pos ), ls2, &
+                            iinfo )
+                  ! copy the upper portion of a into ab
+                  do j = i, i+pk-1
+                     lk = min( kd, n-j ) + 1
+                     call stdlib_zcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+                  end do
+                  call stdlib_zlaset( 'UPPER', pk, pk, czero, cone,a( i+kd, i ), lda )
+                  ! form the matrix t
+                  call stdlib_zlarft( 'FORWARD', 'COLUMNWISE', pn, pk,a( i+kd, i ), lda, tau( i ),&
+                            work( tpos ), ldt )
+                  ! compute w:
+                  call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,cone, a( i+kd, i )&
+                            , lda,work( tpos ), ldt,czero, work( s2pos ), lds2 )
+                  call stdlib_zhemm( 'LEFT', uplo, pn, pk,cone, a( i+kd, i+kd ), lda,work( s2pos )&
+                            , lds2,czero, work( wpos ), ldw )
+                  call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', pk, pk, pn,cone, work( s2pos ), &
+                            lds2,work( wpos ), ldw,czero, work( s1pos ), lds1 )
+                  call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', pn, pk, pk,-chalf, a( i+kd, &
+                            i ), lda,work( s1pos ), lds1,cone, work( wpos ), ldw )
+                  ! update the unreduced submatrix a(i+kd:n,i+kd:n), using
+                  ! an update of the form:  a := a - v*w' - w*v'
+                  call stdlib_zher2k( uplo, 'NO TRANSPOSE', pn, pk,-cone, a( i+kd, i ), lda,work( &
+                            wpos ), ldw,rone, a( i+kd, i+kd ), lda )
+                  ! ==================================================================
+                  ! restore a for comparison and checking to be removed
+                   ! do 45 j = i, i+pk-1
+                      ! lk = min( kd, n-j ) + 1
+                      ! call stdlib_zcopy( lk, ab( 1, j ), 1, a( j, j ), 1 )
+                      45 continue
+                  ! ==================================================================
+               end do loop_40
+              ! copy the lower band to ab which is the band storage matrix
+              do j = n-kd+1, n
+                 lk = min(kd, n-j) + 1
+                 call stdlib_zcopy( lk, a( j, j ), 1, ab( 1, j ), 1 )
+              end do
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_zhetrd_he2hb
+
+     !> ZHETRF: computes the factorization of a complex Hermitian matrix A
+     !> using the Bunch-Kaufman diagonal pivoting method.  The form of the
+     !> factorization is
+     !> A = U*D*U**H  or  A = L*D*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zhetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF', uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**h using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlahef( uplo, k, nb, kb, a, lda, ipiv, work, n, iinfo )
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zhetf2( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlahef( uplo, n-k+1, nb, kb, a( k, k ), lda, ipiv( k ),work, n, &
+                           iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zhetf2( uplo, n-k+1, a( k, k ), lda, ipiv( k ), iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhetrf
+
+     !> ZHETRF_RK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman (rook) diagonal pivoting method:
+     !> A = P*U*D*(U**H)*(P**T) or A = P*L*D*(L**H)*(P**T),
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+     !> For more information see Further Details section.
+
+     pure subroutine stdlib_zhetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, ip, iws, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF_RK', uplo, n, -1, -1, -1 )
+              lwkopt = n*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_RK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF_RK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef_rk;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 15
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlahef_rk( uplo, k, nb, kb, a, lda, e,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zhetf2_rk( uplo, k, a, lda, e, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k-kb+1:k and apply row permutations to the
+              ! last k+1 colunms k+1:n after that block
+              ! (we can do the simple loop over ipiv with decrement -1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k<n ) then
+                 do i = k, ( k - kb + 1 ), -1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_zswap( n-k, a( i, k+1 ), lda,a( ip, k+1 ), lda )
+                    end if
+                 end do
+              end if
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+              ! this label is the exit from main loop over k decreasing
+              ! from n to 1 in steps of kb
+              15 continue
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef_rk;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 35
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlahef_rk( uplo, n-k+1, nb, kb, a( k, k ), lda, e( k ),ipiv( k ), &
+                           work, ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zhetf2_rk( uplo, n-k+1, a( k, k ), lda, e( k ),ipiv( k ), iinfo )
+                           
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do i = k, k + kb - 1
+                 if( ipiv( i )>0 ) then
+                    ipiv( i ) = ipiv( i ) + k - 1
+                 else
+                    ipiv( i ) = ipiv( i ) - k + 1
+                 end if
+              end do
+              ! apply permutations to the leading panel 1:k-1
+              ! read ipiv from the last block factored, i.e.
+              ! indices  k:k+kb-1 and apply row permutations to the
+              ! first k-1 colunms 1:k-1 before that block
+              ! (we can do the simple loop over ipiv with increment 1,
+              ! since the abs value of ipiv( i ) represents the row index
+              ! of the interchange with row i in both 1x1 and 2x2 pivot cases)
+              if( k>1 ) then
+                 do i = k, ( k + kb - 1 ), 1
+                    ip = abs( ipiv( i ) )
+                    if( ip/=i ) then
+                       call stdlib_zswap( k-1, a( i, 1 ), lda,a( ip, 1 ), lda )
+                    end if
+                 end do
+              end if
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+              ! this label is the exit from main loop over k increasing
+              ! from 1 to n in steps of kb
+              35 continue
+           ! end lower
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhetrf_rk
+
+     !> ZHETRF_ROOK: computes the factorization of a complex Hermitian matrix A
+     !> using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
+     !> The form of the factorization is
+     !> A = U*D*U**T  or  A = L*D*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zhetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: iinfo, iws, j, k, kb, ldwork, lwkopt, nb, nbmin
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              ! determine the block size
+              nb = stdlib_ilaenv( 1, 'ZHETRF_ROOK', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, n*nb )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_ROOK', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           nbmin = 2
+           ldwork = n
+           if( nb>1 .and. nb<n ) then
+              iws = ldwork*nb
+              if( lwork<iws ) then
+                 nb = max( lwork / ldwork, 1 )
+                 nbmin = max( 2, stdlib_ilaenv( 2, 'ZHETRF_ROOK',uplo, n, -1, -1, -1 ) )
+              end if
+           else
+              iws = 1
+           end if
+           if( nb<nbmin )nb = n
+           if( upper ) then
+              ! factorize a as u*d*u**t using the upper triangle of a
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef_rook;
+              ! kb is either nb or nb-1, or k for the last block
+              k = n
+              10 continue
+              ! if k < 1, exit from loop
+              if( k<1 )go to 40
+              if( k>nb ) then
+                 ! factorize columns k-kb+1:k of a and use blocked code to
+                 ! update columns 1:k-kb
+                 call stdlib_zlahef_rook( uplo, k, nb, kb, a, lda,ipiv, work, ldwork, iinfo )
+                           
+              else
+                 ! use unblocked code to factorize columns 1:k of a
+                 call stdlib_zhetf2_rook( uplo, k, a, lda, ipiv, iinfo )
+                 kb = k
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo
+              ! no need to adjust ipiv
+              ! decrease k and return to the start of the main loop
+              k = k - kb
+              go to 10
+           else
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! kb, where kb is the number of columns factorized by stdlib_zlahef_rook;
+              ! kb is either nb or nb-1, or n-k+1 for the last block
+              k = 1
+              20 continue
+              ! if k > n, exit from loop
+              if( k>n )go to 40
+              if( k<=n-nb ) then
+                 ! factorize columns k:k+kb-1 of a and use blocked code to
+                 ! update columns k+kb:n
+                 call stdlib_zlahef_rook( uplo, n-k+1, nb, kb, a( k, k ), lda,ipiv( k ), work, &
+                           ldwork, iinfo )
+              else
+                 ! use unblocked code to factorize columns k:n of a
+                 call stdlib_zhetf2_rook( uplo, n-k+1, a( k, k ), lda, ipiv( k ),iinfo )
+                 kb = n - k + 1
+              end if
+              ! set info on the first occurrence of a zero pivot
+              if( info==0 .and. iinfo>0 )info = iinfo + k - 1
+              ! adjust ipiv
+              do j = k, k + kb - 1
+                 if( ipiv( j )>0 ) then
+                    ipiv( j ) = ipiv( j ) + k - 1
+                 else
+                    ipiv( j ) = ipiv( j ) - k + 1
+                 end if
+              end do
+              ! increase k and return to the start of the main loop
+              k = k + kb
+              go to 20
+           end if
+           40 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhetrf_rook
+
+     !> ZHETRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF.
+
+     pure subroutine stdlib_zhetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(dp) :: s
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( a( k, k ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( a( k, k ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zhetrs
+
+     !> ZHETRS2: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF and converted by ZSYCONV.
+
+     pure subroutine stdlib_zhetrs2( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, j, k, kp
+           real(dp) :: s
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS2', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! convert a
+           call stdlib_zsyconv( uplo, 'C', n, a, lda, ipiv, work, iinfo )
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+             ! p**t * b
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp==-ipiv( k-1 ) )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb &
+                           )
+                 k=k-2
+              end if
+             end do
+        ! compute (u \p**t * b) -> b    [ (u \p**t * b) ]
+             call stdlib_ztrsm('L','U','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (u \p**t * b) ]
+              i=n
+              do while ( i >= 1 )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=dp) / real( a( i, i ),KIND=dp)
+                   call stdlib_zdscal( nrhs, s, b( i, 1 ), ldb )
+                 elseif ( i > 1) then
+                    if ( ipiv(i-1) == ipiv(i) ) then
+                       akm1k = work(i)
+                       akm1 = a( i-1, i-1 ) / akm1k
+                       ak = a( i, i ) / conjg( akm1k )
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i-1, j ) / akm1k
+                          bk = b( i, j ) / conjg( akm1k )
+                          b( i-1, j ) = ( ak*bkm1-bk ) / denom
+                          b( i, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                    i = i - 1
+                    endif
+                 endif
+                 i = i - 1
+              end do
+            ! compute (u**h \ b) -> b   [ u**h \ (d \ (u \p**t * b) ) ]
+              call stdlib_ztrsm('L','U','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (u**h \ (d \ (u \p**t * b) )) ]
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k < n .and. kp==-ipiv( k+1 ) )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp,&
+                            1 ), ldb )
+                 k=k+2
+              endif
+             end do
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+             ! p**t * b
+             k=1
+             do while ( k <= n )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k+1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k+1).
+                 kp = -ipiv( k+1 )
+                 if( kp==-ipiv( k ) )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                           
+                 k=k+2
+              endif
+             end do
+        ! compute (l \p**t * b) -> b    [ (l \p**t * b) ]
+             call stdlib_ztrsm('L','L','N','U',n,nrhs,cone,a,lda,b,ldb)
+        ! compute d \ b -> b   [ d \ (l \p**t * b) ]
+              i=1
+              do while ( i <= n )
+                 if( ipiv(i) > 0 ) then
+                   s = real( cone,KIND=dp) / real( a( i, i ),KIND=dp)
+                   call stdlib_zdscal( nrhs, s, b( i, 1 ), ldb )
+                 else
+                       akm1k = work(i)
+                       akm1 = a( i, i ) / conjg( akm1k )
+                       ak = a( i+1, i+1 ) / akm1k
+                       denom = akm1*ak - cone
+                       do j = 1, nrhs
+                          bkm1 = b( i, j ) / conjg( akm1k )
+                          bk = b( i+1, j ) / akm1k
+                          b( i, j ) = ( ak*bkm1-bk ) / denom
+                          b( i+1, j ) = ( akm1*bk-bkm1 ) / denom
+                       end do
+                       i = i + 1
+                 endif
+                 i = i + 1
+              end do
+        ! compute (l**h \ b) -> b   [ l**h \ (d \ (l \p**t * b) ) ]
+             call stdlib_ztrsm('L','L','C','U',n,nrhs,cone,a,lda,b,ldb)
+             ! p * b  [ p * (l**h \ (d \ (l \p**t * b) )) ]
+             k=n
+             do while ( k >= 1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k=k-1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( k>1 .and. kp==-ipiv( k-1 ) )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, &
+                           1 ), ldb )
+                 k=k-2
+              endif
+             end do
+           end if
+           ! revert a
+           call stdlib_zsyconv( uplo, 'R', n, a, lda, ipiv, work, iinfo )
+           return
+     end subroutine stdlib_zhetrs2
+
+     !> ZHETRS_AA: solves a system of linear equations A*X = B with a complex
+     !> hermitian matrix A using the factorization A = U**H*T*U or
+     !> A = L*T*L**H computed by ZHETRF_AA.
+
+     pure subroutine stdlib_zhetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, nrhs, lda, ldb, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           logical(lk) :: lquery, upper
+           integer(ilp) :: k, kp, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max( 1, 3*n-2 ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_AA', -info )
+              return
+           else if( lquery ) then
+              lwkopt = (3*n-2)
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u**h*t*u.
+              ! 1) forward substitution with u**h
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute u**h \ b -> b    [ (u**h \p**t * b) ]
+                 call stdlib_ztrsm( 'L', 'U', 'C', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b( 2, 1 ),&
+                            ldb )
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (u**h \p**t * b) ]
+              call stdlib_zlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1 )
+              if( n>1 ) then
+                  call stdlib_zlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 2*n ), 1)
+                  call stdlib_zlacpy( 'F', 1, n-1, a( 1, 2 ), lda+1, work( 1 ), 1 )
+                  call stdlib_zlacgv( n-1, work( 1 ), 1 )
+              end if
+              call stdlib_zgtsv( n, nrhs, work(1), work(n), work(2*n), b, ldb,info )
+              ! 3) backward substitution with u
+              if( n>1 ) then
+                 ! compute u \ b -> b   [ u \ (t \ (u**h \p**t * b) ) ]
+                 call stdlib_ztrsm( 'L', 'U', 'N', 'U', n-1, nrhs, cone, a( 1, 2 ),lda, b(2, 1), &
+                           ldb)
+                 ! pivot, p * b  [ p * (u**h \ (t \ (u \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           else
+              ! solve a*x = b, where a = l*t*l**h.
+              ! 1) forward substitution with l
+              if( n>1 ) then
+                 ! pivot, p**t * b -> b
+                 do k = 1, n
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+                 ! compute l \ b -> b    [ (l \p**t * b) ]
+                 call stdlib_ztrsm( 'L', 'L', 'N', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b(2, 1), &
+                           ldb)
+              end if
+              ! 2) solve with triangular matrix t
+              ! compute t \ b -> b   [ t \ (l \p**t * b) ]
+              call stdlib_zlacpy( 'F', 1, n, a(1, 1), lda+1, work(n), 1)
+              if( n>1 ) then
+                  call stdlib_zlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 1 ), 1)
+                  call stdlib_zlacpy( 'F', 1, n-1, a( 2, 1 ), lda+1, work( 2*n ), 1)
+                  call stdlib_zlacgv( n-1, work( 2*n ), 1 )
+              end if
+              call stdlib_zgtsv(n, nrhs, work(1), work(n), work(2*n), b, ldb,info)
+              ! 3) backward substitution with l**h
+              if( n>1 ) then
+                 ! compute l**h \ b -> b   [ l**h \ (t \ (l \p**t * b) ) ]
+                 call stdlib_ztrsm( 'L', 'L', 'C', 'U', n-1, nrhs, cone, a( 2, 1 ),lda, b( 2, 1 ),&
+                            ldb)
+                 ! pivot, p * b  [ p * (l**h \ (t \ (l \p**t * b) )) ]
+                 do k = n, 1, -1
+                    kp = ipiv( k )
+                    if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zhetrs_aa
+
+     !> ZHETRS_ROOK: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF_ROOK.
+
+     pure subroutine stdlib_zhetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kp
+           real(dp) :: s
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max,real
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRS_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( a( k, k ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1)
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb &
+                           )
+                 call stdlib_zgeru( k-2, nrhs, -cone, a( 1, k-1 ), 1, b( k-1, 1 ),ldb, b( 1, 1 ), &
+                           ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k-1, k )
+                 akm1 = a( k-1, k-1 ) / akm1k
+                 ak = a( k, k ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, a( 1, k+1 )&
+                              , 1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, a( k+1, k ), 1, b( k, 1 ),ldb, b( &
+                           k+1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( a( k, k ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k and -ipiv(k), then k+1 and -ipiv(k+1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k+1 )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, a( k+2, k+1 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = a( k+1, k )
+                 akm1 = a( k, k ) / conjg( akm1k )
+                 ak = a( k+1, k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              a( k+1, k-1 ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k), then k-1 and -ipiv(k-1)
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kp = -ipiv( k-1 )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zhetrs_rook
+
+     !> ZHPTRD: reduces a complex Hermitian matrix A stored in packed form to
+     !> real symmetric tridiagonal form T by a unitary similarity
+     !> transformation: Q**H * A * Q = T.
+
+     pure subroutine stdlib_zhptrd( uplo, n, ap, d, e, tau, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, i1, i1i1, ii
+           complex(dp) :: alpha, taui
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 )return
+           if( upper ) then
+              ! reduce the upper triangle of a.
+              ! i1 is the index in ap of a(1,i+1).
+              i1 = n*( n-1 ) / 2 + 1
+              ap( i1+n-1 ) = real( ap( i1+n-1 ),KIND=dp)
+              do i = n - 1, 1, -1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(1:i-1,i+1)
+                 alpha = ap( i1+i-1 )
+                 call stdlib_zlarfg( i, alpha, ap( i1 ), 1, taui )
+                 e( i ) = real( alpha,KIND=dp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(1:i,1:i)
+                    ap( i1+i-1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(1:i)
+                    call stdlib_zhpmv( uplo, i, taui, ap, ap( i1 ), 1, czero, tau,1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_zdotc( i, tau, 1, ap( i1 ), 1 )
+                    call stdlib_zaxpy( i, alpha, ap( i1 ), 1, tau, 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_zhpr2( uplo, i, -cone, ap( i1 ), 1, tau, 1, ap )
+                 end if
+                 ap( i1+i-1 ) = e( i )
+                 d( i+1 ) = real( ap( i1+i ),KIND=dp)
+                 tau( i ) = taui
+                 i1 = i1 - i
+              end do
+              d( 1 ) = real( ap( 1 ),KIND=dp)
+           else
+              ! reduce the lower triangle of a. ii is the index in ap of
+              ! a(i,i) and i1i1 is the index of a(i+1,i+1).
+              ii = 1
+              ap( 1 ) = real( ap( 1 ),KIND=dp)
+              do i = 1, n - 1
+                 i1i1 = ii + n - i + 1
+                 ! generate elementary reflector h(i) = i - tau * v * v**h
+                 ! to annihilate a(i+2:n,i)
+                 alpha = ap( ii+1 )
+                 call stdlib_zlarfg( n-i, alpha, ap( ii+2 ), 1, taui )
+                 e( i ) = real( alpha,KIND=dp)
+                 if( taui/=czero ) then
+                    ! apply h(i) from both sides to a(i+1:n,i+1:n)
+                    ap( ii+1 ) = cone
+                    ! compute  y := tau * a * v  storing y in tau(i:n-1)
+                    call stdlib_zhpmv( uplo, n-i, taui, ap( i1i1 ), ap( ii+1 ), 1,czero, tau( i ),&
+                               1 )
+                    ! compute  w := y - 1/2 * tau * (y**h *v) * v
+                    alpha = -chalf*taui*stdlib_zdotc( n-i, tau( i ), 1, ap( ii+1 ),1 )
+                    call stdlib_zaxpy( n-i, alpha, ap( ii+1 ), 1, tau( i ), 1 )
+                    ! apply the transformation as a rank-2 update:
+                       ! a := a - v * w**h - w * v**h
+                    call stdlib_zhpr2( uplo, n-i, -cone, ap( ii+1 ), 1, tau( i ), 1,ap( i1i1 ) )
+                              
+                 end if
+                 ap( ii+1 ) = e( i )
+                 d( i ) = real( ap( ii ),KIND=dp)
+                 tau( i ) = taui
+                 ii = i1i1
+              end do
+              d( n ) = real( ap( ii ),KIND=dp)
+           end if
+           return
+     end subroutine stdlib_zhptrd
+
+     !> ZHPTRS: solves a system of linear equations A*X = B with a complex
+     !> Hermitian matrix A stored in packed format using the factorization
+     !> A = U*D*U**H or A = L*D*L**H computed by ZHPTRF.
+
+     pure subroutine stdlib_zhptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, k, kc, kp
+           real(dp) :: s
+           complex(dp) :: ak, akm1, akm1k, bk, bkm1, denom
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           if( upper ) then
+              ! solve a*x = b, where a = u*d*u**h.
+              ! first solve u*d*x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              10 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 30
+              kc = kc - k
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 call stdlib_zgeru( k-1, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( ap( kc+k-1 ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k-1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k-1 )call stdlib_zswap( nrhs, b( k-1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(u(k)), where u(k) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 call stdlib_zgeru( k-2, nrhs, -cone, ap( kc ), 1, b( k, 1 ), ldb,b( 1, 1 ), ldb )
+                           
+                 call stdlib_zgeru( k-2, nrhs, -cone, ap( kc-( k-1 ) ), 1,b( k-1, 1 ), ldb, b( 1, &
+                           1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+k-2 )
+                 akm1 = ap( kc-1 ) / akm1k
+                 ak = ap( kc+k-1 ) / conjg( akm1k )
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k-1, j ) / akm1k
+                    bk = b( k, j ) / conjg( akm1k )
+                    b( k-1, j ) = ( ak*bkm1-bk ) / denom
+                    b( k, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc - k + 1
+                 k = k - 2
+              end if
+              go to 10
+              30 continue
+              ! next solve u**h *x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              40 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 50
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(u**h(k)), where u(k) is the transformation
+                 ! stored in column k of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + k
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(u**h(k+1)), where u(k+1) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k>1 ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc ), &
+                              1, cone, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', k-1, nrhs, -cone, b,ldb, ap( kc+k ),&
+                               1, cone, b( k+1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k+1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc + 2*k + 1
+                 k = k + 2
+              end if
+              go to 40
+              50 continue
+           else
+              ! solve a*x = b, where a = l*d*l**h.
+              ! first solve l*d*x = b, overwriting b with x.
+              ! k is the main loop index, increasing from 1 to n in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = 1
+              kc = 1
+              60 continue
+              ! if k > n, exit from loop.
+              if( k>n )go to 80
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n )call stdlib_zgeru( n-k, nrhs, -cone, ap( kc+1 ), 1, b( k, 1 ),ldb, b( k+&
+                           1, 1 ), ldb )
+                 ! multiply by the inverse of the diagonal block.
+                 s = real( cone,KIND=dp) / real( ap( kc ),KIND=dp)
+                 call stdlib_zdscal( nrhs, s, b( k, 1 ), ldb )
+                 kc = kc + n - k + 1
+                 k = k + 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! interchange rows k+1 and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k+1 )call stdlib_zswap( nrhs, b( k+1, 1 ), ldb, b( kp, 1 ), ldb )
+                 ! multiply by inv(l(k)), where l(k) is the transformation
+                 ! stored in columns k and k+1 of a.
+                 if( k<n-1 ) then
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, ap( kc+2 ), 1, b( k, 1 ),ldb, b( k+2, &
+                              1 ), ldb )
+                    call stdlib_zgeru( n-k-1, nrhs, -cone, ap( kc+n-k+2 ), 1,b( k+1, 1 ), ldb, b( &
+                              k+2, 1 ), ldb )
+                 end if
+                 ! multiply by the inverse of the diagonal block.
+                 akm1k = ap( kc+1 )
+                 akm1 = ap( kc ) / conjg( akm1k )
+                 ak = ap( kc+n-k+1 ) / akm1k
+                 denom = akm1*ak - cone
+                 do j = 1, nrhs
+                    bkm1 = b( k, j ) / conjg( akm1k )
+                    bk = b( k+1, j ) / akm1k
+                    b( k, j ) = ( ak*bkm1-bk ) / denom
+                    b( k+1, j ) = ( akm1*bk-bkm1 ) / denom
+                 end do
+                 kc = kc + 2*( n-k ) + 1
+                 k = k + 2
+              end if
+              go to 60
+              80 continue
+              ! next solve l**h *x = b, overwriting b with x.
+              ! k is the main loop index, decreasing from n to 1 in steps of
+              ! 1 or 2, depending on the size of the diagonal blocks.
+              k = n
+              kc = n*( n+1 ) / 2 + 1
+              90 continue
+              ! if k < 1, exit from loop.
+              if( k<1 )go to 100
+              kc = kc - ( n-k+1 )
+              if( ipiv( k )>0 ) then
+                 ! 1 x 1 diagonal block
+                 ! multiply by inv(l**h(k)), where l(k) is the transformation
+                 ! stored in column k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                 end if
+                 ! interchange rows k and ipiv(k).
+                 kp = ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 k = k - 1
+              else
+                 ! 2 x 2 diagonal block
+                 ! multiply by inv(l**h(k-1)), where l(k-1) is the transformation
+                 ! stored in columns k-1 and k of a.
+                 if( k<n ) then
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc+1 ), 1, cone,b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k, nrhs, -cone,b( k+1, 1 ), ldb, &
+                              ap( kc-( n-k ) ), 1, cone,b( k-1, 1 ), ldb )
+                    call stdlib_zlacgv( nrhs, b( k-1, 1 ), ldb )
+                 end if
+                 ! interchange rows k and -ipiv(k).
+                 kp = -ipiv( k )
+                 if( kp/=k )call stdlib_zswap( nrhs, b( k, 1 ), ldb, b( kp, 1 ), ldb )
+                 kc = kc - ( n-k+2 )
+                 k = k - 2
+              end if
+              go to 90
+              100 continue
+           end if
+           return
+     end subroutine stdlib_zhptrs
+
+     !> ZLA_GBRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(dp) function stdlib_zla_gbrcond_c( trans, n, kl, ku, ab,ldab, afb, ldafb, ipiv,c, &
+               capply, info, work,rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, kl, ku, ldab, ldafb
+           integer(ilp) :: kd, ke
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ab(ldab,*), afb(ldafb,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(in) :: c(*)
+           real(dp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, anorm, tmp
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_zla_gbrcond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 .or. kl>n-1 ) then
+              info = -3
+           else if( ku<0 .or. ku>n-1 ) then
+              info = -4
+           else if( ldab<kl+ku+1 ) then
+              info = -6
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_GBRCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           kd = ku + 1
+           ke = kl + 1
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( kd+i-j, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = max( i-kl, 1 ), min( i+ku, n )
+                       tmp = tmp + cabs1( ab( ke-i+j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_zla_gbrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( notrans ) then
+                    call stdlib_zgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 else
+                    call stdlib_zgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv, &
+                              work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_zgbtrs( 'CONJUGATE TRANSPOSE', n, kl, ku, 1, afb,ldafb, ipiv,  &
+                              work, n, info )
+                 else
+                    call stdlib_zgbtrs( 'NO TRANSPOSE', n, kl, ku, 1, afb, ldafb,ipiv, work, n, &
+                              info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_zla_gbrcond_c = one / ainvnm
+           return
+     end function stdlib_zla_gbrcond_c
+
+     !> ZLA_GERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(dp) function stdlib_zla_gercond_c( trans, n, a, lda, af,ldaf, ipiv, c, capply,info, &
+               work, rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(in) :: c(*)
+           real(dp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: notrans
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, anorm, tmp
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_zla_gercond_c = zero
+           info = 0
+           notrans = stdlib_lsame( trans, 'N' )
+           if ( .not. notrans .and. .not. stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_GERCOND_C', -info )
+              return
+           end if
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( notrans ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_zla_gercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if (notrans) then
+                    call stdlib_zgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 else
+                    call stdlib_zgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( notrans ) then
+                    call stdlib_zgetrs( 'CONJUGATE TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info &
+                              )
+                 else
+                    call stdlib_zgetrs( 'NO TRANSPOSE', n, 1, af, ldaf, ipiv,work, n, info )
+                              
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_zla_gercond_c = one / ainvnm
+           return
+     end function stdlib_zla_gercond_c
+
+     !> ZLA_HERCOND_C: computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(dp) function stdlib_zla_hercond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, work,&
+                rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(in) :: c(*)
+           real(dp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase, i, j
+           real(dp) :: ainvnm, anorm, tmp
+           logical(lk) :: up, upper
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_zla_hercond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_HERCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_zla_hercond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_zhetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_zhetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_zhetrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_zhetrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_zla_hercond_c = one / ainvnm
+           return
+     end function stdlib_zla_hercond_c
+
+     !> ZLA_HERPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(dp) function stdlib_zla_herpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(dp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper, lsame
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if (upper) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i,j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i,j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_zsytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ) , work( k+1 ) )
+                    end do
+                    work(k) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_zla_herpvgrw = rpvgrw
+     end function stdlib_zla_herpvgrw
+
+     !> ZLA_PORCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector
+
+     real(dp) function stdlib_zla_porcond_c( uplo, n, a, lda, af,ldaf, c, capply, info,work, &
+               rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(in) :: c(*)
+           real(dp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(dp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,real,aimag
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_zla_porcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_PORCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_zla_porcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_zpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_zpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**h).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_zpotrs( 'U', n, 1, af, ldaf,work, n, info )
+                 else
+                    call stdlib_zpotrs( 'L', n, 1, af, ldaf,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_zla_porcond_c = one / ainvnm
+           return
+     end function stdlib_zla_porcond_c
+
+     !> ZLA_SYRCOND_C: Computes the infinity norm condition number of
+     !> op(A) * inv(diag(C)) where C is a DOUBLE PRECISION vector.
+
+     real(dp) function stdlib_zla_syrcond_c( uplo, n, a, lda, af,ldaf, ipiv, c, capply,info, work,&
+                rwork )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           logical(lk), intent(in) :: capply
+           integer(ilp), intent(in) :: n, lda, ldaf
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(in) :: c(*)
+           real(dp), intent(out) :: rwork(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: kase
+           real(dp) :: ainvnm, anorm, tmp
+           integer(ilp) :: i, j
+           logical(lk) :: up, upper
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           stdlib_zla_syrcond_c = zero
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldaf<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLA_SYRCOND_C', -info )
+              return
+           end if
+           up = .false.
+           if ( stdlib_lsame( uplo, 'U' ) ) up = .true.
+           ! compute norm of op(a)*op2(c).
+           anorm = zero
+           if ( up ) then
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           else
+              do i = 1, n
+                 tmp = zero
+                 if ( capply ) then
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) ) / c( j )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) ) / c( j )
+                    end do
+                 else
+                    do j = 1, i
+                       tmp = tmp + cabs1( a( i, j ) )
+                    end do
+                    do j = i+1, n
+                       tmp = tmp + cabs1( a( j, i ) )
+                    end do
+                 end if
+                 rwork( i ) = tmp
+                 anorm = max( anorm, tmp )
+              end do
+           end if
+           ! quick return if possible.
+           if( n==0 ) then
+              stdlib_zla_syrcond_c = one
+              return
+           else if( anorm == zero ) then
+              return
+           end if
+           ! estimate the norm of inv(op(a)).
+           ainvnm = zero
+           kase = 0
+           10 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==2 ) then
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+                 if ( up ) then
+                    call stdlib_zsytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_zsytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 endif
+                 ! multiply by inv(c).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+              else
+                 ! multiply by inv(c**t).
+                 if ( capply ) then
+                    do i = 1, n
+                       work( i ) = work( i ) * c( i )
+                    end do
+                 end if
+                 if ( up ) then
+                    call stdlib_zsytrs( 'U', n, 1, af, ldaf, ipiv,work, n, info )
+                 else
+                    call stdlib_zsytrs( 'L', n, 1, af, ldaf, ipiv,work, n, info )
+                 end if
+                 ! multiply by r.
+                 do i = 1, n
+                    work( i ) = work( i ) * rwork( i )
+                 end do
+              end if
+              go to 10
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm /= zero )stdlib_zla_syrcond_c = one / ainvnm
+           return
+     end function stdlib_zla_syrcond_c
+
+     !> ZLA_SYRPVGRW: computes the reciprocal pivot growth factor
+     !> norm(A)/norm(U). The "max absolute element" norm is used. If this is
+     !> much less than 1, the stability of the LU factorization of the
+     !> (equilibrated) matrix A could be poor. This also means that the
+     !> solution X, estimated condition numbers, and error bounds could be
+     !> unreliable.
+
+     real(dp) function stdlib_zla_syrpvgrw( uplo, n, info, a, lda, af,ldaf, ipiv, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, info, lda, ldaf
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*)
+           real(dp), intent(out) :: work(*)
+           integer(ilp), intent(in) :: ipiv(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: ncols, i, j, k, kp
+           real(dp) :: amax, umax, rpvgrw, tmp
+           logical(lk) :: upper
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag ( zdum ) )
+           ! Executable Statements 
+           upper = stdlib_lsame( 'UPPER', uplo )
+           if ( info==0 ) then
+              if ( upper ) then
+                 ncols = 1
+              else
+                 ncols = n
+              end if
+           else
+              ncols = info
+           end if
+           rpvgrw = one
+           do i = 1, 2*n
+              work( i ) = zero
+           end do
+           ! find the max magnitude entry of each column of a.  compute the max
+           ! for all n columns so we can apply the pivot permutation while
+           ! looping below.  assume a full factorization is the common case.
+           if ( upper ) then
+              do j = 1, n
+                 do i = 1, j
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           else
+              do j = 1, n
+                 do i = j, n
+                    work( n+i ) = max( cabs1( a( i, j ) ), work( n+i ) )
+                    work( n+j ) = max( cabs1( a( i, j ) ), work( n+j ) )
+                 end do
+              end do
+           end if
+           ! now find the max magnitude entry of each column of u or l.  also
+           ! permute the magnitudes of a above so they're in the same order as
+           ! the factor.
+           ! the iteration orders and permutations were copied from stdlib_zsytrs.
+           ! calls to stdlib_sswap would be severe overkill.
+           if ( upper ) then
+              k = n
+              do while ( k < ncols .and. k>0 )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = 1, k
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k - 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k-1 )
+                    work( n+k-1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = 1, k-1
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k-1 ) =max( cabs1( af( i, k-1 ) ), work( k-1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k - 2
+                 end if
+              end do
+              k = ncols
+              do while ( k <= n )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k + 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k + 2
+                 end if
+              end do
+           else
+              k = 1
+              do while ( k <= ncols )
+                 if ( ipiv( k )>0 ) then
+                    ! 1x1 pivot
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    do i = k, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                    end do
+                    k = k + 1
+                 else
+                    ! 2x2 pivot
+                    kp = -ipiv( k )
+                    tmp = work( n+k+1 )
+                    work( n+k+1 ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    do i = k+1, n
+                       work( k ) = max( cabs1( af( i, k ) ), work( k ) )
+                       work( k+1 ) =max( cabs1( af( i, k+1 ) ), work( k+1 ) )
+                    end do
+                    work( k ) = max( cabs1( af( k, k ) ), work( k ) )
+                    k = k + 2
+                 end if
+              end do
+              k = ncols
+              do while ( k >= 1 )
+                 if ( ipiv( k )>0 ) then
+                    kp = ipiv( k )
+                    if ( kp /= k ) then
+                       tmp = work( n+k )
+                       work( n+k ) = work( n+kp )
+                       work( n+kp ) = tmp
+                    end if
+                    k = k - 1
+                 else
+                    kp = -ipiv( k )
+                    tmp = work( n+k )
+                    work( n+k ) = work( n+kp )
+                    work( n+kp ) = tmp
+                    k = k - 2
+                 endif
+              end do
+           end if
+           ! compute the *inverse* of the max element growth factor.  dividing
+           ! by zero would imply the largest entry of the factor's column is
+           ! zero.  than can happen when either the column of a is zero or
+           ! massive pivots made the factor underflow to zero.  neither counts
+           ! as growth in itself, so simply ignore terms with zero
+           ! denominators.
+           if ( upper ) then
+              do i = ncols, n
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           else
+              do i = 1, ncols
+                 umax = work( i )
+                 amax = work( n+i )
+                 if ( umax /= zero ) then
+                    rpvgrw = min( amax / umax, rpvgrw )
+                 end if
+              end do
+           end if
+           stdlib_zla_syrpvgrw = rpvgrw
+     end function stdlib_zla_syrpvgrw
+
+     !> ZLABRD: reduces the first NB rows and columns of a complex general
+     !> m by n matrix A to upper or lower real bidiagonal form by a unitary
+     !> transformation Q**H * A * P, and returns the matrices X and Y which
+     !> are needed to apply the transformation to the unreduced part of A.
+     !> If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
+     !> bidiagonal form.
+     !> This is an auxiliary routine called by ZGEBRD
+
+     pure subroutine stdlib_zlabrd( m, n, nb, a, lda, d, e, tauq, taup, x, ldx, y,ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, ldx, ldy, m, n, nb
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: taup(*), tauq(*), x(ldx,*), y(ldy,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( m<=0 .or. n<=0 )return
+           if( m>=n ) then
+              ! reduce to upper bidiagonal form
+              loop_10: do i = 1, nb
+                 ! update a(i:m,i)
+                 call stdlib_zlacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_zgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, a( i, 1 ),lda, y( i, 1 ), &
+                           ldy, cone, a( i, i ), 1 )
+                 call stdlib_zlacgv( i-1, y( i, 1 ), ldy )
+                 call stdlib_zgemv( 'NO TRANSPOSE', m-i+1, i-1, -cone, x( i, 1 ),ldx, a( 1, i ), &
+                           1, cone, a( i, i ), 1 )
+                 ! generate reflection q(i) to annihilate a(i+1:m,i)
+                 alpha = a( i, i )
+                 call stdlib_zlarfg( m-i+1, alpha, a( min( i+1, m ), i ), 1,tauq( i ) )
+                 d( i ) = real( alpha,KIND=dp)
+                 if( i<n ) then
+                    a( i, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i+1, n-i, cone,a( i, i+1 ), lda, &
+                              a( i, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,a( i, 1 ), lda, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i+1, i-1, cone,x( i, 1 ), ldx, a( &
+                              i, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, y(&
+                               1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_zscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                    ! update a(i,i+1:n)
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                    call stdlib_zlacgv( i, a( i, 1 ), lda )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-i, i, -cone, y( i+1, 1 ),ldy, a( i, 1 ), &
+                              lda, cone, a( i, i+1 ), lda )
+                    call stdlib_zlacgv( i, a( i, 1 ), lda )
+                    call stdlib_zlacgv( i-1, x( i, 1 ), ldx )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i-1, n-i, -cone,a( 1, i+1 ), lda, x(&
+                               i, 1 ), ldx, cone,a( i, i+1 ), lda )
+                    call stdlib_zlacgv( i-1, x( i, 1 ), ldx )
+                    ! generate reflection p(i) to annihilate a(i,i+2:n)
+                    alpha = a( i, i+1 )
+                    call stdlib_zlarfg( n-i, alpha, a( i, min( i+2, n ) ), lda,taup( i ) )
+                    e( i ) = real( alpha,KIND=dp)
+                    a( i, i+1 ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, n-i, cone, a( i+1, i+1 ),lda, a( i, i+&
+                              1 ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-i, i, cone,y( i+1, 1 ), ldy, a( i,&
+                               i+1 ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i, -cone, a( i+1, 1 ),lda, x( 1, i ), &
+                              1, cone, x( i+1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', i-1, n-i, cone, a( 1, i+1 ),lda, a( i, i+1 &
+                              ), lda, czero, x( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_zscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_zlacgv( n-i, a( i, i+1 ), lda )
+                 end if
+              end do loop_10
+           else
+              ! reduce to lower bidiagonal form
+              loop_20: do i = 1, nb
+                 ! update a(i,i:n)
+                 call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+                 call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-i+1, i-1, -cone, y( i, 1 ),ldy, a( i, 1 ), &
+                           lda, cone, a( i, i ), lda )
+                 call stdlib_zlacgv( i-1, a( i, 1 ), lda )
+                 call stdlib_zlacgv( i-1, x( i, 1 ), ldx )
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i-1, n-i+1, -cone,a( 1, i ), lda, x( i,&
+                            1 ), ldx, cone, a( i, i ),lda )
+                 call stdlib_zlacgv( i-1, x( i, 1 ), ldx )
+                 ! generate reflection p(i) to annihilate a(i,i+1:n)
+                 alpha = a( i, i )
+                 call stdlib_zlarfg( n-i+1, alpha, a( i, min( i+1, n ) ), lda,taup( i ) )
+                 d( i ) = real( alpha,KIND=dp)
+                 if( i<m ) then
+                    a( i, i ) = cone
+                    ! compute x(i+1:m,i)
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, n-i+1, cone, a( i+1, i ),lda, a( i, i &
+                              ), lda, czero, x( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-i+1, i-1, cone,y( i, 1 ), ldy, a( &
+                              i, i ), lda, czero,x( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', i-1, n-i+1, cone, a( 1, i ),lda, a( i, i ),&
+                               lda, czero, x( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i-1, -cone, x( i+1, 1 ),ldx, x( 1, i )&
+                              , 1, cone, x( i+1, i ), 1 )
+                    call stdlib_zscal( m-i, taup( i ), x( i+1, i ), 1 )
+                    call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+                    ! update a(i+1:m,i)
+                    call stdlib_zlacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i-1, -cone, a( i+1, 1 ),lda, y( i, 1 )&
+                              , ldy, cone, a( i+1, i ), 1 )
+                    call stdlib_zlacgv( i-1, y( i, 1 ), ldy )
+                    call stdlib_zgemv( 'NO TRANSPOSE', m-i, i, -cone, x( i+1, 1 ),ldx, a( 1, i ), &
+                              1, cone, a( i+1, i ), 1 )
+                    ! generate reflection q(i) to annihilate a(i+2:m,i)
+                    alpha = a( i+1, i )
+                    call stdlib_zlarfg( m-i, alpha, a( min( i+2, m ), i ), 1,tauq( i ) )
+                    e( i ) = real( alpha,KIND=dp)
+                    a( i+1, i ) = cone
+                    ! compute y(i+1:n,i)
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i, n-i, cone,a( i+1, i+1 ), lda, &
+                              a( i+1, i ), 1, czero,y( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i, i-1, cone,a( i+1, 1 ), lda, a( &
+                              i+1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_zgemv( 'NO TRANSPOSE', n-i, i-1, -cone, y( i+1, 1 ),ldy, y( 1, i )&
+                              , 1, cone, y( i+1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-i, i, cone,x( i+1, 1 ), ldx, a( i+&
+                              1, i ), 1, czero,y( 1, i ), 1 )
+                    call stdlib_zgemv( 'CONJUGATE TRANSPOSE', i, n-i, -cone,a( 1, i+1 ), lda, y( &
+                              1, i ), 1, cone,y( i+1, i ), 1 )
+                    call stdlib_zscal( n-i, tauq( i ), y( i+1, i ), 1 )
+                 else
+                    call stdlib_zlacgv( n-i+1, a( i, i ), lda )
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_zlabrd
+
+     !> ZLAED7: computes the updated eigensystem of a diagonal
+     !> matrix after modification by a rank-one symmetric matrix. This
+     !> routine is used only for the eigenproblem which requires all
+     !> eigenvalues and optionally eigenvectors of a dense or banded
+     !> Hermitian matrix that has been reduced to tridiagonal form.
+     !> T = Q(in) ( D(in) + RHO * Z*Z**H ) Q**H(in) = Q(out) * D(out) * Q**H(out)
+     !> where Z = Q**Hu, u is a vector of length N with ones in the
+     !> CUTPNT and CUTPNT + 1 th elements and zeros elsewhere.
+     !> The eigenvectors of the original matrix are stored in Q, and the
+     !> eigenvalues are in D.  The algorithm consists of three stages:
+     !> The first stage consists of deflating the size of the problem
+     !> when there are multiple eigenvalues or if there is a zero in
+     !> the Z vector.  For each such occurrence the dimension of the
+     !> secular equation problem is reduced by one.  This stage is
+     !> performed by the routine DLAED2.
+     !> The second stage consists of calculating the updated
+     !> eigenvalues. This is done by finding the roots of the secular
+     !> equation via the routine DLAED4 (as called by SLAED3).
+     !> This routine also calculates the eigenvectors of the current
+     !> problem.
+     !> The final stage consists of computing the updated eigenvectors
+     !> directly using the updated eigenvalues.  The eigenvectors for
+     !> the current problem are multiplied with the eigenvectors from
+     !> the overall problem.
+
+     pure subroutine stdlib_zlaed7( n, cutpnt, qsiz, tlvls, curlvl, curpbm, d, q,ldq, rho, indxq, &
+               qstore, qptr, prmptr, perm,givptr, givcol, givnum, work, rwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: curlvl, curpbm, cutpnt, ldq, n, qsiz, tlvls
+           integer(ilp), intent(out) :: info
+           real(dp), intent(inout) :: rho
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: givcol(2,*), givptr(*), perm(*), prmptr(*), qptr(*)
+                     
+           integer(ilp), intent(out) :: indxq(*), iwork(*)
+           real(dp), intent(inout) :: d(*), givnum(2,*), qstore(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: q(ldq,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: coltyp, curr, i, idlmda, indx, indxc, indxp, iq, iw, iz, k, n1, n2, &
+                     ptr
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq<0 .or. icompq>1 ) then
+              ! info = -1
+           ! else if( n<0 ) then
+           if( n<0 ) then
+              info = -1
+           else if( min( 1, n )>cutpnt .or. n<cutpnt ) then
+              info = -2
+           else if( qsiz<n ) then
+              info = -3
+           else if( ldq<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED7', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! the following values are for bookkeeping purposes only.  they are
+           ! integer pointers which indicate the portion of the workspace
+           ! used by a particular array in stdlib_dlaed2 and stdlib_slaed3.
+           iz = 1
+           idlmda = iz + n
+           iw = idlmda + n
+           iq = iw + n
+           indx = 1
+           indxc = indx + n
+           coltyp = indxc + n
+           indxp = coltyp + n
+           ! form the z-vector which consists of the last row of q_1 and the
+           ! first row of q_2.
+           ptr = 1 + 2**tlvls
+           do i = 1, curlvl - 1
+              ptr = ptr + 2**( tlvls-i )
+           end do
+           curr = ptr + curpbm
+           call stdlib_dlaeda( n, tlvls, curlvl, curpbm, prmptr, perm, givptr,givcol, givnum, &
+                     qstore, qptr, rwork( iz ),rwork( iz+n ), info )
+           ! when solving the final problem, we no longer need the stored data,
+           ! so we will overwrite the data from this level onto the previously
+           ! used storage space.
+           if( curlvl==tlvls ) then
+              qptr( curr ) = 1
+              prmptr( curr ) = 1
+              givptr( curr ) = 1
+           end if
+           ! sort and deflate eigenvalues.
+           call stdlib_zlaed8( k, n, qsiz, q, ldq, d, rho, cutpnt, rwork( iz ),rwork( idlmda ), &
+           work, qsiz, rwork( iw ),iwork( indxp ), iwork( indx ), indxq,perm( prmptr( curr ) ), &
+           givptr( curr+1 ),givcol( 1, givptr( curr ) ),givnum( 1, givptr( curr ) ), info )
+                     
+           prmptr( curr+1 ) = prmptr( curr ) + n
+           givptr( curr+1 ) = givptr( curr+1 ) + givptr( curr )
+           ! solve secular equation.
+           if( k/=0 ) then
+              call stdlib_dlaed9( k, 1, k, n, d, rwork( iq ), k, rho,rwork( idlmda ), rwork( iw ),&
+                        qstore( qptr( curr ) ), k, info )
+              call stdlib_zlacrm( qsiz, k, work, qsiz, qstore( qptr( curr ) ), k, q,ldq, rwork( &
+                        iq ) )
+              qptr( curr+1 ) = qptr( curr ) + k**2
+              if( info/=0 ) then
+                 return
+              end if
+           ! prepare the indxq sorting premutation.
+              n1 = k
+              n2 = n - k
+              call stdlib_dlamrg( n1, n2, d, 1, -1, indxq )
+           else
+              qptr( curr+1 ) = qptr( curr )
+              do i = 1, n
+                 indxq( i ) = i
+              end do
+           end if
+           return
+     end subroutine stdlib_zlaed7
+
+     !> ZLAEIN: uses inverse iteration to find a right or left eigenvector
+     !> corresponding to the eigenvalue W of a complex upper Hessenberg
+     !> matrix H.
+
+     pure subroutine stdlib_zlaein( rightv, noinit, n, h, ldh, w, v, b, ldb, rwork,eps3, smlnum, &
+               info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: noinit, rightv
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldh, n
+           real(dp), intent(in) :: eps3, smlnum
+           complex(dp), intent(in) :: w
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: b(ldb,*)
+           complex(dp), intent(in) :: h(ldh,*)
+           complex(dp), intent(inout) :: v(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: tenth = 1.0e-1_dp
+           
+           
+           ! Local Scalars 
+           character :: normin, trans
+           integer(ilp) :: i, ierr, its, j
+           real(dp) :: growto, nrmsml, rootn, rtemp, scale, vnorm
+           complex(dp) :: cdum, ei, ej, temp, x
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! growto is the threshold used in the acceptance test for an
+           ! eigenvector.
+           rootn = sqrt( real( n,KIND=dp) )
+           growto = tenth / rootn
+           nrmsml = max( one, eps3*rootn )*smlnum
+           ! form b = h - w*i (except that the subdiagonal elements are not
+           ! stored).
+           do j = 1, n
+              do i = 1, j - 1
+                 b( i, j ) = h( i, j )
+              end do
+              b( j, j ) = h( j, j ) - w
+           end do
+           if( noinit ) then
+              ! initialize v.
+              do i = 1, n
+                 v( i ) = eps3
+              end do
+           else
+              ! scale supplied initial vector.
+              vnorm = stdlib_dznrm2( n, v, 1 )
+              call stdlib_zdscal( n, ( eps3*rootn ) / max( vnorm, nrmsml ), v, 1 )
+           end if
+           if( rightv ) then
+              ! lu decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do i = 1, n - 1
+                 ei = h( i+1, i )
+                 if( cabs1( b( i, i ) )<cabs1( ei ) ) then
+                    ! interchange rows and eliminate.
+                    x = stdlib_zladiv( b( i, i ), ei )
+                    b( i, i ) = ei
+                    do j = i + 1, n
+                       temp = b( i+1, j )
+                       b( i+1, j ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( i, i )==czero )b( i, i ) = eps3
+                    x = stdlib_zladiv( ei, b( i, i ) )
+                    if( x/=czero ) then
+                       do j = i + 1, n
+                          b( i+1, j ) = b( i+1, j ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( n, n )==czero )b( n, n ) = eps3
+              trans = 'N'
+           else
+              ! ul decomposition with partial pivoting of b, replacing czero
+              ! pivots by eps3.
+              do j = n, 2, -1
+                 ej = h( j, j-1 )
+                 if( cabs1( b( j, j ) )<cabs1( ej ) ) then
+                    ! interchange columns and eliminate.
+                    x = stdlib_zladiv( b( j, j ), ej )
+                    b( j, j ) = ej
+                    do i = 1, j - 1
+                       temp = b( i, j-1 )
+                       b( i, j-1 ) = b( i, j ) - x*temp
+                       b( i, j ) = temp
+                    end do
+                 else
+                    ! eliminate without interchange.
+                    if( b( j, j )==czero )b( j, j ) = eps3
+                    x = stdlib_zladiv( ej, b( j, j ) )
+                    if( x/=czero ) then
+                       do i = 1, j - 1
+                          b( i, j-1 ) = b( i, j-1 ) - x*b( i, j )
+                       end do
+                    end if
+                 end if
+              end do
+              if( b( 1, 1 )==czero )b( 1, 1 ) = eps3
+              trans = 'C'
+           end if
+           normin = 'N'
+           do its = 1, n
+              ! solve u*x = scale*v for a right eigenvector
+                ! or u**h *x = scale*v for a left eigenvector,
+              ! overwriting x on v.
+              call stdlib_zlatrs( 'UPPER', trans, 'NONUNIT', normin, n, b, ldb, v,scale, rwork, &
+                        ierr )
+              normin = 'Y'
+              ! test for sufficient growth in the norm of v.
+              vnorm = stdlib_dzasum( n, v, 1 )
+              if( vnorm>=growto*scale )go to 120
+              ! choose new orthogonal starting vector and try again.
+              rtemp = eps3 / ( rootn+one )
+              v( 1 ) = eps3
+              do i = 2, n
+                 v( i ) = rtemp
+              end do
+              v( n-its+1 ) = v( n-its+1 ) - eps3*rootn
+           end do
+           ! failure to find eigenvector in n iterations.
+           info = 1
+           120 continue
+           ! normalize eigenvector.
+           i = stdlib_izamax( n, v, 1 )
+           call stdlib_zdscal( n, one / cabs1( v( i ) ), v, 1 )
+           return
+     end subroutine stdlib_zlaein
+
+     !> ZLAGS2: computes 2-by-2 unitary matrices U, V and Q, such
+     !> that if ( UPPER ) then
+     !> U**H *A*Q = U**H *( A1 A2 )*Q = ( x  0  )
+     !> ( 0  A3 )     ( x  x  )
+     !> and
+     !> V**H*B*Q = V**H *( B1 B2 )*Q = ( x  0  )
+     !> ( 0  B3 )     ( x  x  )
+     !> or if ( .NOT.UPPER ) then
+     !> U**H *A*Q = U**H *( A1 0  )*Q = ( x  x  )
+     !> ( A2 A3 )     ( 0  x  )
+     !> and
+     !> V**H *B*Q = V**H *( B1 0  )*Q = ( x  x  )
+     !> ( B2 B3 )     ( 0  x  )
+     !> where
+     !> U = (   CSU    SNU ), V = (  CSV    SNV ),
+     !> ( -SNU**H  CSU )      ( -SNV**H CSV )
+     !> Q = (   CSQ    SNQ )
+     !> ( -SNQ**H  CSQ )
+     !> The rows of the transformed A and B are parallel. Moreover, if the
+     !> input 2-by-2 matrix A is not zero, then the transformed (1,1) entry
+     !> of A is not zero. If the input matrices A and B are both not zero,
+     !> then the transformed (2,2) element of B is not zero, except when the
+     !> first rows of input A and B are parallel and the second rows are
+     !> zero.
+
+     pure subroutine stdlib_zlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu, csv,snv, csq, snq )
+               
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: upper
+           real(dp), intent(in) :: a1, a3, b1, b3
+           real(dp), intent(out) :: csq, csu, csv
+           complex(dp), intent(in) :: a2, b2
+           complex(dp), intent(out) :: snq, snu, snv
+        ! =====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: a, aua11, aua12, aua21, aua22, avb12, avb11, avb21, avb22, csl, csr, d, fb,&
+                      fc, s1, s2, snl, snr, ua11r, ua22r, vb11r, vb22r
+           complex(dp) :: b, c, d1, r, t, ua11, ua12, ua21, ua22, vb11, vb12, vb21, vb22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( t ) = abs( real( t,KIND=dp) ) + abs( aimag( t ) )
+           ! Executable Statements 
+           if( upper ) then
+              ! input matrices a and b are upper triangular matrices
+              ! form matrix c = a*adj(b) = ( a b )
+                                         ! ( 0 d )
+              a = a1*b3
+              d = a3*b1
+              b = a2*b1 - a1*b2
+              fb = abs( b )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(1,d1).
+              d1 = one
+              if( fb/=zero )d1 = b / fb
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a b )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( 0 d ) ( -snr  csr )   ( 0 t )
+              call stdlib_dlasv2( a, fb, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csl )>=abs( snl ) .or. abs( csr )>=abs( snr ) )then
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11r = csl*a1
+                 ua12 = csl*a2 + d1*snl*a3
+                 vb11r = csr*b1
+                 vb12 = csr*b2 + d1*snr*b3
+                 aua12 = abs( csl )*abs1( a2 ) + abs( snl )*abs( a3 )
+                 avb12 = abs( csr )*abs1( b2 ) + abs( snr )*abs( b3 )
+                 ! zero (1,2) elements of u**h *a and v**h *b
+                 if( ( abs( ua11r )+abs1( ua12 ) )==zero ) then
+                    call stdlib_zlartg( -cmplx( vb11r,KIND=dp), conjg( vb12 ), csq, snq,r )
+                              
+                 else if( ( abs( vb11r )+abs1( vb12 ) )==zero ) then
+                    call stdlib_zlartg( -cmplx( ua11r,KIND=dp), conjg( ua12 ), csq, snq,r )
+                              
+                 else if( aua12 / ( abs( ua11r )+abs1( ua12 ) )<=avb12 /( abs( vb11r )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_zlartg( -cmplx( ua11r,KIND=dp), conjg( ua12 ), csq, snq,r )
+                              
+                 else
+                    call stdlib_zlartg( -cmplx( vb11r,KIND=dp), conjg( vb12 ), csq, snq,r )
+                              
+                 end if
+                 csu = csl
+                 snu = -d1*snl
+                 csv = csr
+                 snv = -d1*snr
+              else
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,2) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -conjg( d1 )*snl*a1
+                 ua22 = -conjg( d1 )*snl*a2 + csl*a3
+                 vb21 = -conjg( d1 )*snr*b1
+                 vb22 = -conjg( d1 )*snr*b2 + csr*b3
+                 aua22 = abs( snl )*abs1( a2 ) + abs( csl )*abs( a3 )
+                 avb22 = abs( snr )*abs1( b2 ) + abs( csr )*abs( b3 )
+                 ! zero (2,2) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua21 )+abs1( ua22 ) )==zero ) then
+                    call stdlib_zlartg( -conjg( vb21 ), conjg( vb22 ), csq, snq,r )
+                 else if( ( abs1( vb21 )+abs( vb22 ) )==zero ) then
+                    call stdlib_zlartg( -conjg( ua21 ), conjg( ua22 ), csq, snq,r )
+                 else if( aua22 / ( abs1( ua21 )+abs1( ua22 ) )<=avb22 /( abs1( vb21 )+abs1( vb22 &
+                           ) ) ) then
+                    call stdlib_zlartg( -conjg( ua21 ), conjg( ua22 ), csq, snq,r )
+                 else
+                    call stdlib_zlartg( -conjg( vb21 ), conjg( vb22 ), csq, snq,r )
+                 end if
+                 csu = snl
+                 snu = d1*csl
+                 csv = snr
+                 snv = d1*csr
+              end if
+           else
+              ! input matrices a and b are lower triangular matrices
+              ! form matrix c = a*adj(b) = ( a 0 )
+                                         ! ( c d )
+              a = a1*b3
+              d = a3*b1
+              c = a2*b3 - a3*b2
+              fc = abs( c )
+              ! transform complex 2-by-2 matrix c to real matrix by unitary
+              ! diagonal matrix diag(d1,1).
+              d1 = one
+              if( fc/=zero )d1 = c / fc
+              ! the svd of real 2 by 2 triangular c
+               ! ( csl -snl )*( a 0 )*(  csr  snr ) = ( r 0 )
+               ! ( snl  csl ) ( c d ) ( -snr  csr )   ( 0 t )
+              call stdlib_dlasv2( a, fc, d, s1, s2, snr, csr, snl, csl )
+              if( abs( csr )>=abs( snr ) .or. abs( csl )>=abs( snl ) )then
+                 ! compute the (2,1) and (2,2) elements of u**h *a and v**h *b,
+                 ! and (2,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua21 = -d1*snr*a1 + csr*a2
+                 ua22r = csr*a3
+                 vb21 = -d1*snl*b1 + csl*b2
+                 vb22r = csl*b3
+                 aua21 = abs( snr )*abs( a1 ) + abs( csr )*abs1( a2 )
+                 avb21 = abs( snl )*abs( b1 ) + abs( csl )*abs1( b2 )
+                 ! zero (2,1) elements of u**h *a and v**h *b.
+                 if( ( abs1( ua21 )+abs( ua22r ) )==zero ) then
+                    call stdlib_zlartg( cmplx( vb22r,KIND=dp), vb21, csq, snq, r )
+                 else if( ( abs1( vb21 )+abs( vb22r ) )==zero ) then
+                    call stdlib_zlartg( cmplx( ua22r,KIND=dp), ua21, csq, snq, r )
+                 else if( aua21 / ( abs1( ua21 )+abs( ua22r ) )<=avb21 /( abs1( vb21 )+abs( vb22r &
+                           ) ) ) then
+                    call stdlib_zlartg( cmplx( ua22r,KIND=dp), ua21, csq, snq, r )
+                 else
+                    call stdlib_zlartg( cmplx( vb22r,KIND=dp), vb21, csq, snq, r )
+                 end if
+                 csu = csr
+                 snu = -conjg( d1 )*snr
+                 csv = csl
+                 snv = -conjg( d1 )*snl
+              else
+                 ! compute the (1,1) and (1,2) elements of u**h *a and v**h *b,
+                 ! and (1,1) element of |u|**h *|a| and |v|**h *|b|.
+                 ua11 = csr*a1 + conjg( d1 )*snr*a2
+                 ua12 = conjg( d1 )*snr*a3
+                 vb11 = csl*b1 + conjg( d1 )*snl*b2
+                 vb12 = conjg( d1 )*snl*b3
+                 aua11 = abs( csr )*abs( a1 ) + abs( snr )*abs1( a2 )
+                 avb11 = abs( csl )*abs( b1 ) + abs( snl )*abs1( b2 )
+                 ! zero (1,1) elements of u**h *a and v**h *b, and then swap.
+                 if( ( abs1( ua11 )+abs1( ua12 ) )==zero ) then
+                    call stdlib_zlartg( vb12, vb11, csq, snq, r )
+                 else if( ( abs1( vb11 )+abs1( vb12 ) )==zero ) then
+                    call stdlib_zlartg( ua12, ua11, csq, snq, r )
+                 else if( aua11 / ( abs1( ua11 )+abs1( ua12 ) )<=avb11 /( abs1( vb11 )+abs1( vb12 &
+                           ) ) ) then
+                    call stdlib_zlartg( ua12, ua11, csq, snq, r )
+                 else
+                    call stdlib_zlartg( vb12, vb11, csq, snq, r )
+                 end if
+                 csu = snr
+                 snu = conjg( d1 )*csr
+                 csv = snl
+                 snv = conjg( d1 )*csl
+              end if
+           end if
+           return
+     end subroutine stdlib_zlags2
+
+     !> ZLAHQR: is an auxiliary routine called by CHSEQR to update the
+     !> eigenvalues and Schur decomposition already computed by CHSEQR, by
+     !> dealing with the Hessenberg submatrix in rows and columns ILO to
+     !> IHI.
+
+     pure subroutine stdlib_zlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, info &
+               )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: w(*)
+        ! =========================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0_dp
+           real(dp), parameter :: rone = 1.0_dp
+           real(dp), parameter :: dat1 = 3.0_dp/4.0_dp
+           integer(ilp), parameter :: kexsh = 10
+           
+           
+           
+           
+           ! Local Scalars 
+           complex(dp) :: cdum, h11, h11s, h22, sc, sum, t, t1, temp, u, v2, x, y
+           real(dp) :: aa, ab, ba, bb, h10, h21, rtemp, s, safmax, safmin, smlnum, sx, t2, tst, &
+                     ulp
+           integer(ilp) :: i, i1, i2, its, itmax, j, jhi, jlo, k, l, m, nh, nz, kdefl
+           ! Local Arrays 
+           complex(dp) :: v(2)
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,sqrt
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( n==0 )return
+           if( ilo==ihi ) then
+              w( ilo ) = h( ilo, ilo )
+              return
+           end if
+           ! ==== clear out the trash ====
+           do j = ilo, ihi - 3
+              h( j+2, j ) = czero
+              h( j+3, j ) = czero
+           end do
+           if( ilo<=ihi-2 )h( ihi, ihi-2 ) = czero
+           ! ==== ensure that subdiagonal entries are real ====
+           if( wantt ) then
+              jlo = 1
+              jhi = n
+           else
+              jlo = ilo
+              jhi = ihi
+           end if
+           do i = ilo + 1, ihi
+              if( aimag( h( i, i-1 ) )/=rzero ) then
+                 ! ==== the following redundant normalization
+                 ! .    avoids problems with both gradual and
+                 ! .    sudden underflow in abs(h(i,i-1)) ====
+                 sc = h( i, i-1 ) / cabs1( h( i, i-1 ) )
+                 sc = conjg( sc ) / abs( sc )
+                 h( i, i-1 ) = abs( h( i, i-1 ) )
+                 call stdlib_zscal( jhi-i+1, sc, h( i, i ), ldh )
+                 call stdlib_zscal( min( jhi, i+1 )-jlo+1, conjg( sc ),h( jlo, i ), 1 )
+                 if( wantz )call stdlib_zscal( ihiz-iloz+1, conjg( sc ), z( iloz, i ), 1 )
+              end if
+           end do
+           nh = ihi - ilo + 1
+           nz = ihiz - iloz + 1
+           ! set machine-dependent constants for the stopping criterion.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( nh,KIND=dp) / ulp )
+           ! i1 and i2 are the indices of the first row and last column of h
+           ! to which transformations must be applied. if eigenvalues only are
+           ! being computed, i1 and i2 are set inside the main loop.
+           if( wantt ) then
+              i1 = 1
+              i2 = n
+           end if
+           ! itmax is the total number of qr iterations allowed.
+           itmax = 30 * max( 10, nh )
+           ! kdefl counts the number of iterations since a deflation
+           kdefl = 0
+           ! the main loop begins here. i is the loop index and decreases from
+           ! ihi to ilo in steps of 1. each iteration of the loop works
+           ! with the active submatrix in rows and columns l to i.
+           ! eigenvalues i+1 to ihi have already converged. either l = ilo, or
+           ! h(l,l-1) is negligible so that the matrix splits.
+           i = ihi
+           30 continue
+           if( i<ilo )go to 150
+           ! perform qr iterations on rows and columns ilo to i until a
+           ! submatrix of order 1 splits off at the bottom because a
+           ! subdiagonal element has become negligible.
+           l = ilo
+           loop_130: do its = 0, itmax
+              ! look for a single small subdiagonal element.
+              do k = i, l + 1, -1
+                 if( cabs1( h( k, k-1 ) )<=smlnum )go to 50
+                 tst = cabs1( h( k-1, k-1 ) ) + cabs1( h( k, k ) )
+                 if( tst==czero ) then
+                    if( k-2>=ilo )tst = tst + abs( real( h( k-1, k-2 ),KIND=dp) )
+                    if( k+1<=ihi )tst = tst + abs( real( h( k+1, k ),KIND=dp) )
+                 end if
+                 ! ==== the following is a conservative small subdiagonal
+                 ! .    deflation criterion due to ahues
+                 ! .    1997). it has better mathematical foundation and
+                 ! .    improves accuracy in some examples.  ====
+                 if( abs( real( h( k, k-1 ),KIND=dp) )<=ulp*tst ) then
+                    ab = max( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    ba = min( cabs1( h( k, k-1 ) ), cabs1( h( k-1, k ) ) )
+                    aa = max( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    bb = min( cabs1( h( k, k ) ),cabs1( h( k-1, k-1 )-h( k, k ) ) )
+                    s = aa + ab
+                    if( ba*( ab / s )<=max( smlnum,ulp*( bb*( aa / s ) ) ) )go to 50
+                 end if
+              end do
+              50 continue
+              l = k
+              if( l>ilo ) then
+                 ! h(l,l-1) is negligible
+                 h( l, l-1 ) = czero
+              end if
+              ! exit from loop if a submatrix of order 1 has split off.
+              if( l>=i )go to 140
+              kdefl = kdefl + 1
+              ! now the active submatrix is in rows and columns l to i. if
+              ! eigenvalues only are being computed, only the active submatrix
+              ! need be transformed.
+              if( .not.wantt ) then
+                 i1 = l
+                 i2 = i
+              end if
+              if( mod(kdefl,2*kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( i, i-1 ),KIND=dp) )
+                 t = s + h( i, i )
+              else if( mod(kdefl,kexsh)==0 ) then
+                 ! exceptional shift.
+                 s = dat1*abs( real( h( l+1, l ),KIND=dp) )
+                 t = s + h( l, l )
+              else
+                 ! wilkinson's shift.
+                 t = h( i, i )
+                 u = sqrt( h( i-1, i ) )*sqrt( h( i, i-1 ) )
+                 s = cabs1( u )
+                 if( s/=rzero ) then
+                    x = half*( h( i-1, i-1 )-t )
+                    sx = cabs1( x )
+                    s = max( s, cabs1( x ) )
+                    y = s*sqrt( ( x / s )**2+( u / s )**2 )
+                    if( sx>rzero ) then
+                       if( real( x / sx,KIND=dp)*real( y,KIND=dp)+aimag( x / sx )*aimag( y )&
+                                 <rzero )y = -y
+                    end if
+                    t = t - u*stdlib_zladiv( u, ( x+y ) )
+                 end if
+              end if
+              ! look for two consecutive small subdiagonal elements.
+              do m = i - 1, l + 1, -1
+                 ! determine the effect of starting the single-shift qr
+                 ! iteration at row m, and see if this would make h(m,m-1)
+                 ! negligible.
+                 h11 = h( m, m )
+                 h22 = h( m+1, m+1 )
+                 h11s = h11 - t
+                 h21 = real( h( m+1, m ),KIND=dp)
+                 s = cabs1( h11s ) + abs( h21 )
+                 h11s = h11s / s
+                 h21 = h21 / s
+                 v( 1 ) = h11s
+                 v( 2 ) = h21
+                 h10 = real( h( m, m-1 ),KIND=dp)
+                 if( abs( h10 )*abs( h21 )<=ulp*( cabs1( h11s )*( cabs1( h11 )+cabs1( h22 ) ) ) )&
+                           go to 70
+              end do
+              h11 = h( l, l )
+              h22 = h( l+1, l+1 )
+              h11s = h11 - t
+              h21 = real( h( l+1, l ),KIND=dp)
+              s = cabs1( h11s ) + abs( h21 )
+              h11s = h11s / s
+              h21 = h21 / s
+              v( 1 ) = h11s
+              v( 2 ) = h21
+              70 continue
+              ! single-shift qr step
+              loop_120: do k = m, i - 1
+                 ! the first iteration of this loop determines a reflection g
+                 ! from the vector v and applies it from left and right to h,
+                 ! thus creating a nonzero bulge below the subdiagonal.
+                 ! each subsequent iteration determines a reflection g to
+                 ! restore the hessenberg form in the (k-1)th column, and thus
+                 ! chases the bulge cone step toward the bottom of the active
+                 ! submatrix.
+                 ! v(2) is always real before the call to stdlib_zlarfg, and hence
+                 ! after the call t2 ( = t1*v(2) ) is also real.
+                 if( k>m )call stdlib_zcopy( 2, h( k, k-1 ), 1, v, 1 )
+                 call stdlib_zlarfg( 2, v( 1 ), v( 2 ), 1, t1 )
+                 if( k>m ) then
+                    h( k, k-1 ) = v( 1 )
+                    h( k+1, k-1 ) = czero
+                 end if
+                 v2 = v( 2 )
+                 t2 = real( t1*v2,KIND=dp)
+                 ! apply g from the left to transform the rows of the matrix
+                 ! in columns k to i2.
+                 do j = k, i2
+                    sum = conjg( t1 )*h( k, j ) + t2*h( k+1, j )
+                    h( k, j ) = h( k, j ) - sum
+                    h( k+1, j ) = h( k+1, j ) - sum*v2
+                 end do
+                 ! apply g from the right to transform the columns of the
+                 ! matrix in rows i1 to min(k+2,i).
+                 do j = i1, min( k+2, i )
+                    sum = t1*h( j, k ) + t2*h( j, k+1 )
+                    h( j, k ) = h( j, k ) - sum
+                    h( j, k+1 ) = h( j, k+1 ) - sum*conjg( v2 )
+                 end do
+                 if( wantz ) then
+                    ! accumulate transformations in the matrix z
+                    do j = iloz, ihiz
+                       sum = t1*z( j, k ) + t2*z( j, k+1 )
+                       z( j, k ) = z( j, k ) - sum
+                       z( j, k+1 ) = z( j, k+1 ) - sum*conjg( v2 )
+                    end do
+                 end if
+                 if( k==m .and. m>l ) then
+                    ! if the qr step was started at row m > l because two
+                    ! consecutive small subdiagonals were found, then extra
+                    ! scaling must be performed to ensure that h(m,m-1) remains
+                    ! real.
+                    temp = cone - t1
+                    temp = temp / abs( temp )
+                    h( m+1, m ) = h( m+1, m )*conjg( temp )
+                    if( m+2<=i )h( m+2, m+1 ) = h( m+2, m+1 )*temp
+                    do j = m, i
+                       if( j/=m+1 ) then
+                          if( i2>j )call stdlib_zscal( i2-j, temp, h( j, j+1 ), ldh )
+                          call stdlib_zscal( j-i1, conjg( temp ), h( i1, j ), 1 )
+                          if( wantz ) then
+                             call stdlib_zscal( nz, conjg( temp ), z( iloz, j ),1 )
+                          end if
+                       end if
+                    end do
+                 end if
+              end do loop_120
+              ! ensure that h(i,i-1) is real.
+              temp = h( i, i-1 )
+              if( aimag( temp )/=rzero ) then
+                 rtemp = abs( temp )
+                 h( i, i-1 ) = rtemp
+                 temp = temp / rtemp
+                 if( i2>i )call stdlib_zscal( i2-i, conjg( temp ), h( i, i+1 ), ldh )
+                 call stdlib_zscal( i-i1, temp, h( i1, i ), 1 )
+                 if( wantz ) then
+                    call stdlib_zscal( nz, temp, z( iloz, i ), 1 )
+                 end if
+              end if
+           end do loop_130
+           ! failure to converge in remaining number of iterations
+           info = i
+           return
+           140 continue
+           ! h(i,i-1) is negligible: cone eigenvalue has converged.
+           w( i ) = h( i, i )
+           ! reset deflation counter
+           kdefl = 0
+           ! return to start of the main loop with new value of i.
+           i = l - 1
+           go to 30
+           150 continue
+           return
+     end subroutine stdlib_zlahqr
+
+     !> ZLAHR2: reduces the first NB columns of A complex general n-BY-(n-k+1)
+     !> matrix A so that elements below the k-th subdiagonal are zero. The
+     !> reduction is performed by an unitary similarity transformation
+     !> Q**H * A * Q. The routine returns the matrices V and T which determine
+     !> Q as a block reflector I - V*T*V**H, and also the matrix Y = A * V * T.
+     !> This is an auxiliary routine called by ZGEHRD.
+
+     pure subroutine stdlib_zlahr2( n, k, nb, a, lda, tau, t, ldt, y, ldy )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: k, lda, ldt, ldy, n, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,nb), tau(nb), y(ldy,nb)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           complex(dp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 )return
+           loop_10: do i = 1, nb
+              if( i>1 ) then
+                 ! update a(k+1:n,i)
+                 ! update i-th column of a - y * v**h
+                 call stdlib_zlacgv( i-1, a( k+i-1, 1 ), lda )
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-k, i-1, -cone, y(k+1,1), ldy,a( k+i-1, 1 ), &
+                           lda, cone, a( k+1, i ), 1 )
+                 call stdlib_zlacgv( i-1, a( k+i-1, 1 ), lda )
+                 ! apply i - v * t**h * v**h to this column (call it b) from the
+                 ! left, using the last column of t as workspace
+                 ! let  v = ( v1 )   and   b = ( b1 )   (first i-1 rows)
+                          ! ( v2 )             ( b2 )
+                 ! where v1 is unit lower triangular
+                 ! w := v1**h * b1
+                 call stdlib_zcopy( i-1, a( k+1, i ), 1, t( 1, nb ), 1 )
+                 call stdlib_ztrmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'UNIT',i-1, a( k+1, 1 ),lda, &
+                           t( 1, nb ), 1 )
+                 ! w := w + v2**h * b2
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ),lda, a( &
+                           k+i, i ), 1, cone, t( 1, nb ), 1 )
+                 ! w := t**h * w
+                 call stdlib_ztrmv( 'UPPER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, &
+                           nb ), 1 )
+                 ! b2 := b2 - v2*w
+                 call stdlib_zgemv( 'NO TRANSPOSE', n-k-i+1, i-1, -cone,a( k+i, 1 ),lda, t( 1, nb &
+                           ), 1, cone, a( k+i, i ), 1 )
+                 ! b1 := b1 - v1*w
+                 call stdlib_ztrmv( 'LOWER', 'NO TRANSPOSE','UNIT', i-1,a( k+1, 1 ), lda, t( 1, &
+                           nb ), 1 )
+                 call stdlib_zaxpy( i-1, -cone, t( 1, nb ), 1, a( k+1, i ), 1 )
+                 a( k+i-1, i-1 ) = ei
+              end if
+              ! generate the elementary reflector h(i) to annihilate
+              ! a(k+i+1:n,i)
+              call stdlib_zlarfg( n-k-i+1, a( k+i, i ), a( min( k+i+1, n ), i ), 1,tau( i ) )
+                        
+              ei = a( k+i, i )
+              a( k+i, i ) = cone
+              ! compute  y(k+1:n,i)
+              call stdlib_zgemv( 'NO TRANSPOSE', n-k, n-k-i+1,cone, a( k+1, i+1 ),lda, a( k+i, i )&
+                        , 1, czero, y( k+1, i ), 1 )
+              call stdlib_zgemv( 'CONJUGATE TRANSPOSE', n-k-i+1, i-1,cone, a( k+i, 1 ), lda,a( k+&
+                        i, i ), 1, czero, t( 1, i ), 1 )
+              call stdlib_zgemv( 'NO TRANSPOSE', n-k, i-1, -cone,y( k+1, 1 ), ldy,t( 1, i ), 1, &
+                        cone, y( k+1, i ), 1 )
+              call stdlib_zscal( n-k, tau( i ), y( k+1, i ), 1 )
+              ! compute t(1:i,i)
+              call stdlib_zscal( i-1, -tau( i ), t( 1, i ), 1 )
+              call stdlib_ztrmv( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT',i-1, t, ldt,t( 1, i ), 1 )
+                        
+              t( i, i ) = tau( i )
+           end do loop_10
+           a( k+nb, nb ) = ei
+           ! compute y(1:k,1:nb)
+           call stdlib_zlacpy( 'ALL', k, nb, a( 1, 2 ), lda, y, ldy )
+           call stdlib_ztrmm( 'RIGHT', 'LOWER', 'NO TRANSPOSE','UNIT', k, nb,cone, a( k+1, 1 ), &
+                     lda, y, ldy )
+           if( n>k+nb )call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', k,nb, n-k-nb, cone,a( 1,&
+                      2+nb ), lda, a( k+1+nb, 1 ), lda, cone, y,ldy )
+           call stdlib_ztrmm( 'RIGHT', 'UPPER', 'NO TRANSPOSE','NON-UNIT', k, nb,cone, t, ldt, y, &
+                     ldy )
+           return
+     end subroutine stdlib_zlahr2
+
+     !> ZLALS0: applies back the multiplying factors of either the left or the
+     !> right singular vector matrix of a diagonal matrix appended by a row
+     !> to the right hand side matrix B in solving the least squares problem
+     !> using the divide-and-conquer SVD approach.
+     !> For the left singular vector matrix, three types of orthogonal
+     !> matrices are involved:
+     !> (1L) Givens rotations: the number of such rotations is GIVPTR; the
+     !> pairs of columns/rows they were applied to are stored in GIVCOL;
+     !> and the C- and S-values of these rotations are stored in GIVNUM.
+     !> (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+     !> row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+     !> J-th row.
+     !> (3L) The left singular vector matrix of the remaining matrix.
+     !> For the right singular vector matrix, four types of orthogonal
+     !> matrices are involved:
+     !> (1R) The right singular vector matrix of the remaining matrix.
+     !> (2R) If SQRE = 1, one extra Givens rotation to generate the right
+     !> null space.
+     !> (3R) The inverse transformation of (2L).
+     !> (4R) The inverse transformation of (1L).
+
+     pure subroutine stdlib_zlals0( icompq, nl, nr, sqre, nrhs, b, ldb, bx, ldbx,perm, givptr, &
+               givcol, ldgcol, givnum, ldgnum,poles, difl, difr, z, k, c, s, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: givptr, icompq, k, ldb, ldbx, ldgcol, ldgnum, nl, nr, nrhs,&
+                      sqre
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: c, s
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), perm(*)
+           real(dp), intent(in) :: difl(*), difr(ldgnum,*), givnum(ldgnum,*), poles(ldgnum,*), z(&
+                     *)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, jcol, jrow, m, n, nlp1
+           real(dp) :: diflj, difrj, dj, dsigj, dsigjp, temp
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           n = nl + nr + 1
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( nl<1 ) then
+              info = -2
+           else if( nr<1 ) then
+              info = -3
+           else if( ( sqre<0 ) .or. ( sqre>1 ) ) then
+              info = -4
+           else if( nrhs<1 ) then
+              info = -5
+           else if( ldb<n ) then
+              info = -7
+           else if( ldbx<n ) then
+              info = -9
+           else if( givptr<0 ) then
+              info = -11
+           else if( ldgcol<n ) then
+              info = -13
+           else if( ldgnum<n ) then
+              info = -15
+           else if( k<1 ) then
+              info = -20
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALS0', -info )
+              return
+           end if
+           m = n + sqre
+           nlp1 = nl + 1
+           if( icompq==0 ) then
+              ! apply back orthogonal transformations from the left.
+              ! step (1l): apply back the givens rotations performed.
+              do i = 1, givptr
+                 call stdlib_zdrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),givnum( i, 1 ) )
+              end do
+              ! step (2l): permute rows of b.
+              call stdlib_zcopy( nrhs, b( nlp1, 1 ), ldb, bx( 1, 1 ), ldbx )
+              do i = 2, n
+                 call stdlib_zcopy( nrhs, b( perm( i ), 1 ), ldb, bx( i, 1 ), ldbx )
+              end do
+              ! step (3l): apply the inverse of the left singular vector
+              ! matrix to bx.
+              if( k==1 ) then
+                 call stdlib_zcopy( nrhs, bx, ldbx, b, ldb )
+                 if( z( 1 )<zero ) then
+                    call stdlib_zdscal( nrhs, negone, b, ldb )
+                 end if
+              else
+                 loop_100: do j = 1, k
+                    diflj = difl( j )
+                    dj = poles( j, 1 )
+                    dsigj = -poles( j, 2 )
+                    if( j<k ) then
+                       difrj = -difr( j, 1 )
+                       dsigjp = -poles( j+1, 2 )
+                    end if
+                    if( ( z( j )==zero ) .or. ( poles( j, 2 )==zero ) )then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -poles( j, 2 )*z( j ) / diflj /( poles( j, 2 )+dj )
+                    end if
+                    do i = 1, j - 1
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_dlamc3( poles( i, 2 ), &
+                                    dsigj )-diflj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( ( z( i )==zero ) .or.( poles( i, 2 )==zero ) ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = poles( i, 2 )*z( i ) /( stdlib_dlamc3( poles( i, 2 ), &
+                                    dsigjp )+difrj ) / ( poles( i, 2 )+dj )
+                       end if
+                    end do
+                    rwork( 1 ) = negone
+                    temp = stdlib_dnrm2( k, rwork, 1 )
+                    ! since b and bx are complex, the following call to stdlib_dgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_dgemv( 't', k, nrhs, one, bx, ldbx, work, 1, zero,
+          ! $                     b( j, 1 ), ldb )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( bx( jrow, jcol ),KIND=dp)
+                       end do
+                    end do
+                    call stdlib_dgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( bx( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_dgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       b( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=dp)
+                    end do
+                    call stdlib_zlascl( 'G', 0, 0, temp, one, 1, nrhs, b( j, 1 ),ldb, info )
+                              
+                 end do loop_100
+              end if
+              ! move the deflated rows of bx to b also.
+              if( k<max( m, n ) )call stdlib_zlacpy( 'A', n-k, nrhs, bx( k+1, 1 ), ldbx,b( k+1, 1 &
+                        ), ldb )
+           else
+              ! apply back the right orthogonal transformations.
+              ! step (1r): apply back the new right singular vector matrix
+              ! to b.
+              if( k==1 ) then
+                 call stdlib_zcopy( nrhs, b, ldb, bx, ldbx )
+              else
+                 loop_180: do j = 1, k
+                    dsigj = poles( j, 2 )
+                    if( z( j )==zero ) then
+                       rwork( j ) = zero
+                    else
+                       rwork( j ) = -z( j ) / difl( j ) /( dsigj+poles( j, 1 ) ) / difr( j, 2 )
+                                 
+                    end if
+                    do i = 1, j - 1
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_dlamc3( dsigj, -poles( i+1,2 ) )-difr( i,&
+                                     1 ) ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    do i = j + 1, k
+                       if( z( j )==zero ) then
+                          rwork( i ) = zero
+                       else
+                          rwork( i ) = z( j ) / ( stdlib_dlamc3( dsigj, -poles( i,2 ) )-difl( i ) &
+                                    ) /( dsigj+poles( i, 1 ) ) / difr( i, 2 )
+                       end if
+                    end do
+                    ! since b and bx are complex, the following call to stdlib_dgemv
+                    ! is performed in two steps (real and imaginary parts).
+                    ! call stdlib_dgemv( 't', k, nrhs, one, b, ldb, work, 1, zero,
+          ! $                     bx( j, 1 ), ldbx )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = real( b( jrow, jcol ),KIND=dp)
+                       end do
+                    end do
+                    call stdlib_dgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k ), 1 )
+                    i = k + nrhs*2
+                    do jcol = 1, nrhs
+                       do jrow = 1, k
+                          i = i + 1
+                          rwork( i ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_dgemv( 'T', k, nrhs, one, rwork( 1+k+nrhs*2 ), k,rwork( 1 ), 1, &
+                              zero, rwork( 1+k+nrhs ), 1 )
+                    do jcol = 1, nrhs
+                       bx( j, jcol ) = cmplx( rwork( jcol+k ),rwork( jcol+k+nrhs ),KIND=dp)
+                                 
+                    end do
+                 end do loop_180
+              end if
+              ! step (2r): if sqre = 1, apply back the rotation that is
+              ! related to the right null space of the subproblem.
+              if( sqre==1 ) then
+                 call stdlib_zcopy( nrhs, b( m, 1 ), ldb, bx( m, 1 ), ldbx )
+                 call stdlib_zdrot( nrhs, bx( 1, 1 ), ldbx, bx( m, 1 ), ldbx, c, s )
+              end if
+              if( k<max( m, n ) )call stdlib_zlacpy( 'A', n-k, nrhs, b( k+1, 1 ), ldb, bx( k+1, 1 &
+                        ),ldbx )
+              ! step (3r): permute rows of b.
+              call stdlib_zcopy( nrhs, bx( 1, 1 ), ldbx, b( nlp1, 1 ), ldb )
+              if( sqre==1 ) then
+                 call stdlib_zcopy( nrhs, bx( m, 1 ), ldbx, b( m, 1 ), ldb )
+              end if
+              do i = 2, n
+                 call stdlib_zcopy( nrhs, bx( i, 1 ), ldbx, b( perm( i ), 1 ), ldb )
+              end do
+              ! step (4r): apply back the givens rotations performed.
+              do i = givptr, 1, -1
+                 call stdlib_zdrot( nrhs, b( givcol( i, 2 ), 1 ), ldb,b( givcol( i, 1 ), 1 ), ldb,&
+                            givnum( i, 2 ),-givnum( i, 1 ) )
+              end do
+           end if
+           return
+     end subroutine stdlib_zlals0
+
+     !> ZLALSA: is an itermediate step in solving the least squares problem
+     !> by computing the SVD of the coefficient matrix in compact form (The
+     !> singular vectors are computed as products of simple orthorgonal
+     !> matrices.).
+     !> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector
+     !> matrix of an upper bidiagonal matrix to the right hand side; and if
+     !> ICOMPQ = 1, ZLALSA applies the right singular vector matrix to the
+     !> right hand side. The singular vector matrices were generated in
+     !> compact form by ZLALSA.
+
+     pure subroutine stdlib_zlalsa( icompq, smlsiz, n, nrhs, b, ldb, bx, ldbx, u,ldu, vt, k, difl,&
+                difr, z, poles, givptr,givcol, ldgcol, perm, givnum, c, s, rwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: icompq, ldb, ldbx, ldgcol, ldu, n, nrhs, smlsiz
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(in) :: givcol(ldgcol,*), givptr(*), k(*), perm(ldgcol,*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(in) :: c(*), difl(ldu,*), difr(ldu,*), givnum(ldu,*), poles(ldu,*), s(&
+                     *), u(ldu,*), vt(ldu,*), z(ldu,*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(out) :: bx(ldbx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i1, ic, im1, inode, j, jcol, jimag, jreal, jrow, lf, ll, lvl, lvl2, &
+                     nd, ndb1, ndiml, ndimr, nl, nlf, nlp1, nlvl, nr, nrf, nrp1, sqre
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,aimag
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( icompq<0 ) .or. ( icompq>1 ) ) then
+              info = -1
+           else if( smlsiz<3 ) then
+              info = -2
+           else if( n<smlsiz ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ldb<n ) then
+              info = -6
+           else if( ldbx<n ) then
+              info = -8
+           else if( ldu<n ) then
+              info = -10
+           else if( ldgcol<n ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALSA', -info )
+              return
+           end if
+           ! book-keeping and  setting up the computation tree.
+           inode = 1
+           ndiml = inode + n
+           ndimr = ndiml + n
+           call stdlib_dlasdt( n, nlvl, nd, iwork( inode ), iwork( ndiml ),iwork( ndimr ), smlsiz &
+                     )
+           ! the following code applies back the left singular vector factors.
+           ! for applying back the right singular vector factors, go to 170.
+           if( icompq==1 ) then
+              go to 170
+           end if
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_dlasdq. the corresponding left and right singular vector
+           ! matrices are in explicit form. first apply back the left
+           ! singular vector matrices.
+           ndb1 = ( nd+1 ) / 2
+           loop_130: do i = ndb1, nd
+              ! ic : center row of each node
+              ! nl : number of rows of left  subproblem
+              ! nr : number of rows of right subproblem
+              ! nlf: starting row of the left   subproblem
+              ! nrf: starting row of the right  subproblem
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_dgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_dgemm( 't', 'n', nl, nrhs, nl, one, u( nlf, 1 ), ldu,
+           ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1 ), nl )
+              j = nl*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nl, nrhs, nl, one, u( nlf, 1 ), ldu,rwork( 1+nl*nrhs*2 &
+                        ), nl, zero, rwork( 1+nl*nrhs ),nl )
+              jreal = 0
+              jimag = nl*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nl - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_dgemm
+              ! is performed in two steps (real and imaginary parts).
+              ! call stdlib_dgemm( 't', 'n', nr, nrhs, nr, one, u( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1 ), nr )
+              j = nr*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nr, nrhs, nr, one, u( nrf, 1 ), ldu,rwork( 1+nr*nrhs*2 &
+                        ), nr, zero, rwork( 1+nr*nrhs ),nr )
+              jreal = 0
+              jimag = nr*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nr - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+           end do loop_130
+           ! next copy the rows of b that correspond to unchanged rows
+           ! in the bidiagonal matrix to bx.
+           do i = 1, nd
+              ic = iwork( inode+i-1 )
+              call stdlib_zcopy( nrhs, b( ic, 1 ), ldb, bx( ic, 1 ), ldbx )
+           end do
+           ! finally go through the left singular vector matrices of all
+           ! the other subproblems bottom-up on the tree.
+           j = 2**nlvl
+           sqre = 0
+           do lvl = nlvl, 1, -1
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = lf, ll
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 j = j - 1
+                 call stdlib_zlals0( icompq, nl, nr, sqre, nrhs, bx( nlf, 1 ), ldbx,b( nlf, 1 ), &
+                 ldb, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, lvl2 &
+                 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl ), k( &
+                           j ), c( j ), s( j ), rwork,info )
+              end do
+           end do
+           go to 330
+           ! icompq = 1: applying back the right singular vector factors.
+           170 continue
+           ! first now go through the right singular vector matrices of all
+           ! the tree nodes top-down.
+           j = 0
+           loop_190: do lvl = 1, nlvl
+              lvl2 = 2*lvl - 1
+              ! find the first node lf and last node ll on
+              ! the current level lvl.
+              if( lvl==1 ) then
+                 lf = 1
+                 ll = 1
+              else
+                 lf = 2**( lvl-1 )
+                 ll = 2*lf - 1
+              end if
+              do i = ll, lf, -1
+                 im1 = i - 1
+                 ic = iwork( inode+im1 )
+                 nl = iwork( ndiml+im1 )
+                 nr = iwork( ndimr+im1 )
+                 nlf = ic - nl
+                 nrf = ic + 1
+                 if( i==ll ) then
+                    sqre = 0
+                 else
+                    sqre = 1
+                 end if
+                 j = j + 1
+                 call stdlib_zlals0( icompq, nl, nr, sqre, nrhs, b( nlf, 1 ), ldb,bx( nlf, 1 ), &
+                 ldbx, perm( nlf, lvl ),givptr( j ), givcol( nlf, lvl2 ), ldgcol,givnum( nlf, &
+                 lvl2 ), ldu, poles( nlf, lvl2 ),difl( nlf, lvl ), difr( nlf, lvl2 ),z( nlf, lvl )&
+                           , k( j ), c( j ), s( j ), rwork,info )
+              end do
+           end do loop_190
+           ! the nodes on the bottom level of the tree were solved
+           ! by stdlib_dlasdq. the corresponding right singular vector
+           ! matrices are in explicit form. apply them back.
+           ndb1 = ( nd+1 ) / 2
+           loop_320: do i = ndb1, nd
+              i1 = i - 1
+              ic = iwork( inode+i1 )
+              nl = iwork( ndiml+i1 )
+              nr = iwork( ndimr+i1 )
+              nlp1 = nl + 1
+              if( i==nd ) then
+                 nrp1 = nr
+              else
+                 nrp1 = nr + 1
+              end if
+              nlf = ic - nl
+              nrf = ic + 1
+              ! since b and bx are complex, the following call to stdlib_dgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_dgemm( 't', 'n', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,
+          ! $               b( nlf, 1 ), ldb, zero, bx( nlf, 1 ), ldbx )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero, rwork( 1 ),nlp1 )
+              j = nlp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nlp1, nrhs, nlp1, one, vt( nlf, 1 ), ldu,rwork( 1+&
+                        nlp1*nrhs*2 ), nlp1, zero,rwork( 1+nlp1*nrhs ), nlp1 )
+              jreal = 0
+              jimag = nlp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nlf, nlf + nlp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+              ! since b and bx are complex, the following call to stdlib_dgemm is
+              ! performed in two steps (real and imaginary parts).
+              ! call stdlib_dgemm( 't', 'n', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,
+          ! $               b( nrf, 1 ), ldb, zero, bx( nrf, 1 ), ldbx )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero, rwork( 1 ),nrp1 )
+              j = nrp1*nrhs*2
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', nrp1, nrhs, nrp1, one, vt( nrf, 1 ), ldu,rwork( 1+&
+                        nrp1*nrhs*2 ), nrp1, zero,rwork( 1+nrp1*nrhs ), nrp1 )
+              jreal = 0
+              jimag = nrp1*nrhs
+              do jcol = 1, nrhs
+                 do jrow = nrf, nrf + nrp1 - 1
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    bx( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+           end do loop_320
+           330 continue
+           return
+     end subroutine stdlib_zlalsa
+
+     !> ZLALSD: uses the singular value decomposition of A to solve the least
+     !> squares problem of finding X to minimize the Euclidean norm of each
+     !> column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+     !> are N-by-NRHS. The solution X overwrites B.
+     !> The singular values of A smaller than RCOND times the largest
+     !> singular value are treated as zero in solving the least squares
+     !> problem; in this case a minimum norm solution is returned.
+     !> The actual singular values are returned in D in ascending order.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_zlalsd( uplo, smlsiz, n, nrhs, d, e, b, ldb, rcond,rank, work, rwork, &
+               iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: ldb, n, nrhs, smlsiz
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: bx, bxst, c, difl, difr, givcol, givnum, givptr, i, icmpq1, icmpq2, &
+           irwb, irwib, irwrb, irwu, irwvt, irwwrk, iwk, j, jcol, jimag, jreal, jrow, k, nlvl, &
+           nm1, nrwork, nsize, nsub, perm, poles, s, sizei, smlszp, sqre, st, st1, u, vt, &
+                     z
+           real(dp) :: cs, eps, orgnrm, rcnd, r, sn, tol
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,log,sign
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -3
+           else if( nrhs<1 ) then
+              info = -4
+           else if( ( ldb<1 ) .or. ( ldb<n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLALSD', -info )
+              return
+           end if
+           eps = stdlib_dlamch( 'EPSILON' )
+           ! set up the tolerance.
+           if( ( rcond<=zero ) .or. ( rcond>=one ) ) then
+              rcnd = eps
+           else
+              rcnd = rcond
+           end if
+           rank = 0
+           ! quick return if possible.
+           if( n==0 ) then
+              return
+           else if( n==1 ) then
+              if( d( 1 )==zero ) then
+                 call stdlib_zlaset( 'A', 1, nrhs, czero, czero, b, ldb )
+              else
+                 rank = 1
+                 call stdlib_zlascl( 'G', 0, 0, d( 1 ), one, 1, nrhs, b, ldb, info )
+                 d( 1 ) = abs( d( 1 ) )
+              end if
+              return
+           end if
+           ! rotate the matrix if it is lower bidiagonal.
+           if( uplo=='L' ) then
+              do i = 1, n - 1
+                 call stdlib_dlartg( d( i ), e( i ), cs, sn, r )
+                 d( i ) = r
+                 e( i ) = sn*d( i+1 )
+                 d( i+1 ) = cs*d( i+1 )
+                 if( nrhs==1 ) then
+                    call stdlib_zdrot( 1, b( i, 1 ), 1, b( i+1, 1 ), 1, cs, sn )
+                 else
+                    rwork( i*2-1 ) = cs
+                    rwork( i*2 ) = sn
+                 end if
+              end do
+              if( nrhs>1 ) then
+                 do i = 1, nrhs
+                    do j = 1, n - 1
+                       cs = rwork( j*2-1 )
+                       sn = rwork( j*2 )
+                       call stdlib_zdrot( 1, b( j, i ), 1, b( j+1, i ), 1, cs, sn )
+                    end do
+                 end do
+              end if
+           end if
+           ! scale.
+           nm1 = n - 1
+           orgnrm = stdlib_dlanst( 'M', n, d, e )
+           if( orgnrm==zero ) then
+              call stdlib_zlaset( 'A', n, nrhs, czero, czero, b, ldb )
+              return
+           end if
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, n, 1, d, n, info )
+           call stdlib_dlascl( 'G', 0, 0, orgnrm, one, nm1, 1, e, nm1, info )
+           ! if n is smaller than the minimum divide size smlsiz, then solve
+           ! the problem with another solver.
+           if( n<=smlsiz ) then
+              irwu = 1
+              irwvt = irwu + n*n
+              irwwrk = irwvt + n*n
+              irwrb = irwwrk
+              irwib = irwrb + n*nrhs
+              irwb = irwib + n*nrhs
+              call stdlib_dlaset( 'A', n, n, zero, one, rwork( irwu ), n )
+              call stdlib_dlaset( 'A', n, n, zero, one, rwork( irwvt ), n )
+              call stdlib_dlasdq( 'U', 0, n, n, n, 0, d, e, rwork( irwvt ), n,rwork( irwu ), n, &
+                        rwork( irwwrk ), 1,rwork( irwwrk ), info )
+              if( info/=0 ) then
+                 return
+              end if
+              ! in the real version, b is passed to stdlib_dlasdq and multiplied
+              ! internally by q**h. here b is complex and that product is
+              ! computed below in two steps (real and imaginary parts).
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, rwork( irwu ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+              tol = rcnd*abs( d( stdlib_idamax( n, d, 1 ) ) )
+              do i = 1, n
+                 if( d( i )<=tol ) then
+                    call stdlib_zlaset( 'A', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 else
+                    call stdlib_zlascl( 'G', 0, 0, d( i ), one, 1, nrhs, b( i, 1 ),ldb, info )
+                              
+                    rank = rank + 1
+                 end if
+              end do
+              ! since b is complex, the following call to stdlib_dgemm is performed
+              ! in two steps (real and imaginary parts). that is for v * b
+              ! (in the real version of the code v**h is stored in work).
+              ! call stdlib_dgemm( 't', 'n', n, nrhs, n, one, work, n, b, ldb, zero,
+          ! $               work( nwork ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwrb ), n )
+              j = irwb - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    j = j + 1
+                    rwork( j ) = aimag( b( jrow, jcol ) )
+                 end do
+              end do
+              call stdlib_dgemm( 'T', 'N', n, nrhs, n, one, rwork( irwvt ), n,rwork( irwb ), n, &
+                        zero, rwork( irwib ), n )
+              jreal = irwrb - 1
+              jimag = irwib - 1
+              do jcol = 1, nrhs
+                 do jrow = 1, n
+                    jreal = jreal + 1
+                    jimag = jimag + 1
+                    b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                 end do
+              end do
+              ! unscale.
+              call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+              call stdlib_dlasrt( 'D', n, d, info )
+              call stdlib_zlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+              return
+           end if
+           ! book-keeping and setting up some constants.
+           nlvl = int( log( real( n,KIND=dp) / real( smlsiz+1,KIND=dp) ) / log( two ),KIND=ilp) + &
+                     1
+           smlszp = smlsiz + 1
+           u = 1
+           vt = 1 + smlsiz*n
+           difl = vt + smlszp*n
+           difr = difl + nlvl*n
+           z = difr + nlvl*n*2
+           c = z + nlvl*n
+           s = c + n
+           poles = s + n
+           givnum = poles + 2*nlvl*n
+           nrwork = givnum + 2*nlvl*n
+           bx = 1
+           irwrb = nrwork
+           irwib = irwrb + smlsiz*nrhs
+           irwb = irwib + smlsiz*nrhs
+           sizei = 1 + n
+           k = sizei + n
+           givptr = k + n
+           perm = givptr + n
+           givcol = perm + nlvl*n
+           iwk = givcol + nlvl*n*2
+           st = 1
+           sqre = 0
+           icmpq1 = 1
+           icmpq2 = 0
+           nsub = 0
+           do i = 1, n
+              if( abs( d( i ) )<eps ) then
+                 d( i ) = sign( eps, d( i ) )
+              end if
+           end do
+           loop_240: do i = 1, nm1
+              if( ( abs( e( i ) )<eps ) .or. ( i==nm1 ) ) then
+                 nsub = nsub + 1
+                 iwork( nsub ) = st
+                 ! subproblem found. first determine its size and then
+                 ! apply divide and conquer on it.
+                 if( i<nm1 ) then
+                    ! a subproblem with e(i) small for i < nm1.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else if( abs( e( i ) )>=eps ) then
+                    ! a subproblem with e(nm1) not too small but i = nm1.
+                    nsize = n - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                 else
+                    ! a subproblem with e(nm1) small. this implies an
+                    ! 1-by-1 subproblem at d(n), which is not solved
+                    ! explicitly.
+                    nsize = i - st + 1
+                    iwork( sizei+nsub-1 ) = nsize
+                    nsub = nsub + 1
+                    iwork( nsub ) = n
+                    iwork( sizei+nsub-1 ) = 1
+                    call stdlib_zcopy( nrhs, b( n, 1 ), ldb, work( bx+nm1 ), n )
+                 end if
+                 st1 = st - 1
+                 if( nsize==1 ) then
+                    ! this is a 1-by-1 subproblem and is not solved
+                    ! explicitly.
+                    call stdlib_zcopy( nrhs, b( st, 1 ), ldb, work( bx+st1 ), n )
+                 else if( nsize<=smlsiz ) then
+                    ! this is a small subproblem and is solved by stdlib_dlasdq.
+                    call stdlib_dlaset( 'A', nsize, nsize, zero, one,rwork( vt+st1 ), n )
+                    call stdlib_dlaset( 'A', nsize, nsize, zero, one,rwork( u+st1 ), n )
+                    call stdlib_dlasdq( 'U', 0, nsize, nsize, nsize, 0, d( st ),e( st ), rwork( &
+                    vt+st1 ), n, rwork( u+st1 ),n, rwork( nrwork ), 1, rwork( nrwork ),info )
+                              
+                    if( info/=0 ) then
+                       return
+                    end if
+                    ! in the real version, b is passed to stdlib_dlasdq and multiplied
+                    ! internally by q**h. here b is complex and that product is
+                    ! computed below in two steps (real and imaginary parts).
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = real( b( jrow, jcol ),KIND=dp)
+                       end do
+                    end do
+                    call stdlib_dgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwrb ), nsize )
+                    j = irwb - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          j = j + 1
+                          rwork( j ) = aimag( b( jrow, jcol ) )
+                       end do
+                    end do
+                    call stdlib_dgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( u+st1 ), n, rwork(&
+                               irwb ), nsize,zero, rwork( irwib ), nsize )
+                    jreal = irwrb - 1
+                    jimag = irwib - 1
+                    do jcol = 1, nrhs
+                       do jrow = st, st + nsize - 1
+                          jreal = jreal + 1
+                          jimag = jimag + 1
+                          b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                       end do
+                    end do
+                    call stdlib_zlacpy( 'A', nsize, nrhs, b( st, 1 ), ldb,work( bx+st1 ), n )
+                              
+                 else
+                    ! a large problem. solve it using divide and conquer.
+                    call stdlib_dlasda( icmpq1, smlsiz, nsize, sqre, d( st ),e( st ), rwork( u+&
+                    st1 ), n, rwork( vt+st1 ),iwork( k+st1 ), rwork( difl+st1 ),rwork( difr+st1 ),&
+                     rwork( z+st1 ),rwork( poles+st1 ), iwork( givptr+st1 ),iwork( givcol+st1 ), &
+                     n, iwork( perm+st1 ),rwork( givnum+st1 ), rwork( c+st1 ),rwork( s+st1 ), &
+                               rwork( nrwork ),iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                    bxst = bx + st1
+                    call stdlib_zlalsa( icmpq2, smlsiz, nsize, nrhs, b( st, 1 ),ldb, work( bxst ),&
+                     n, rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), &
+                     rwork( difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), &
+                     iwork( givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ),&
+                                rwork( s+st1 ),rwork( nrwork ), iwork( iwk ), info )
+                    if( info/=0 ) then
+                       return
+                    end if
+                 end if
+                 st = i + 1
+              end if
+           end do loop_240
+           ! apply the singular values and treat the tiny ones as zero.
+           tol = rcnd*abs( d( stdlib_idamax( n, d, 1 ) ) )
+           do i = 1, n
+              ! some of the elements in d can be negative because 1-by-1
+              ! subproblems were not solved explicitly.
+              if( abs( d( i ) )<=tol ) then
+                 call stdlib_zlaset( 'A', 1, nrhs, czero, czero, work( bx+i-1 ), n )
+              else
+                 rank = rank + 1
+                 call stdlib_zlascl( 'G', 0, 0, d( i ), one, 1, nrhs,work( bx+i-1 ), n, info )
+                           
+              end if
+              d( i ) = abs( d( i ) )
+           end do
+           ! now apply back the right singular vectors.
+           icmpq2 = 1
+           loop_320: do i = 1, nsub
+              st = iwork( i )
+              st1 = st - 1
+              nsize = iwork( sizei+i-1 )
+              bxst = bx + st1
+              if( nsize==1 ) then
+                 call stdlib_zcopy( nrhs, work( bxst ), n, b( st, 1 ), ldb )
+              else if( nsize<=smlsiz ) then
+                 ! since b and bx are complex, the following call to stdlib_dgemm
+                 ! is performed in two steps (real and imaginary parts).
+                 ! call stdlib_dgemm( 't', 'n', nsize, nrhs, nsize, one,
+          ! $                  rwork( vt+st1 ), n, rwork( bxst ), n, zero,
+          ! $                  b( st, 1 ), ldb )
+                 j = bxst - n - 1
+                 jreal = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jreal = jreal + 1
+                       rwork( jreal ) = real( work( j+jrow ),KIND=dp)
+                    end do
+                 end do
+                 call stdlib_dgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwrb ), nsize )
+                 j = bxst - n - 1
+                 jimag = irwb - 1
+                 do jcol = 1, nrhs
+                    j = j + n
+                    do jrow = 1, nsize
+                       jimag = jimag + 1
+                       rwork( jimag ) = aimag( work( j+jrow ) )
+                    end do
+                 end do
+                 call stdlib_dgemm( 'T', 'N', nsize, nrhs, nsize, one,rwork( vt+st1 ), n, rwork( &
+                           irwb ), nsize, zero,rwork( irwib ), nsize )
+                 jreal = irwrb - 1
+                 jimag = irwib - 1
+                 do jcol = 1, nrhs
+                    do jrow = st, st + nsize - 1
+                       jreal = jreal + 1
+                       jimag = jimag + 1
+                       b( jrow, jcol ) = cmplx( rwork( jreal ),rwork( jimag ),KIND=dp)
+                    end do
+                 end do
+              else
+                 call stdlib_zlalsa( icmpq2, smlsiz, nsize, nrhs, work( bxst ), n,b( st, 1 ), ldb,&
+                  rwork( u+st1 ), n,rwork( vt+st1 ), iwork( k+st1 ),rwork( difl+st1 ), rwork( &
+                  difr+st1 ),rwork( z+st1 ), rwork( poles+st1 ),iwork( givptr+st1 ), iwork( &
+                  givcol+st1 ), n,iwork( perm+st1 ), rwork( givnum+st1 ),rwork( c+st1 ), rwork( s+&
+                            st1 ),rwork( nrwork ), iwork( iwk ), info )
+                 if( info/=0 ) then
+                    return
+                 end if
+              end if
+           end do loop_320
+           ! unscale and sort the singular values.
+           call stdlib_dlascl( 'G', 0, 0, one, orgnrm, n, 1, d, n, info )
+           call stdlib_dlasrt( 'D', n, d, info )
+           call stdlib_zlascl( 'G', 0, 0, orgnrm, one, n, nrhs, b, ldb, info )
+           return
+     end subroutine stdlib_zlalsd
+
+     !> ZLANGB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n band matrix  A,  with kl sub-diagonals and ku super-diagonals.
+
+     real(dp) function stdlib_zlangb( norm, n, kl, ku, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: kl, ku, ldab, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k, l
+           real(dp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    temp = abs( ab( i, j ) )
+                    if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                    sum = sum + abs( ab( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 k = ku + 1 - j
+                 do i = max( 1, j-ku ), min( n, j+kl )
+                    work( i ) = work( i ) + abs( ab( k+i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 temp = work( i )
+                 if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 l = max( 1, j-ku )
+                 k = ku + 1 - j + l
+                 call stdlib_zlassq( min( n, j+kl )-l+1, ab( k, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlangb = value
+           return
+     end function stdlib_zlangb
+
+     !> ZLANGE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex matrix A.
+
+     real(dp) function stdlib_zlange( norm, m, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, m
+                    temp = abs( a( i, j ) )
+                    if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, m
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value<sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, m
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, m
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, m
+                 temp = work( i )
+                 if( value<temp .or. stdlib_disnan( temp ) ) value = temp
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_zlassq( m, a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlange = value
+           return
+     end function stdlib_zlange
+
+     !> ZLANGT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex tridiagonal matrix A.
+
+     pure real(dp) function stdlib_zlangt( norm, n, dl, d, du )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(in) :: d(*), dl(*), du(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: anorm, scale, sum, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 if( anorm<abs( dl( i ) ) .or. stdlib_disnan( abs( dl( i ) ) ) )anorm = abs(dl(i))
+                           
+                 if( anorm<abs( d( i ) ) .or. stdlib_disnan( abs( d( i ) ) ) )anorm = abs(d(i))
+                           
+                 if( anorm<abs( du( i ) ) .or. stdlib_disnan (abs( du( i ) ) ) )anorm = abs(du(i))
+                           
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' ) then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( dl( 1 ) )
+                 temp = abs( d( n ) )+abs( du( n-1 ) )
+                 if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( dl( i ) )+abs( du( i-1 ) )
+                    if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( du( 1 ) )
+                 temp = abs( d( n ) )+abs( dl( n-1 ) )
+                 if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 do i = 2, n - 1
+                    temp = abs( d( i ) )+abs( du( i ) )+abs( dl( i-1 ) )
+                    if( anorm < temp .or. stdlib_disnan( temp ) ) anorm = temp
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              call stdlib_zlassq( n, d, 1, scale, sum )
+              if( n>1 ) then
+                 call stdlib_zlassq( n-1, dl, 1, scale, sum )
+                 call stdlib_zlassq( n-1, du, 1, scale, sum )
+              end if
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_zlangt = anorm
+           return
+     end function stdlib_zlangt
+
+     !> ZLANHB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n hermitian band matrix A,  with k super-diagonals.
+
+     real(dp) function stdlib_zlanhb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( ab( k+1, j ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( ab( 1, j ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    do i = 2, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( ab( k+1, j ),KIND=dp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ab( 1, j ),KIND=dp) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_zlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_zlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              do j = 1, n
+                 if( real( ab( l, j ),KIND=dp)/=zero ) then
+                    absa = abs( real( ab( l, j ),KIND=dp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlanhb = value
+           return
+     end function stdlib_zlanhb
+
+     !> ZLANHE:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A.
+
+     real(dp) function stdlib_zlanhe( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j - 1
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    sum = abs( real( a( j, j ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    sum = abs( real( a( j, j ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    do i = j + 1, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( real( a( j, j ),KIND=dp) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( a( j, j ),KIND=dp) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_zlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_zlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              do i = 1, n
+                 if( real( a( i, i ),KIND=dp)/=zero ) then
+                    absa = abs( real( a( i, i ),KIND=dp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlanhe = value
+           return
+     end function stdlib_zlanhe
+
+     !> ZLANHF:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian matrix A in RFP format.
+
+     real(dp) function stdlib_zlanhf( norm, transr, uplo, n, a, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, transr, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(0:*)
+           complex(dp), intent(in) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, ifm, ilu, noe, n1, k, l, lda
+           real(dp) :: scale, s, value, aa, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              stdlib_zlanhf = zero
+              return
+           else if( n==1 ) then
+              stdlib_zlanhf = abs(real(a(0),KIND=dp))
+              return
+           end if
+           ! set noe = 1 if n is odd. if n is even set noe=0
+           noe = 1
+           if( mod( n, 2 )==0 )noe = 0
+           ! set ifm = 0 when form='c' or 'c' and 1 otherwise
+           ifm = 1
+           if( stdlib_lsame( transr, 'C' ) )ifm = 0
+           ! set ilu = 0 when uplo='u or 'u' and 1 otherwise
+           ilu = 1
+           if( stdlib_lsame( uplo, 'U' ) )ilu = 0
+           ! set lda = (n+1)/2 when ifm = 0
+           ! set lda = n when ifm = 1 and noe = 1
+           ! set lda = n+1 when ifm = 1 and noe = 0
+           if( ifm==1 ) then
+              if( noe==1 ) then
+                 lda = n
+              else
+                 ! noe=0
+                 lda = n + 1
+              end if
+           else
+              ! ifm=0
+              lda = ( n+1 ) / 2
+           end if
+           if( stdlib_lsame( norm, 'M' ) ) then
+             ! find max(abs(a(i,j))).
+              k = ( n+1 ) / 2
+              value = zero
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is n by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do i = 1, n - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = k + j - 1
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = k + j + 1, n - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       do j = 0, k - 2
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j + 1
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do j = k, n - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k - 1
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       do j = k, n - 1
+                          do i = 0, j - k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j - k
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j - k + 1
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j - k + 2, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is n+1 by k
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k)
+                       temp = abs( real( a( j+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       temp = abs( real( a( j+1+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do i = 2, n
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j
+                          ! l(k+j,k+j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j + 1
+                          ! -> l(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 2
+                          do i = 0, k + j - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = k + j
+                          ! -> u(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = i + 1
+                          ! =k+j+1; i -> u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = k + j + 2, n
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       do i = 0, n - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          ! j=k-1
+                       end do
+                       ! i=n-1 -> u(n-1,n-1)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       i = n
+                       ! -> u(k-1,k-1)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                    end if
+                 else
+                    ! xpose case; a is k by n+1
+                    if( ilu==1 ) then
+                       ! uplo ='l'
+                       j = 0
+                       ! -> l(k,k) at a(0,0)
+                       temp = abs( real( a( j+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       do j = 1, k - 1
+                          do i = 0, j - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j - 1
+                          ! l(i,i)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j
+                          ! l(j+k,j+k)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! -> l(i,i) is at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do j = k + 1, n
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                    else
+                       ! uplo = 'u'
+                       do j = 0, k - 1
+                          do i = 0, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = k
+                       ! -> u(j,j) is at a(0,j)
+                       temp = abs( real( a( 0+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       do i = 1, k - 1
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       do j = k + 1, n - 1
+                          do i = 0, j - k - 2
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                          i = j - k - 1
+                          ! -> u(i,i) at a(i,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          i = j - k
+                          ! u(j,j)
+                          temp = abs( real( a( i+j*lda ),KIND=dp) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          do i = j - k + 1, k - 1
+                             temp = abs( a( i+j*lda ) )
+                             if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                          end do
+                       end do
+                       j = n
+                       do i = 0, k - 2
+                          temp = abs( a( i+j*lda ) )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                       i = k - 1
+                       ! u(k,k) at a(i,j)
+                       temp = abs( real( a( i+j*lda ),KIND=dp) )
+                       if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+             ! find normi(a) ( = norm1(a), since a is hermitian).
+              if( ifm==1 ) then
+                 ! a is 'n'
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          if( i==k+k )go to 10
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       10 continue
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          if( j>0 ) then
+                             aa = abs( real( a( i+j*lda ),KIND=dp) )
+                             ! -> a(j+k,j+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + s
+                             ! i=j
+                             i = i + 1
+                          end if
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = 0, k - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k + j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(i,j+k)
+                             s = s + aa
+                             work( i ) = work( i ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j+k,j+k)
+                          work( j+k ) = s + aa
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j,j)
+                          work( j ) = work( j ) + aa
+                          s = zero
+                          do l = j + 1, k - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu = 1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = k - 1, 0, -1
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(j+k,i+k)
+                             s = s + aa
+                             work( i+k ) = work( i+k ) + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j+k,j+k)
+                          s = s + aa
+                          work( i+k ) = work( i+k ) + s
+                          ! i=j
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! -> a(j,j)
+                          work( j ) = aa
+                          s = zero
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! -> a(l,j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              else
+                 ! ifm=0
+                 k = n / 2
+                 if( noe==1 ) then
+                    ! n is odd
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       n1 = k
+                       ! n/2
+                       k = k + 1
+                       ! k is the row size and lda
+                       do i = n1, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, n1 - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,n1+i)
+                             work( i+n1 ) = work( i+n1 ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=n1=k-1 is special
+                       s = abs( real( a( 0+j*lda ),KIND=dp) )
+                       ! a(k-1,k-1)
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k-1,i+n1)
+                          work( i+n1 ) = work( i+n1 ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k, n - 1
+                          s = zero
+                          do i = 0, j - k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-k
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! a(j-k,j-k)
+                          s = s + aa
+                          work( j-k ) = work( j-k ) + s
+                          i = i + 1
+                          s = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! a(j,j)
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       k = k + 1
+                       ! k=(n+1)/2 for n odd and ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 2
+                          ! process
+                          s = zero
+                          do i = 0, j - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! i=j so process of a(j,j)
+                          s = s + aa
+                          work( j ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k-1 is special :process col a(k-1,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=dp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k, n - 1
+                          ! process col j of a = a(j,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 else
+                    ! n is even
+                    if( ilu==0 ) then
+                       ! uplo = 'u'
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       do j = 0, k - 1
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,i+k)
+                             work( i+k ) = work( i+k ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = s
+                       end do
+                       ! j=k
+                       aa = abs( real( a( 0+j*lda ),KIND=dp) )
+                       ! a(k,k)
+                       s = aa
+                       do i = 1, k - 1
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,k+i)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( j ) = work( j ) + s
+                       do j = k + 1, n - 1
+                          s = zero
+                          do i = 0, j - 2 - k
+                             aa = abs( a( i+j*lda ) )
+                             ! a(i,j-k-1)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          ! i=j-1-k
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! a(j-k-1,j-k-1)
+                          s = s + aa
+                          work( j-k-1 ) = work( j-k-1 ) + s
+                          i = i + 1
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! a(j,j)
+                          s = aa
+                          do l = j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j,l)
+                             work( l ) = work( l ) + aa
+                             s = s + aa
+                          end do
+                          work( j ) = work( j ) + s
+                       end do
+                       ! j=n
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(i,k-1)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=dp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = work( i ) + s
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    else
+                       ! ilu=1
+                       do i = k, n - 1
+                          work( i ) = zero
+                       end do
+                       ! j=0 is special :process col a(k:n-1,k)
+                       s = abs( real( a( 0 ),KIND=dp) )
+                       ! a(k,k)
+                       do i = 1, k - 1
+                          aa = abs( a( i ) )
+                          ! a(k+i,k)
+                          work( i+k ) = work( i+k ) + aa
+                          s = s + aa
+                       end do
+                       work( k ) = work( k ) + s
+                       do j = 1, k - 1
+                          ! process
+                          s = zero
+                          do i = 0, j - 2
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          ! i=j-1 so process of a(j-1,j-1)
+                          s = s + aa
+                          work( j-1 ) = s
+                          ! is initialised here
+                          i = i + 1
+                          ! i=j process a(j+k,j+k)
+                          aa = abs( real( a( i+j*lda ),KIND=dp) )
+                          s = aa
+                          do l = k + j + 1, n - 1
+                             i = i + 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(l,k+j)
+                             s = s + aa
+                             work( l ) = work( l ) + aa
+                          end do
+                          work( k+j ) = work( k+j ) + s
+                       end do
+                       ! j=k is special :process col a(k,0:k-1)
+                       s = zero
+                       do i = 0, k - 2
+                          aa = abs( a( i+j*lda ) )
+                          ! a(k,i)
+                          work( i ) = work( i ) + aa
+                          s = s + aa
+                       end do
+                       ! i=k-1
+                       aa = abs( real( a( i+j*lda ),KIND=dp) )
+                       ! a(k-1,k-1)
+                       s = s + aa
+                       work( i ) = s
+                       ! done with col j=k+1
+                       do j = k + 1, n
+                          ! process col j-1 of a = a(j-1,0:k-1)
+                          s = zero
+                          do i = 0, k - 1
+                             aa = abs( a( i+j*lda ) )
+                             ! a(j-1,i)
+                             work( i ) = work( i ) + aa
+                             s = s + aa
+                          end do
+                          work( j-1 ) = work( j-1 ) + s
+                       end do
+                       value = work( 0 )
+                       do i = 1, n-1
+                          temp = work( i )
+                          if( value < temp .or. stdlib_disnan( temp ) )value = temp
+                       end do
+                    end if
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+             ! find normf(a).
+              k = ( n+1 ) / 2
+              scale = zero
+              s = one
+              if( noe==1 ) then
+                 ! n is odd
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 3
+                          call stdlib_zlassq( k-j-2, a( k+j+1+j*lda ), 1, scale, s )
+                          ! l at a(k,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_zlassq( k+j-1, a( 0+j*lda ), 1, scale, s )
+                          ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k - 1
+                       ! -> u(k,k) at a(k-1,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=dp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       aa = real( a( l ),KIND=dp)
+                       ! u(n-1,n-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_zlassq( n-j-1, a( j+1+j*lda ), 1, scale, s )
+                          ! trap l at a(0,0)
+                       end do
+                       do j = 1, k - 2
+                          call stdlib_zlassq( j, a( 0+( 1+j )*lda ), 1, scale, s )
+                          ! u at a(0,1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       aa = real( a( 0 ),KIND=dp)
+                       ! l(0,0) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(k,k) at a(0,1)
+                       do i = 1, k - 1
+                          aa = real( a( l ),KIND=dp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 2
+                          call stdlib_zlassq( j, a( 0+( k+j )*lda ), 1, scale, s )
+                          ! u at a(0,k)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_zlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_zlassq( k-j-1, a( j+1+( j+k-1 )*lda ), 1,scale, s )
+                          ! l at a(0,k-1)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda - lda
+                       ! -> u(k-1,k-1) at a(0,k-1)
+                       aa = real( a( l ),KIND=dp)
+                       ! u(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k)
+                       do j = k, n - 1
+                          aa = real( a( l ),KIND=dp)
+                          ! -> u(j-k,j-k)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_zlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       do j = k, n - 1
+                          call stdlib_zlassq( k, a( 0+j*lda ), 1, scale, s )
+                          ! k by k-1 rect. at a(0,k)
+                       end do
+                       do j = 0, k - 3
+                          call stdlib_zlassq( k-j-2, a( j+2+j*lda ), 1, scale, s )
+                          ! l at a(1,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(0,0) at a(0,0)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=dp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! l(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k-1 + (k-1)*lda or l(k-1,k-1) at a(k-1,k-1)
+                       aa = real( a( l ),KIND=dp)
+                       ! l(k-1,k-1) at a(k-1,k-1)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              else
+                 ! n is even
+                 if( ifm==1 ) then
+                    ! a is normal
+                    if( ilu==0 ) then
+                       ! a is upper
+                       do j = 0, k - 2
+                          call stdlib_zlassq( k-j-1, a( k+j+2+j*lda ), 1, scale, s )
+                       ! l at a(k+1,0)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_zlassq( k+j, a( 0+j*lda ), 1, scale, s )
+                       ! trap u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = k
+                       ! -> u(k,k) at a(k,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=dp)
+                          ! u(k+i,k+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! u(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    else
+                       ! ilu=1
+                       do j = 0, k - 1
+                          call stdlib_zlassq( n-j-1, a( j+2+j*lda ), 1, scale, s )
+                          ! trap l at a(1,0)
+                       end do
+                       do j = 1, k - 1
+                          call stdlib_zlassq( j, a( 0+j*lda ), 1, scale, s )
+                          ! u at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       do i = 0, k - 1
+                          aa = real( a( l ),KIND=dp)
+                          ! l(k-1+i,k-1+i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                    end if
+                 else
+                    ! a is xpose
+                    if( ilu==0 ) then
+                       ! a**h is upper
+                       do j = 1, k - 1
+                          call stdlib_zlassq( j, a( 0+( k+1+j )*lda ), 1, scale, s )
+                       ! u at a(0,k+1)
+                       end do
+                       do j = 0, k - 1
+                          call stdlib_zlassq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,0)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_zlassq( k-j-1, a( j+1+( j+k )*lda ), 1, scale,s )
+                       ! l at a(0,k)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0 + k*lda
+                       ! -> u(k,k) at a(0,k)
+                       aa = real( a( l ),KIND=dp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = l + lda
+                       ! -> u(0,0) at a(0,k+1)
+                       do j = k + 1, n - 1
+                          aa = real( a( l ),KIND=dp)
+                          ! -> u(j-k-1,j-k-1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! -> u(j,j)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l=k-1+n*lda
+                       ! -> u(k-1,k-1) at a(k-1,n)
+                       aa = real( a( l ),KIND=dp)
+                       ! u(k,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    else
+                       ! a**h is lower
+                       do j = 1, k - 1
+                          call stdlib_zlassq( j, a( 0+( j+1 )*lda ), 1, scale, s )
+                       ! u at a(0,1)
+                       end do
+                       do j = k + 1, n
+                          call stdlib_zlassq( k, a( 0+j*lda ), 1, scale, s )
+                       ! k by k rect. at a(0,k+1)
+                       end do
+                       do j = 0, k - 2
+                          call stdlib_zlassq( k-j-1, a( j+1+j*lda ), 1, scale, s )
+                       ! l at a(0,0)
+                       end do
+                       s = s + s
+                       ! double s for the off diagonal elements
+                       l = 0
+                       ! -> l(k,k) at a(0,0)
+                       aa = real( a( l ),KIND=dp)
+                       ! l(k,k) at a(0,0)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                       l = lda
+                       ! -> l(0,0) at a(0,1)
+                       do i = 0, k - 2
+                          aa = real( a( l ),KIND=dp)
+                          ! l(i,i)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          aa = real( a( l+1 ),KIND=dp)
+                          ! l(k+i+1,k+i+1)
+                          if( aa/=zero ) then
+                             if( scale<aa ) then
+                                s = one + s*( scale / aa )**2
+                                scale = aa
+                             else
+                                s = s + ( aa / scale )**2
+                             end if
+                          end if
+                          l = l + lda + 1
+                       end do
+                       ! l-> k - 1 + k*lda or l(k-1,k-1) at a(k-1,k)
+                       aa = real( a( l ),KIND=dp)
+                       ! l(k-1,k-1) at a(k-1,k)
+                       if( aa/=zero ) then
+                          if( scale<aa ) then
+                             s = one + s*( scale / aa )**2
+                             scale = aa
+                          else
+                             s = s + ( aa / scale )**2
+                          end if
+                       end if
+                    end if
+                 end if
+              end if
+              value = scale*sqrt( s )
+           end if
+           stdlib_zlanhf = value
+           return
+     end function stdlib_zlanhf
+
+     !> ZLANHP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex hermitian matrix A,  supplied in packed form.
+
+     real(dp) function stdlib_zlanhp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 0
+                 do j = 1, n
+                    do i = k + 1, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                    sum = abs( real( ap( k ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    sum = abs( real( ap( k ),KIND=dp) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    do i = k + 1, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is hermitian).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( real( ap( k ),KIND=dp) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( real( ap( k ),KIND=dp) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_zlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_zlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=dp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=dp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlanhp = value
+           return
+     end function stdlib_zlanhp
+
+     !> ZLANHS:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> Hessenberg matrix A.
+
+     real(dp) function stdlib_zlanhs( norm, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    sum = abs( a( i, j ) )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end do
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              do j = 1, n
+                 sum = zero
+                 do i = 1, min( n, j+1 )
+                    sum = sum + abs( a( i, j ) )
+                 end do
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              do i = 1, n
+                 work( i ) = zero
+              end do
+              do j = 1, n
+                 do i = 1, min( n, j+1 )
+                    work( i ) = work( i ) + abs( a( i, j ) )
+                 end do
+              end do
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              do j = 1, n
+                 call stdlib_zlassq( min( n, j+1 ), a( 1, j ), 1, scale, sum )
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlanhs = value
+           return
+     end function stdlib_zlanhs
+
+     !> ZLANHT:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex Hermitian tridiagonal matrix A.
+
+     pure real(dp) function stdlib_zlanht( norm, n, d, e )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           complex(dp), intent(in) :: e(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i
+           real(dp) :: anorm, scale, sum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,sqrt
+           ! Executable Statements 
+           if( n<=0 ) then
+              anorm = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              anorm = abs( d( n ) )
+              do i = 1, n - 1
+                 sum =  abs( d( i ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 sum = abs( e( i ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+              end do
+           else if( stdlib_lsame( norm, 'O' ) .or. norm=='1' .or.stdlib_lsame( norm, 'I' ) ) &
+                     then
+              ! find norm1(a).
+              if( n==1 ) then
+                 anorm = abs( d( 1 ) )
+              else
+                 anorm = abs( d( 1 ) )+abs( e( 1 ) )
+                 sum = abs( e( n-1 ) )+abs( d( n ) )
+                 if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 do i = 2, n - 1
+                    sum = abs( d( i ) )+abs( e( i ) )+abs( e( i-1 ) )
+                    if( anorm < sum .or. stdlib_disnan( sum ) ) anorm = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( n>1 ) then
+                 call stdlib_zlassq( n-1, e, 1, scale, sum )
+                 sum = 2*sum
+              end if
+              call stdlib_dlassq( n, d, 1, scale, sum )
+              anorm = scale*sqrt( sum )
+           end if
+           stdlib_zlanht = anorm
+           return
+     end function stdlib_zlanht
+
+     !> ZLANSB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n symmetric band matrix A,  with k super-diagonals.
+
+     real(dp) function stdlib_zlansb( norm, uplo, n, k, ab, ldab,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, l
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = max( k+2-j, 1 ), k + 1
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = 1, min( n+1-j, k+1 )
+                       sum = abs( ab( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    l = k + 1 - j
+                    do i = max( 1, j-k ), j - 1
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( ab( k+1, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ab( 1, j ) )
+                    l = 1 - j
+                    do i = j + 1, min( n, j+k )
+                       absa = abs( ab( l+i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( k>0 ) then
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 2, n
+                       call stdlib_zlassq( min( j-1, k ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                    l = k + 1
+                 else
+                    do j = 1, n - 1
+                       call stdlib_zlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                    end do
+                    l = 1
+                 end if
+                 sum = 2*sum
+              else
+                 l = 1
+              end if
+              call stdlib_zlassq( n, ab( l, 1 ), ldab, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlansb = value
+           return
+     end function stdlib_zlansb
+
+     !> ZLANSP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A,  supplied in packed form.
+
+     real(dp) function stdlib_zlansp( norm, uplo, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j, k
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + j - 1
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + j
+                 end do
+              else
+                 k = 1
+                 do j = 1, n
+                    do i = k, k + n - j
+                       sum = abs( ap( i ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                    k = k + n - j + 1
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    work( j ) = sum + abs( ap( k ) )
+                    k = k + 1
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( ap( k ) )
+                    k = k + 1
+                    do i = j + 1, n
+                       absa = abs( ap( k ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                       k = k + 1
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              k = 2
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_zlassq( j-1, ap( k ), 1, scale, sum )
+                    k = k + j
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_zlassq( n-j, ap( k ), 1, scale, sum )
+                    k = k + n - j + 1
+                 end do
+              end if
+              sum = 2*sum
+              k = 1
+              do i = 1, n
+                 if( real( ap( k ),KIND=dp)/=zero ) then
+                    absa = abs( real( ap( k ),KIND=dp) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( aimag( ap( k ) )/=zero ) then
+                    absa = abs( aimag( ap( k ) ) )
+                    if( scale<absa ) then
+                       sum = one + sum*( scale / absa )**2
+                       scale = absa
+                    else
+                       sum = sum + ( absa / scale )**2
+                    end if
+                 end if
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    k = k + i + 1
+                 else
+                    k = k + n - i + 1
+                 end if
+              end do
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlansp = value
+           return
+     end function stdlib_zlansp
+
+     !> ZLANSY:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> complex symmetric matrix A.
+
+     real(dp) function stdlib_zlansy( norm, uplo, n, a, lda, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm, uplo
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, j
+           real(dp) :: absa, scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    do i = 1, j
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              else
+                 do j = 1, n
+                    do i = j, n
+                       sum = abs( a( i, j ) )
+                       if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                    end do
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'I' ) ) .or. ( stdlib_lsame( norm, 'O' ) ) .or.( &
+                     norm=='1' ) ) then
+              ! find normi(a) ( = norm1(a), since a is symmetric).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    sum = zero
+                    do i = 1, j - 1
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    work( j ) = sum + abs( a( j, j ) )
+                 end do
+                 do i = 1, n
+                    sum = work( i )
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do i = 1, n
+                    work( i ) = zero
+                 end do
+                 do j = 1, n
+                    sum = work( j ) + abs( a( j, j ) )
+                    do i = j + 1, n
+                       absa = abs( a( i, j ) )
+                       sum = sum + absa
+                       work( i ) = work( i ) + absa
+                    end do
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              scale = zero
+              sum = one
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 2, n
+                    call stdlib_zlassq( j-1, a( 1, j ), 1, scale, sum )
+                 end do
+              else
+                 do j = 1, n - 1
+                    call stdlib_zlassq( n-j, a( j+1, j ), 1, scale, sum )
+                 end do
+              end if
+              sum = 2*sum
+              call stdlib_zlassq( n, a, lda+1, scale, sum )
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlansy = value
+           return
+     end function stdlib_zlansy
+
+     !> ZLANTB:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the element of  largest absolute value  of an
+     !> n by n triangular band matrix A,  with ( k + 1 ) diagonals.
+
+     real(dp) function stdlib_zlantb( norm, uplo, diag, n, k, ab,ldab, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: k, ldab, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, l
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = abs( ab( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = max( k+2-j, 1 ), k
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = max( k+2-j, 1 ), k + 1
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = 2, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( n+1-j, k+1 )
+                          sum = sum + abs( ab( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              value = zero
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j - 1
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = k + 1 - j
+                       do i = max( 1, j-k ), j
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j + 1, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       l = 1 - j
+                       do i = j, min( n, j+k )
+                          work( i ) = work( i ) + abs( ab( l+i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 2, n
+                          call stdlib_zlassq( min( j-1, k ),ab( max( k+2-j, 1 ), j ), 1, scale,&
+                                    sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_zlassq( min( j, k+1 ), ab( max( k+2-j, 1 ), j ),1, scale, sum )
+                                 
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    if( k>0 ) then
+                       do j = 1, n - 1
+                          call stdlib_zlassq( min( n-j, k ), ab( 2, j ), 1, scale,sum )
+                       end do
+                    end if
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_zlassq( min( n-j+1, k+1 ), ab( 1, j ), 1, scale,sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlantb = value
+           return
+     end function stdlib_zlantb
+
+     !> ZLANTP:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> triangular matrix A, supplied in packed form.
+
+     real(dp) function stdlib_zlantp( norm, uplo, diag, n, ap, work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: ap(*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j, k
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,sqrt
+           ! Executable Statements 
+           if( n==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              k = 1
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 2
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k + 1, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = k, k + j - 1
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + j
+                    end do
+                 else
+                    do j = 1, n
+                       do i = k, k + n - j
+                          sum = abs( ap( i ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              k = 1
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k, k + j - 2
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + j - 1
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + j
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = k + 1, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = k, k + n - j
+                          sum = sum + abs( ap( i ) )
+                       end do
+                    end if
+                    k = k + n - j + 1
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              k = 1
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, j - 1
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                       k = k + 1
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, j
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, n
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       k = k + 1
+                       do i = j + 1, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 else
+                    do i = 1, n
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, n
+                          work( i ) = work( i ) + abs( ap( k ) )
+                          k = k + 1
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, n
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 2, n
+                       call stdlib_zlassq( j-1, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_zlassq( j, ap( k ), 1, scale, sum )
+                       k = k + j
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = n
+                    k = 2
+                    do j = 1, n - 1
+                       call stdlib_zlassq( n-j, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    k = 1
+                    do j = 1, n
+                       call stdlib_zlassq( n-j+1, ap( k ), 1, scale, sum )
+                       k = k + n - j + 1
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlantp = value
+           return
+     end function stdlib_zlantp
+
+     !> ZLANTR:  returns the value of the one norm,  or the Frobenius norm, or
+     !> the  infinity norm,  or the  element of  largest absolute value  of a
+     !> trapezoidal or triangular matrix A.
+
+     real(dp) function stdlib_zlantr( norm, uplo, diag, m, n, a, lda,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(in) :: a(lda,*)
+       ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: udiag
+           integer(ilp) :: i, j
+           real(dp) :: scale, sum, value
+           ! Intrinsic Functions 
+           intrinsic :: abs,min,sqrt
+           ! Executable Statements 
+           if( min( m, n )==0 ) then
+              value = zero
+           else if( stdlib_lsame( norm, 'M' ) ) then
+              ! find max(abs(a(i,j))).
+              if( stdlib_lsame( diag, 'U' ) ) then
+                 value = one
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j + 1, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              else
+                 value = zero
+                 if( stdlib_lsame( uplo, 'U' ) ) then
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 else
+                    do j = 1, n
+                       do i = j, m
+                          sum = abs( a( i, j ) )
+                          if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                       end do
+                    end do
+                 end if
+              end if
+           else if( ( stdlib_lsame( norm, 'O' ) ) .or. ( norm=='1' ) ) then
+              ! find norm1(a).
+              value = zero
+              udiag = stdlib_lsame( diag, 'U' )
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 do j = 1, n
+                    if( ( udiag ) .and. ( j<=m ) ) then
+                       sum = one
+                       do i = 1, j - 1
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = 1, min( m, j )
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              else
+                 do j = 1, n
+                    if( udiag ) then
+                       sum = one
+                       do i = j + 1, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    else
+                       sum = zero
+                       do i = j, m
+                          sum = sum + abs( a( i, j ) )
+                       end do
+                    end if
+                    if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+                 end do
+              end if
+           else if( stdlib_lsame( norm, 'I' ) ) then
+              ! find normi(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, m
+                       work( i ) = one
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j-1 )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = 1, min( m, j )
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    do i = 1, min( m, n )
+                       work( i ) = one
+                    end do
+                    do i = n + 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j + 1, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 else
+                    do i = 1, m
+                       work( i ) = zero
+                    end do
+                    do j = 1, n
+                       do i = j, m
+                          work( i ) = work( i ) + abs( a( i, j ) )
+                       end do
+                    end do
+                 end if
+              end if
+              value = zero
+              do i = 1, m
+                 sum = work( i )
+                 if( value < sum .or. stdlib_disnan( sum ) ) value = sum
+              end do
+           else if( ( stdlib_lsame( norm, 'F' ) ) .or. ( stdlib_lsame( norm, 'E' ) ) ) &
+                     then
+              ! find normf(a).
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 2, n
+                       call stdlib_zlassq( min( m, j-1 ), a( 1, j ), 1, scale, sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_zlassq( min( m, j ), a( 1, j ), 1, scale, sum )
+                    end do
+                 end if
+              else
+                 if( stdlib_lsame( diag, 'U' ) ) then
+                    scale = one
+                    sum = min( m, n )
+                    do j = 1, n
+                       call stdlib_zlassq( m-j, a( min( m, j+1 ), j ), 1, scale,sum )
+                    end do
+                 else
+                    scale = zero
+                    sum = one
+                    do j = 1, n
+                       call stdlib_zlassq( m-j+1, a( j, j ), 1, scale, sum )
+                    end do
+                 end if
+              end if
+              value = scale*sqrt( sum )
+           end if
+           stdlib_zlantr = value
+           return
+     end function stdlib_zlantr
+
+     !> Given two column vectors X and Y, let
+     !> A = ( X Y ).
+     !> The subroutine first computes the QR factorization of A = Q*R,
+     !> and then computes the SVD of the 2-by-2 upper triangular matrix R.
+     !> The smaller singular value of R is returned in SSMIN, which is used
+     !> as the measurement of the linear dependency of the vectors X and Y.
+
+     pure subroutine stdlib_zlapll( n, x, incx, y, incy, ssmin )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx, incy, n
+           real(dp), intent(out) :: ssmin
+           ! Array Arguments 
+           complex(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           real(dp) :: ssmax
+           complex(dp) :: a11, a12, a22, c, tau
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg
+           ! Executable Statements 
+           ! quick return if possible
+           if( n<=1 ) then
+              ssmin = zero
+              return
+           end if
+           ! compute the qr factorization of the n-by-2 matrix ( x y )
+           call stdlib_zlarfg( n, x( 1 ), x( 1+incx ), incx, tau )
+           a11 = x( 1 )
+           x( 1 ) = cone
+           c = -conjg( tau )*stdlib_zdotc( n, x, incx, y, incy )
+           call stdlib_zaxpy( n, c, x, incx, y, incy )
+           call stdlib_zlarfg( n-1, y( 1+incy ), y( 1+2*incy ), incy, tau )
+           a12 = y( 1 )
+           a22 = y( 1+incy )
+           ! compute the svd of 2-by-2 upper triangular matrix.
+           call stdlib_dlas2( abs( a11 ), abs( a12 ), abs( a22 ), ssmin, ssmax )
+           return
+     end subroutine stdlib_zlapll
+
+     !> ZLAQP2: computes a QR factorization with column pivoting of
+     !> the block A(OFFSET+1:M,1:N).
+     !> The block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_zlaqp2( m, n, offset, a, lda, jpvt, tau, vn1, vn2,work )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: lda, m, n, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: vn1(*), vn2(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, itemp, j, mn, offpi, pvt
+           real(dp) :: temp, temp2, tol3z
+           complex(dp) :: aii
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,sqrt
+           ! Executable Statements 
+           mn = min( m-offset, n )
+           tol3z = sqrt(stdlib_dlamch('EPSILON'))
+           ! compute factorization.
+           loop_20: do i = 1, mn
+              offpi = offset + i
+              ! determine ith pivot column and swap if necessary.
+              pvt = ( i-1 ) + stdlib_idamax( n-i+1, vn1( i ), 1 )
+              if( pvt/=i ) then
+                 call stdlib_zswap( m, a( 1, pvt ), 1, a( 1, i ), 1 )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( i )
+                 jpvt( i ) = itemp
+                 vn1( pvt ) = vn1( i )
+                 vn2( pvt ) = vn2( i )
+              end if
+              ! generate elementary reflector h(i).
+              if( offpi<m ) then
+                 call stdlib_zlarfg( m-offpi+1, a( offpi, i ), a( offpi+1, i ), 1,tau( i ) )
+                           
+              else
+                 call stdlib_zlarfg( 1, a( m, i ), a( m, i ), 1, tau( i ) )
+              end if
+              if( i<n ) then
+                 ! apply h(i)**h to a(offset+i:m,i+1:n) from the left.
+                 aii = a( offpi, i )
+                 a( offpi, i ) = cone
+                 call stdlib_zlarf( 'LEFT', m-offpi+1, n-i, a( offpi, i ), 1,conjg( tau( i ) ), a(&
+                            offpi, i+1 ), lda,work( 1 ) )
+                 a( offpi, i ) = aii
+              end if
+              ! update partial column norms.
+              do j = i + 1, n
+                 if( vn1( j )/=zero ) then
+                    ! note: the following 4 lines follow from the analysis in
+                    ! lapack working note 176.
+                    temp = one - ( abs( a( offpi, j ) ) / vn1( j ) )**2
+                    temp = max( temp, zero )
+                    temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                    if( temp2 <= tol3z ) then
+                       if( offpi<m ) then
+                          vn1( j ) = stdlib_dznrm2( m-offpi, a( offpi+1, j ), 1 )
+                          vn2( j ) = vn1( j )
+                       else
+                          vn1( j ) = zero
+                          vn2( j ) = zero
+                       end if
+                    else
+                       vn1( j ) = vn1( j )*sqrt( temp )
+                    end if
+                 end if
+              end do
+           end do loop_20
+           return
+     end subroutine stdlib_zlaqp2
+
+     !> ZLAQPS: computes a step of QR factorization with column pivoting
+     !> of a complex M-by-N matrix A by using Blas-3.  It tries to factorize
+     !> NB columns from A starting from the row OFFSET+1, and updates all
+     !> of the matrix with Blas-3 xGEMM.
+     !> In some cases, due to catastrophic cancellations, it cannot
+     !> factorize NB columns.  Hence, the actual number of factorized
+     !> columns is returned in KB.
+     !> Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized.
+
+     pure subroutine stdlib_zlaqps( m, n, offset, nb, kb, a, lda, jpvt, tau, vn1,vn2, auxv, f, &
+               ldf )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: kb
+           integer(ilp), intent(in) :: lda, ldf, m, n, nb, offset
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(inout) :: vn1(*), vn2(*)
+           complex(dp), intent(inout) :: a(lda,*), auxv(*), f(ldf,*)
+           complex(dp), intent(out) :: tau(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: itemp, j, k, lastrk, lsticc, pvt, rk
+           real(dp) :: temp, temp2, tol3z
+           complex(dp) :: akk
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min,nint,sqrt
+           ! Executable Statements 
+           lastrk = min( m, n+offset )
+           lsticc = 0
+           k = 0
+           tol3z = sqrt(stdlib_dlamch('EPSILON'))
+           ! beginning of while loop.
+           10 continue
+           if( ( k<nb ) .and. ( lsticc==0 ) ) then
+              k = k + 1
+              rk = offset + k
+              ! determine ith pivot column and swap if necessary
+              pvt = ( k-1 ) + stdlib_idamax( n-k+1, vn1( k ), 1 )
+              if( pvt/=k ) then
+                 call stdlib_zswap( m, a( 1, pvt ), 1, a( 1, k ), 1 )
+                 call stdlib_zswap( k-1, f( pvt, 1 ), ldf, f( k, 1 ), ldf )
+                 itemp = jpvt( pvt )
+                 jpvt( pvt ) = jpvt( k )
+                 jpvt( k ) = itemp
+                 vn1( pvt ) = vn1( k )
+                 vn2( pvt ) = vn2( k )
+              end if
+              ! apply previous householder reflectors to column k:
+              ! a(rk:m,k) := a(rk:m,k) - a(rk:m,1:k-1)*f(k,1:k-1)**h.
+              if( k>1 ) then
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+                 call stdlib_zgemv( 'NO TRANSPOSE', m-rk+1, k-1, -cone, a( rk, 1 ),lda, f( k, 1 ),&
+                            ldf, cone, a( rk, k ), 1 )
+                 do j = 1, k - 1
+                    f( k, j ) = conjg( f( k, j ) )
+                 end do
+              end if
+              ! generate elementary reflector h(k).
+              if( rk<m ) then
+                 call stdlib_zlarfg( m-rk+1, a( rk, k ), a( rk+1, k ), 1, tau( k ) )
+              else
+                 call stdlib_zlarfg( 1, a( rk, k ), a( rk, k ), 1, tau( k ) )
+              end if
+              akk = a( rk, k )
+              a( rk, k ) = cone
+              ! compute kth column of f:
+              ! compute  f(k+1:n,k) := tau(k)*a(rk:m,k+1:n)**h*a(rk:m,k).
+              if( k<n ) then
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-rk+1, n-k, tau( k ),a( rk, k+1 ), &
+                           lda, a( rk, k ), 1, czero,f( k+1, k ), 1 )
+              end if
+              ! padding f(1:k,k) with zeros.
+              do j = 1, k
+                 f( j, k ) = czero
+              end do
+              ! incremental updating of f:
+              ! f(1:n,k) := f(1:n,k) - tau(k)*f(1:n,1:k-1)*a(rk:m,1:k-1)**h
+                          ! *a(rk:m,k).
+              if( k>1 ) then
+                 call stdlib_zgemv( 'CONJUGATE TRANSPOSE', m-rk+1, k-1, -tau( k ),a( rk, 1 ), lda,&
+                            a( rk, k ), 1, czero,auxv( 1 ), 1 )
+                 call stdlib_zgemv( 'NO TRANSPOSE', n, k-1, cone, f( 1, 1 ), ldf,auxv( 1 ), 1, &
+                           cone, f( 1, k ), 1 )
+              end if
+              ! update the current row of a:
+              ! a(rk,k+1:n) := a(rk,k+1:n) - a(rk,1:k)*f(k+1:n,1:k)**h.
+              if( k<n ) then
+                 call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', 1, n-k,k, -cone, a( rk,&
+                            1 ), lda, f( k+1, 1 ), ldf,cone, a( rk, k+1 ), lda )
+              end if
+              ! update partial column norms.
+              if( rk<lastrk ) then
+                 do j = k + 1, n
+                    if( vn1( j )/=zero ) then
+                       ! note: the following 4 lines follow from the analysis in
+                       ! lapack working note 176.
+                       temp = abs( a( rk, j ) ) / vn1( j )
+                       temp = max( zero, ( one+temp )*( one-temp ) )
+                       temp2 = temp*( vn1( j ) / vn2( j ) )**2
+                       if( temp2 <= tol3z ) then
+                          vn2( j ) = real( lsticc,KIND=dp)
+                          lsticc = j
+                       else
+                          vn1( j ) = vn1( j )*sqrt( temp )
+                       end if
+                    end if
+                 end do
+              end if
+              a( rk, k ) = akk
+              ! end of while loop.
+              go to 10
+           end if
+           kb = k
+           rk = offset + kb
+           ! apply the block reflector to the rest of the matrix:
+           ! a(offset+kb+1:m,kb+1:n) := a(offset+kb+1:m,kb+1:n) -
+                               ! a(offset+kb+1:m,1:kb)*f(kb+1:n,1:kb)**h.
+           if( kb<min( n, m-offset ) ) then
+              call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-rk, n-kb,kb, -cone, a( &
+                        rk+1, 1 ), lda, f( kb+1, 1 ), ldf,cone, a( rk+1, kb+1 ), lda )
+           end if
+           ! recomputation of difficult columns.
+           60 continue
+           if( lsticc>0 ) then
+              itemp = nint( vn2( lsticc ),KIND=ilp)
+              vn1( lsticc ) = stdlib_dznrm2( m-rk, a( rk+1, lsticc ), 1 )
+              ! note: the computation of vn1( lsticc ) relies on the fact that
+              ! stdlib_snrm2 does not fail on vectors with norm below the value of
+              ! sqrt(stdlib_dlamch('s'))
+              vn2( lsticc ) = vn1( lsticc )
+              lsticc = itemp
+              go to 60
+           end if
+           return
+     end subroutine stdlib_zlaqps
+
+     !> ZLAQR5:, called by ZLAQR0, performs a
+     !> single small-bulge multi-shift QR sweep.
+
+     pure subroutine stdlib_zlaqr5( wantt, wantz, kacc22, n, ktop, kbot, nshfts, s,h, ldh, iloz, &
+               ihiz, z, ldz, v, ldv, u, ldu, nv,wv, ldwv, nh, wh, ldwh )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kacc22, kbot, ktop, ldh, ldu, ldv, ldwh, ldwv, &
+                     ldz, n, nh, nshfts, nv
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), s(*), z(ldz,*)
+           complex(dp), intent(out) :: u(ldu,*), v(ldv,*), wh(ldwh,*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0_dp
+           real(dp), parameter :: rone = 1.0_dp
+           
+           
+           ! Local Scalars 
+           complex(dp) :: alpha, beta, cdum, refsum
+           real(dp) :: h11, h12, h21, h22, safmax, safmin, scl, smlnum, tst1, tst2, ulp
+           integer(ilp) :: i2, i4, incol, j, jbot, jcol, jlen, jrow, jtop, k, k1, kdu, kms, krcol,&
+                      m, m22, mbot, mtop, nbmps, ndcol, ns, nu
+           logical(lk) :: accum, bmp22
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,aimag,max,min,mod
+           ! Local Arrays 
+           complex(dp) :: vt(3)
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== if there are no shifts, then there is nothing to do. ====
+           if( nshfts<2 )return
+           ! ==== if the active block is empty or 1-by-1, then there
+           ! .    is nothing to do. ====
+           if( ktop>=kbot )return
+           ! ==== nshfts is supposed to be even, but if it is odd,
+           ! .    then simply reduce it by cone.  ====
+           ns = nshfts - mod( nshfts, 2 )
+           ! ==== machine constants for deflation ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== use accumulated reflections to update far-from-diagonal
+           ! .    entries ? ====
+           accum = ( kacc22==1 ) .or. ( kacc22==2 )
+           ! ==== clear trash ====
+           if( ktop+2<=kbot )h( ktop+2, ktop ) = czero
+           ! ==== nbmps = number of 2-shift bulges in the chain ====
+           nbmps = ns / 2
+           ! ==== kdu = width of slab ====
+           kdu = 4*nbmps
+           ! ==== create and chase chains of nbmps bulges ====
+           loop_180: do incol = ktop - 2*nbmps + 1, kbot - 2, 2*nbmps
+              ! jtop = index from which updates from the right start.
+              if( accum ) then
+                 jtop = max( ktop, incol )
+              else if( wantt ) then
+                 jtop = 1
+              else
+                 jtop = ktop
+              end if
+              ndcol = incol + kdu
+              if( accum )call stdlib_zlaset( 'ALL', kdu, kdu, czero, cone, u, ldu )
+              ! ==== near-the-diagonal bulge chase.  the following loop
+              ! .    performs the near-the-diagonal part of a small bulge
+              ! .    multi-shift qr sweep.  each 4*nbmps column diagonal
+              ! .    chunk extends from column incol to column ndcol
+              ! .    (including both column incol and column ndcol). the
+              ! .    following loop chases a 2*nbmps+1 column long chain of
+              ! .    nbmps bulges 2*nbmps columns to the right.  (incol
+              ! .    may be less than ktop and and ndcol may be greater than
+              ! .    kbot indicating phantom columns from which to chase
+              ! .    bulges before they are actually introduced or to which
+              ! .    to chase bulges beyond column kbot.)  ====
+              loop_145: do krcol = incol, min( incol+2*nbmps-1, kbot-2 )
+                 ! ==== bulges number mtop to mbot are active double implicit
+                 ! .    shift bulges.  there may or may not also be small
+                 ! .    2-by-2 bulge, if there is room.  the inactive bulges
+                 ! .    (if any) must wait until the active bulges have moved
+                 ! .    down the diagonal to make room.  the phantom matrix
+                 ! .    paradigm described above helps keep track.  ====
+                 mtop = max( 1, ( ktop-krcol ) / 2+1 )
+                 mbot = min( nbmps, ( kbot-krcol-1 ) / 2 )
+                 m22 = mbot + 1
+                 bmp22 = ( mbot<nbmps ) .and. ( krcol+2*( m22-1 ) )==( kbot-2 )
+                 ! ==== generate reflections to chase the chain right
+                 ! .    cone column.  (the minimum value of k is ktop-1.) ====
+                 if ( bmp22 ) then
+                    ! ==== special case: 2-by-2 reflection at bottom treated
+                    ! .    separately ====
+                    k = krcol + 2*( m22-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_zlaqr1( 2, h( k+1, k+1 ), ldh, s( 2*m22-1 ),s( 2*m22 ), v( 1, &
+                                 m22 ) )
+                       beta = v( 1, m22 )
+                       call stdlib_zlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                    else
+                       beta = h( k+1, k )
+                       v( 2, m22 ) = h( k+2, k )
+                       call stdlib_zlarfg( 2, beta, v( 2, m22 ), 1, v( 1, m22 ) )
+                       h( k+1, k ) = beta
+                       h( k+2, k ) = czero
+                    end if
+                    ! ==== perform update from right within
+                    ! .    computational window. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m22 )*( h( j, k+1 )+v( 2, m22 )*h( j, k+2 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                    end do
+                    ! ==== perform update from left within
+                    ! .    computational window. ====
+                    if( accum ) then
+                       jbot = min( ndcol, kbot )
+                    else if( wantt ) then
+                       jbot = n
+                    else
+                       jbot = kbot
+                    end if
+                    do j = k+1, jbot
+                       refsum = conjg( v( 1, m22 ) )*( h( k+1, j )+conjg( v( 2, m22 ) )*h( k+2, j &
+                                 ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m22 )
+                    end do
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k>=ktop ) then
+                       if( h( k+1, k )/=czero ) then
+                          tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                          if( tst1==rzero ) then
+                             if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                             if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                             if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                             if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                             if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                             if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                          end if
+                          if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) ) then
+                             h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                             h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                       
+                             scl = h11 + h12
+                             tst2 = h22*( h11 / scl )
+                             if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( &
+                                       k+1, k ) = czero
+                          end if
+                       end if
+                    end if
+                    ! ==== accumulate orthogonal transformations. ====
+                    if( accum ) then
+                       kms = k - incol
+                       do j = max( 1, ktop-incol ), kdu
+                          refsum = v( 1, m22 )*( u( j, kms+1 )+v( 2, m22 )*u( j, kms+2 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    else if( wantz ) then
+                       do j = iloz, ihiz
+                          refsum = v( 1, m22 )*( z( j, k+1 )+v( 2, m22 )*z( j, k+2 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m22 ) )
+                       end do
+                    end if
+                 end if
+                 ! ==== normal case: chain of 3-by-3 reflections ====
+                 loop_80: do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    if( k==ktop-1 ) then
+                       call stdlib_zlaqr1( 3, h( ktop, ktop ), ldh, s( 2*m-1 ),s( 2*m ), v( 1, m )&
+                                  )
+                       alpha = v( 1, m )
+                       call stdlib_zlarfg( 3, alpha, v( 2, m ), 1, v( 1, m ) )
+                    else
+                       ! ==== perform delayed transformation of row below
+                       ! .    mth bulge. exploit fact that first two elements
+                       ! .    of row are actually czero. ====
+                       refsum = v( 1, m )*v( 3, m )*h( k+3, k+2 )
+                       h( k+3, k   ) = -refsum
+                       h( k+3, k+1 ) = -refsum*conjg( v( 2, m ) )
+                       h( k+3, k+2 ) = h( k+3, k+2 ) -refsum*conjg( v( 3, m ) )
+                       ! ==== calculate reflection to move
+                       ! .    mth bulge cone step. ====
+                       beta      = h( k+1, k )
+                       v( 2, m ) = h( k+2, k )
+                       v( 3, m ) = h( k+3, k )
+                       call stdlib_zlarfg( 3, beta, v( 2, m ), 1, v( 1, m ) )
+                       ! ==== a bulge may collapse because of vigilant
+                       ! .    deflation or destructive underflow.  in the
+                       ! .    underflow case, try the two-small-subdiagonals
+                       ! .    trick to try to reinflate the bulge.  ====
+                       if( h( k+3, k )/=czero .or. h( k+3, k+1 )/=czero .or. h( k+3, k+2 )==czero &
+                                 ) then
+                          ! ==== typical case: not collapsed (yet). ====
+                          h( k+1, k ) = beta
+                          h( k+2, k ) = czero
+                          h( k+3, k ) = czero
+                       else
+                          ! ==== atypical case: collapsed.  attempt to
+                          ! .    reintroduce ignoring h(k+1,k) and h(k+2,k).
+                          ! .    if the fill resulting from the new
+                          ! .    reflector is too large, then abandon it.
+                          ! .    otherwise, use the new cone. ====
+                          call stdlib_zlaqr1( 3, h( k+1, k+1 ), ldh, s( 2*m-1 ),s( 2*m ), vt )
+                                    
+                          alpha = vt( 1 )
+                          call stdlib_zlarfg( 3, alpha, vt( 2 ), 1, vt( 1 ) )
+                          refsum = conjg( vt( 1 ) )*( h( k+1, k )+conjg( vt( 2 ) )*h( k+2, k ) )
+                                    
+                          if( cabs1( h( k+2, k )-refsum*vt( 2 ) )+cabs1( refsum*vt( 3 ) )>ulp*( &
+                          cabs1( h( k, k ) )+cabs1( h( k+1,k+1 ) )+cabs1( h( k+2, k+2 ) ) ) ) &
+                                    then
+                             ! ==== starting a new bulge here would
+                             ! .    create non-negligible fill.  use
+                             ! .    the old cone with trepidation. ====
+                             h( k+1, k ) = beta
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                          else
+                             ! ==== starting a new bulge here would
+                             ! .    create only negligible fill.
+                             ! .    replace the old reflector with
+                             ! .    the new cone. ====
+                             h( k+1, k ) = h( k+1, k ) - refsum
+                             h( k+2, k ) = czero
+                             h( k+3, k ) = czero
+                             v( 1, m ) = vt( 1 )
+                             v( 2, m ) = vt( 2 )
+                             v( 3, m ) = vt( 3 )
+                          end if
+                       end if
+                    end if
+                    ! ====  apply reflection from the right and
+                    ! .     the first column of update from the left.
+                    ! .     these updates are required for the vigilant
+                    ! .     deflation check. we still delay most of the
+                    ! .     updates from the left for efficiency. ====
+                    do j = jtop, min( kbot, k+3 )
+                       refsum = v( 1, m )*( h( j, k+1 )+v( 2, m )*h( j, k+2 )+v( 3, m )*h( j, k+3 &
+                                 ) )
+                       h( j, k+1 ) = h( j, k+1 ) - refsum
+                       h( j, k+2 ) = h( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                       h( j, k+3 ) = h( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                    end do
+                    ! ==== perform update from left for subsequent
+                    ! .    column. ====
+                    refsum =  conjg( v( 1, m ) )*( h( k+1, k+1 )+conjg( v( 2, m ) )*h( k+2, k+1 )+&
+                              conjg( v( 3, m ) )*h( k+3, k+1 ) )
+                    h( k+1, k+1 ) = h( k+1, k+1 ) - refsum
+                    h( k+2, k+1 ) = h( k+2, k+1 ) - refsum*v( 2, m )
+                    h( k+3, k+1 ) = h( k+3, k+1 ) - refsum*v( 3, m )
+                    ! ==== the following convergence test requires that
+                    ! .    the tradition small-compared-to-nearby-diagonals
+                    ! .    criterion and the ahues
+                    ! .    criteria both be satisfied.  the latter improves
+                    ! .    accuracy in some examples. falling back on an
+                    ! .    alternate convergence criterion when tst1 or tst2
+                    ! .    is czero (as done here) is traditional but probably
+                    ! .    unnecessary. ====
+                    if( k<ktop)cycle
+                    if( h( k+1, k )/=czero ) then
+                       tst1 = cabs1( h( k, k ) ) + cabs1( h( k+1, k+1 ) )
+                       if( tst1==rzero ) then
+                          if( k>=ktop+1 )tst1 = tst1 + cabs1( h( k, k-1 ) )
+                          if( k>=ktop+2 )tst1 = tst1 + cabs1( h( k, k-2 ) )
+                          if( k>=ktop+3 )tst1 = tst1 + cabs1( h( k, k-3 ) )
+                          if( k<=kbot-2 )tst1 = tst1 + cabs1( h( k+2, k+1 ) )
+                          if( k<=kbot-3 )tst1 = tst1 + cabs1( h( k+3, k+1 ) )
+                          if( k<=kbot-4 )tst1 = tst1 + cabs1( h( k+4, k+1 ) )
+                       end if
+                       if( cabs1( h( k+1, k ) )<=max( smlnum, ulp*tst1 ) )then
+                          h12 = max( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h21 = min( cabs1( h( k+1, k ) ),cabs1( h( k, k+1 ) ) )
+                          h11 = max( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          h22 = min( cabs1( h( k+1, k+1 ) ),cabs1( h( k, k )-h( k+1, k+1 ) ) )
+                                    
+                          scl = h11 + h12
+                          tst2 = h22*( h11 / scl )
+                          if( tst2==rzero .or. h21*( h12 / scl )<=max( smlnum, ulp*tst2 ) )h( k+1,&
+                                     k ) = czero
+                       end if
+                    end if
+                 end do loop_80
+                 ! ==== multiply h by reflections from the left ====
+                 if( accum ) then
+                    jbot = min( ndcol, kbot )
+                 else if( wantt ) then
+                    jbot = n
+                 else
+                    jbot = kbot
+                 end if
+                 do m = mbot, mtop, -1
+                    k = krcol + 2*( m-1 )
+                    do j = max( ktop, krcol + 2*m ), jbot
+                       refsum = conjg( v( 1, m ) )*( h( k+1, j )+conjg( v( 2, m ) )*h( k+2, j )+&
+                                 conjg( v( 3, m ) )*h( k+3, j ) )
+                       h( k+1, j ) = h( k+1, j ) - refsum
+                       h( k+2, j ) = h( k+2, j ) - refsum*v( 2, m )
+                       h( k+3, j ) = h( k+3, j ) - refsum*v( 3, m )
+                    end do
+                 end do
+                 ! ==== accumulate orthogonal transformations. ====
+                 if( accum ) then
+                    ! ==== accumulate u. (if needed, update z later
+                    ! .    with an efficient matrix-matrix
+                    ! .    multiply.) ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       kms = k - incol
+                       i2 = max( 1, ktop-incol )
+                       i2 = max( i2, kms-(krcol-incol)+1 )
+                       i4 = min( kdu, krcol + 2*( mbot-1 ) - incol + 5 )
+                       do j = i2, i4
+                          refsum = v( 1, m )*( u( j, kms+1 )+v( 2, m )*u( j, kms+2 )+v( 3, m )*u( &
+                                    j, kms+3 ) )
+                          u( j, kms+1 ) = u( j, kms+1 ) - refsum
+                          u( j, kms+2 ) = u( j, kms+2 ) -refsum*conjg( v( 2, m ) )
+                          u( j, kms+3 ) = u( j, kms+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 else if( wantz ) then
+                    ! ==== u is not accumulated, so update z
+                    ! .    now by multiplying by reflections
+                    ! .    from the right. ====
+                    do m = mbot, mtop, -1
+                       k = krcol + 2*( m-1 )
+                       do j = iloz, ihiz
+                          refsum = v( 1, m )*( z( j, k+1 )+v( 2, m )*z( j, k+2 )+v( 3, m )*z( j, &
+                                    k+3 ) )
+                          z( j, k+1 ) = z( j, k+1 ) - refsum
+                          z( j, k+2 ) = z( j, k+2 ) -refsum*conjg( v( 2, m ) )
+                          z( j, k+3 ) = z( j, k+3 ) -refsum*conjg( v( 3, m ) )
+                       end do
+                    end do
+                 end if
+                 ! ==== end of near-the-diagonal bulge chase. ====
+              end do loop_145
+              ! ==== use u (if accumulated) to update far-from-diagonal
+              ! .    entries in h.  if required, use u to update z as
+              ! .    well. ====
+              if( accum ) then
+                 if( wantt ) then
+                    jtop = 1
+                    jbot = n
+                 else
+                    jtop = ktop
+                    jbot = kbot
+                 end if
+                 k1 = max( 1, ktop-incol )
+                 nu = ( kdu-max( 0, ndcol-kbot ) ) - k1 + 1
+                 ! ==== horizontal multiply ====
+                 do jcol = min( ndcol, kbot ) + 1, jbot, nh
+                    jlen = min( nh, jbot-jcol+1 )
+                    call stdlib_zgemm( 'C', 'N', nu, jlen, nu, cone, u( k1, k1 ),ldu, h( incol+k1,&
+                               jcol ), ldh, czero, wh,ldwh )
+                    call stdlib_zlacpy( 'ALL', nu, jlen, wh, ldwh,h( incol+k1, jcol ), ldh )
+                              
+                 end do
+                 ! ==== vertical multiply ====
+                 do jrow = jtop, max( ktop, incol ) - 1, nv
+                    jlen = min( nv, max( ktop, incol )-jrow )
+                    call stdlib_zgemm( 'N', 'N', jlen, nu, nu, cone,h( jrow, incol+k1 ), ldh, u( &
+                              k1, k1 ),ldu, czero, wv, ldwv )
+                    call stdlib_zlacpy( 'ALL', jlen, nu, wv, ldwv,h( jrow, incol+k1 ), ldh )
+                              
+                 end do
+                 ! ==== z multiply (also vertical) ====
+                 if( wantz ) then
+                    do jrow = iloz, ihiz, nv
+                       jlen = min( nv, ihiz-jrow+1 )
+                       call stdlib_zgemm( 'N', 'N', jlen, nu, nu, cone,z( jrow, incol+k1 ), ldz, &
+                                 u( k1, k1 ),ldu, czero, wv, ldwv )
+                       call stdlib_zlacpy( 'ALL', jlen, nu, wv, ldwv,z( jrow, incol+k1 ), ldz )
+                                 
+                    end do
+                 end if
+              end if
+           end do loop_180
+     end subroutine stdlib_zlaqr5
+
+     !> ZLAQZ1: chases a 1x1 shift bulge in a matrix pencil down a single position
+
+     pure subroutine stdlib_zlaqz1( ilq, ilz, k, istartm, istopm, ihi, a, lda, b,ldb, nq, qstart, &
+               q, ldq, nz, zstart, z, ldz )
+           ! arguments
+           logical(lk), intent( in ) :: ilq, ilz
+           integer(ilp), intent( in ) :: k, lda, ldb, ldq, ldz, istartm, istopm,nq, nz, qstart, &
+                     zstart, ihi
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           
+           
+           ! local variables
+           real(dp) :: c
+           complex(dp) :: s, temp
+           if( k+1 == ihi ) then
+              ! shift is located on the edge of the matrix, remove it
+              call stdlib_zlartg( b( ihi, ihi ), b( ihi, ihi-1 ), c, s, temp )
+              b( ihi, ihi ) = temp
+              b( ihi, ihi-1 ) = czero
+              call stdlib_zrot( ihi-istartm, b( istartm, ihi ), 1, b( istartm,ihi-1 ), 1, c, s )
+                        
+              call stdlib_zrot( ihi-istartm+1, a( istartm, ihi ), 1, a( istartm,ihi-1 ), 1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_zrot( nz, z( 1, ihi-zstart+1 ), 1, z( 1, ihi-1-zstart+1 ), 1, c, s )
+                           
+              end if
+           else
+              ! normal operation, move bulge down
+              ! apply transformation from the right
+              call stdlib_zlartg( b( k+1, k+1 ), b( k+1, k ), c, s, temp )
+              b( k+1, k+1 ) = temp
+              b( k+1, k ) = czero
+              call stdlib_zrot( k+2-istartm+1, a( istartm, k+1 ), 1, a( istartm,k ), 1, c, s )
+                        
+              call stdlib_zrot( k-istartm+1, b( istartm, k+1 ), 1, b( istartm, k ),1, c, s )
+                        
+              if ( ilz ) then
+                 call stdlib_zrot( nz, z( 1, k+1-zstart+1 ), 1, z( 1, k-zstart+1 ),1, c, s )
+                           
+              end if
+              ! apply transformation from the left
+              call stdlib_zlartg( a( k+1, k ), a( k+2, k ), c, s, temp )
+              a( k+1, k ) = temp
+              a( k+2, k ) = czero
+              call stdlib_zrot( istopm-k, a( k+1, k+1 ), lda, a( k+2, k+1 ), lda, c,s )
+              call stdlib_zrot( istopm-k, b( k+1, k+1 ), ldb, b( k+2, k+1 ), ldb, c,s )
+              if ( ilq ) then
+                 call stdlib_zrot( nq, q( 1, k+1-qstart+1 ), 1, q( 1, k+2-qstart+1 ), 1, c, conjg(&
+                            s ) )
+              end if
+           end if
+     end subroutine stdlib_zlaqz1
+
+     !> ZLAQZ3: Executes a single multishift QZ sweep
+
+     pure subroutine stdlib_zlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nshifts,nblock_desired, alpha,&
+                beta, a, lda, b, ldb,q, ldq, z, ldz, qc, ldqc, zc, ldzc, work,lwork, info )
+           ! function arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,nshifts, &
+                     nblock_desired, ldqc, ldzc
+           complex(dp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), qc( &
+                     ldqc, * ), zc( ldzc, * ), work( * ),alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: info
+           
+           
+           ! local scalars
+           integer(ilp) :: i, j, ns, istartm, istopm, sheight, swidth, k, np, istartb, istopb, &
+                     ishift, nblock, npos
+           real(dp) :: safmin, safmax, c, scale
+           complex(dp) :: temp, temp2, temp3, s
+           info = 0
+           if ( nblock_desired < nshifts+1 ) then
+              info = -8
+           end if
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = n*nblock_desired
+              return
+           else if ( lwork < n*nblock_desired ) then
+              info = -25
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ3', -info )
+              return
+           end if
+           ! executable statements
+           ! get machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_dlabad( safmin, safmax )
+           if ( ilo >= ihi ) then
+              return
+           end if
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           ns = nshifts
+           npos = max( nblock_desired-ns, 1 )
+           ! the following block introduces the shifts and chases
+           ! them down one by one just enough to make space for
+           ! the other shifts. the near-the-diagonal block is
+           ! of size (ns+1) x ns.
+           call stdlib_zlaset( 'FULL', ns+1, ns+1, czero, cone, qc, ldqc )
+           call stdlib_zlaset( 'FULL', ns, ns, czero, cone, zc, ldzc )
+           do i = 1, ns
+              ! introduce the shift
+              scale = sqrt( abs( alpha( i ) ) ) * sqrt( abs( beta( i ) ) )
+              if( scale >= safmin .and. scale <= safmax ) then
+                 alpha( i ) = alpha( i )/scale
+                 beta( i ) = beta( i )/scale
+              end if
+              temp2 = beta( i )*a( ilo, ilo )-alpha( i )*b( ilo, ilo )
+              temp3 = beta( i )*a( ilo+1, ilo )
+              if ( abs( temp2 ) > safmax .or.abs( temp3 ) > safmax ) then
+                 temp2 = cone
+                 temp3 = czero
+              end if
+              call stdlib_zlartg( temp2, temp3, c, s, temp )
+              call stdlib_zrot( ns, a( ilo, ilo ), lda, a( ilo+1, ilo ), lda, c,s )
+              call stdlib_zrot( ns, b( ilo, ilo ), ldb, b( ilo+1, ilo ), ldb, c,s )
+              call stdlib_zrot( ns+1, qc( 1, 1 ), 1, qc( 1, 2 ), 1, c,conjg( s ) )
+              ! chase the shift down
+              do j = 1, ns-i
+                 call stdlib_zlaqz1( .true., .true., j, 1, ns, ihi-ilo+1, a( ilo,ilo ), lda, b( &
+                           ilo, ilo ), ldb, ns+1, 1, qc,ldqc, ns, 1, zc, ldzc )
+              end do
+           end do
+           ! update the rest of the pencil
+           ! update a(ilo:ilo+ns,ilo+ns:istopm) and b(ilo:ilo+ns,ilo+ns:istopm)
+           ! from the left with qc(1:ns+1,1:ns+1)'
+           sheight = ns+1
+           swidth = istopm-( ilo+ns )+1
+           if ( swidth > 0 ) then
+              call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ilo, ilo+&
+                        ns ), lda, czero, work, sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, a( ilo,ilo+ns ), lda )
+                        
+              call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ilo, ilo+&
+                        ns ), ldb, czero, work, sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, b( ilo,ilo+ns ), ldb )
+                        
+           end if
+           if ( ilq ) then
+              call stdlib_zgemm( 'N', 'N', n, sheight, sheight, cone, q( 1, ilo ),ldq, qc, ldqc, &
+                        czero, work, n )
+              call stdlib_zlacpy( 'ALL', n, sheight, work, n, q( 1, ilo ), ldq )
+           end if
+           ! update a(istartm:ilo-1,ilo:ilo+ns-1) and b(istartm:ilo-1,ilo:ilo+ns-1)
+           ! from the right with zc(1:ns,1:ns)
+           sheight = ilo-1-istartm+1
+           swidth = ns
+           if ( sheight > 0 ) then
+              call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ilo ), lda, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ilo ), lda )
+                        
+              call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ilo ), ldb, &
+                        zc, ldzc, czero, work,sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ilo ), ldb )
+                        
+           end if
+           if ( ilz ) then
+              call stdlib_zgemm( 'N', 'N', n, swidth, swidth, cone, z( 1, ilo ),ldz, zc, ldzc, &
+                        czero, work, n )
+              call stdlib_zlacpy( 'ALL', n, swidth, work, n, z( 1, ilo ), ldz )
+           end if
+           ! the following block chases the shifts down to the bottom
+           ! right block. if possible, a shift is moved down npos
+           ! positions at a time
+           k = ilo
+           do while ( k < ihi-ns )
+              np = min( ihi-ns-k, npos )
+              ! size of the near-the-diagonal block
+              nblock = ns+np
+              ! istartb points to the first row we will be updating
+              istartb = k+1
+              ! istopb points to the last column we will be updating
+              istopb = k+nblock-1
+              call stdlib_zlaset( 'FULL', ns+np, ns+np, czero, cone, qc, ldqc )
+              call stdlib_zlaset( 'FULL', ns+np, ns+np, czero, cone, zc, ldzc )
+              ! near the diagonal shift chase
+              do i = ns-1, 0, -1
+                 do j = 0, np-1
+                    ! move down the block with index k+i+j, updating
+                    ! the (ns+np x ns+np) block:
+                    ! (k:k+ns+np,k:k+ns+np-1)
+                    call stdlib_zlaqz1( .true., .true., k+i+j, istartb, istopb, ihi,a, lda, b, &
+                              ldb, nblock, k+1, qc, ldqc,nblock, k, zc, ldzc )
+                 end do
+              end do
+              ! update rest of the pencil
+              ! update a(k+1:k+ns+np, k+ns+np:istopm) and
+              ! b(k+1:k+ns+np, k+ns+np:istopm)
+              ! from the left with qc(1:ns+np,1:ns+np)'
+              sheight = ns+np
+              swidth = istopm-( k+ns+np )+1
+              if ( swidth > 0 ) then
+                 call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, a( k+1, k+&
+                           ns+np ), lda, czero, work,sheight )
+                 call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, a( k+1,k+ns+np ), lda &
+                           )
+                 call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc,ldqc, b( k+1, k+&
+                           ns+np ), ldb, czero, work,sheight )
+                 call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, b( k+1,k+ns+np ), ldb &
+                           )
+              end if
+              if ( ilq ) then
+                 call stdlib_zgemm( 'N', 'N', n, nblock, nblock, cone, q( 1, k+1 ),ldq, qc, ldqc, &
+                           czero, work, n )
+                 call stdlib_zlacpy( 'ALL', n, nblock, work, n, q( 1, k+1 ), ldq )
+              end if
+              ! update a(istartm:k,k:k+ns+npos-1) and b(istartm:k,k:k+ns+npos-1)
+              ! from the right with zc(1:ns+np,1:ns+np)
+              sheight = k-istartm+1
+              swidth = nblock
+              if ( sheight > 0 ) then
+                 call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, k ), lda, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight,a( istartm, k ), lda )
+                           
+                 call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, k ), ldb, &
+                           zc, ldzc, czero, work,sheight )
+                 call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight,b( istartm, k ), ldb )
+                           
+              end if
+              if ( ilz ) then
+                 call stdlib_zgemm( 'N', 'N', n, nblock, nblock, cone, z( 1, k ),ldz, zc, ldzc, &
+                           czero, work, n )
+                 call stdlib_zlacpy( 'ALL', n, nblock, work, n, z( 1, k ), ldz )
+              end if
+              k = k+np
+           end do
+           ! the following block removes the shifts from the bottom right corner
+           ! one by one. updates are initially applied to a(ihi-ns+1:ihi,ihi-ns:ihi).
+           call stdlib_zlaset( 'FULL', ns, ns, czero, cone, qc, ldqc )
+           call stdlib_zlaset( 'FULL', ns+1, ns+1, czero, cone, zc, ldzc )
+           ! istartb points to the first row we will be updating
+           istartb = ihi-ns+1
+           ! istopb points to the last column we will be updating
+           istopb = ihi
+           do i = 1, ns
+              ! chase the shift down to the bottom right corner
+              do ishift = ihi-i, ihi-1
+                 call stdlib_zlaqz1( .true., .true., ishift, istartb, istopb, ihi,a, lda, b, ldb, &
+                           ns, ihi-ns+1, qc, ldqc, ns+1,ihi-ns, zc, ldzc )
+              end do
+           end do
+           ! update rest of the pencil
+           ! update a(ihi-ns+1:ihi, ihi+1:istopm)
+           ! from the left with qc(1:ns,1:ns)'
+           sheight = ns
+           swidth = istopm-( ihi+1 )+1
+           if ( swidth > 0 ) then
+              call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,a( ihi-ns+1, &
+                        ihi+1 ), lda, czero, work, sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight,a( ihi-ns+1, ihi+1 ), lda &
+                        )
+              call stdlib_zgemm( 'C', 'N', sheight, swidth, sheight, cone, qc, ldqc,b( ihi-ns+1, &
+                        ihi+1 ), ldb, czero, work, sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight,b( ihi-ns+1, ihi+1 ), ldb &
+                        )
+           end if
+           if ( ilq ) then
+              call stdlib_zgemm( 'N', 'N', n, ns, ns, cone, q( 1, ihi-ns+1 ), ldq,qc, ldqc, czero,&
+                         work, n )
+              call stdlib_zlacpy( 'ALL', n, ns, work, n, q( 1, ihi-ns+1 ), ldq )
+           end if
+           ! update a(istartm:ihi-ns,ihi-ns:ihi)
+           ! from the right with zc(1:ns+1,1:ns+1)
+           sheight = ihi-ns-istartm+1
+           swidth = ns+1
+           if ( sheight > 0 ) then
+              call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,a( istartm, ihi-ns ), &
+                        lda, zc, ldzc, czero, work,sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, a( istartm,ihi-ns ), lda &
+                        )
+              call stdlib_zgemm( 'N', 'N', sheight, swidth, swidth, cone,b( istartm, ihi-ns ), &
+                        ldb, zc, ldzc, czero, work,sheight )
+              call stdlib_zlacpy( 'ALL', sheight, swidth, work, sheight, b( istartm,ihi-ns ), ldb &
+                        )
+           end if
+           if ( ilz ) then
+              call stdlib_zgemm( 'N', 'N', n, ns+1, ns+1, cone, z( 1, ihi-ns ), ldz,zc, ldzc, &
+                        czero, work, n )
+              call stdlib_zlacpy( 'ALL', n, ns+1, work, n, z( 1, ihi-ns ), ldz )
+           end if
+     end subroutine stdlib_zlaqz3
+
+     !> ZLARGV: generates a vector of complex plane rotations with real
+     !> cosines, determined by elements of the complex vectors x and y.
+     !> For i = 1,2,...,n
+     !> (        c(i)   s(i) ) ( x(i) ) = ( r(i) )
+     !> ( -conjg(s(i))  c(i) ) ( y(i) ) = (   0  )
+     !> where c(i)**2 + ABS(s(i))**2 = 1
+     !> The following conventions are used (these are the same as in ZLARTG,
+     !> but differ from the BLAS1 routine ZROTG):
+     !> If y(i)=0, then c(i)=1 and s(i)=0.
+     !> If x(i)=0, then c(i)=0 and s(i) is chosen so that r(i) is real.
+
+     pure subroutine stdlib_zlargv( n, x, incx, y, incy, c, incc )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incc, incx, incy, n
+           ! Array Arguments 
+           real(dp), intent(out) :: c(*)
+           complex(dp), intent(inout) :: x(*), y(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           ! logical            first
+           integer(ilp) :: count, i, ic, ix, iy, j
+           real(dp) :: cs, d, di, dr, eps, f2, f2s, g2, g2s, safmin, safmn2, safmx2, scale
+           complex(dp) :: f, ff, fs, g, gs, r, sn
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,log,max,sqrt
+           ! Statement Functions 
+           real(dp) :: abs1, abssq
+           ! Save Statement 
+           ! save               first, safmx2, safmin, safmn2
+           ! Data Statements 
+           ! data               first / .true. /
+           ! Statement Function Definitions 
+           abs1( ff ) = max( abs( real( ff,KIND=dp) ), abs( aimag( ff ) ) )
+           abssq( ff ) = real( ff,KIND=dp)**2 + aimag( ff )**2
+           ! Executable Statements 
+           ! if( first ) then
+              ! first = .false.
+              safmin = stdlib_dlamch( 'S' )
+              eps = stdlib_dlamch( 'E' )
+              safmn2 = stdlib_dlamch( 'B' )**int( log( safmin / eps ) /log( stdlib_dlamch( 'B' ) )&
+                         / two,KIND=ilp)
+              safmx2 = one / safmn2
+           ! end if
+           ix = 1
+           iy = 1
+           ic = 1
+           loop_60: do i = 1, n
+              f = x( ix )
+              g = y( iy )
+              ! use identical algorithm as in stdlib_zlartg
+              scale = max( abs1( f ), abs1( g ) )
+              fs = f
+              gs = g
+              count = 0
+              if( scale>=safmx2 ) then
+              10 continue
+                 count = count + 1
+                 fs = fs*safmn2
+                 gs = gs*safmn2
+                 scale = scale*safmn2
+                 if( scale>=safmx2 .and. count < 20 )go to 10
+              else if( scale<=safmn2 ) then
+                 if( g==czero ) then
+                    cs = one
+                    sn = czero
+                    r = f
+                    go to 50
+                 end if
+                 20 continue
+                 count = count - 1
+                 fs = fs*safmx2
+                 gs = gs*safmx2
+                 scale = scale*safmx2
+                 if( scale<=safmn2 )go to 20
+              end if
+              f2 = abssq( fs )
+              g2 = abssq( gs )
+              if( f2<=max( g2, one )*safmin ) then
+                 ! this is a rare case: f is very small.
+                 if( f==czero ) then
+                    cs = zero
+                    r = stdlib_dlapy2( real( g,KIND=dp), aimag( g ) )
+                    ! do complex/real division explicitly with two real
+                    ! divisions
+                    d = stdlib_dlapy2( real( gs,KIND=dp), aimag( gs ) )
+                    sn = cmplx( real( gs,KIND=dp) / d, -aimag( gs ) / d,KIND=dp)
+                    go to 50
+                 end if
+                 f2s = stdlib_dlapy2( real( fs,KIND=dp), aimag( fs ) )
+                 ! g2 and g2s are accurate
+                 ! g2 is at least safmin, and g2s is at least safmn2
+                 g2s = sqrt( g2 )
+                 ! error in cs from underflow in f2s is at most
+                 ! unfl / safmn2 .lt. sqrt(unfl*eps) .lt. eps
+                 ! if max(g2,one)=g2, then f2 .lt. g2*safmin,
+                 ! and so cs .lt. sqrt(safmin)
+                 ! if max(g2,one)=one, then f2 .lt. safmin
+                 ! and so cs .lt. sqrt(safmin)/safmn2 = sqrt(eps)
+                 ! therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s
+                 cs = f2s / g2s
+                 ! make sure abs(ff) = 1
+                 ! do complex/real division explicitly with 2 real divisions
+                 if( abs1( f )>one ) then
+                    d = stdlib_dlapy2( real( f,KIND=dp), aimag( f ) )
+                    ff = cmplx( real( f,KIND=dp) / d, aimag( f ) / d,KIND=dp)
+                 else
+                    dr = safmx2*real( f,KIND=dp)
+                    di = safmx2*aimag( f )
+                    d = stdlib_dlapy2( dr, di )
+                    ff = cmplx( dr / d, di / d,KIND=dp)
+                 end if
+                 sn = ff*cmplx( real( gs,KIND=dp) / g2s, -aimag( gs ) / g2s,KIND=dp)
+                 r = cs*f + sn*g
+              else
+                 ! this is the most common case.
+                 ! neither f2 nor f2/g2 are less than safmin
+                 ! f2s cannot overflow, and it is accurate
+                 f2s = sqrt( one+g2 / f2 )
+                 ! do the f2s(real)*fs(complex) multiply with two real
+                 ! multiplies
+                 r = cmplx( f2s*real( fs,KIND=dp), f2s*aimag( fs ),KIND=dp)
+                 cs = one / f2s
+                 d = f2 + g2
+                 ! do complex/real division explicitly with two real divisions
+                 sn = cmplx( real( r,KIND=dp) / d, aimag( r ) / d,KIND=dp)
+                 sn = sn*conjg( gs )
+                 if( count/=0 ) then
+                    if( count>0 ) then
+                       do j = 1, count
+                          r = r*safmx2
+                       end do
+                    else
+                       do j = 1, -count
+                          r = r*safmn2
+                       end do
+                    end if
+                 end if
+              end if
+              50 continue
+              c( ic ) = cs
+              y( iy ) = sn
+              x( ix ) = r
+              ic = ic + incc
+              iy = iy + incy
+              ix = ix + incx
+           end do loop_60
+           return
+     end subroutine stdlib_zlargv
+
+     !> ZLARRV: computes the eigenvectors of the tridiagonal matrix
+     !> T = L D L**T given L, D and APPROXIMATIONS to the eigenvalues of L D L**T.
+     !> The input eigenvalues should have been computed by DLARRE.
+
+     pure subroutine stdlib_zlarrv( n, vl, vu, d, l, pivmin,isplit, m, dol, dou, minrgp,rtol1, &
+               rtol2, w, werr, wgap,iblock, indexw, gers, z, ldz, isuppz,work, iwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: dol, dou, ldz, m, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(in) :: minrgp, pivmin, vl, vu
+           real(dp), intent(inout) :: rtol1, rtol2
+           ! Array Arguments 
+           integer(ilp), intent(in) :: iblock(*), indexw(*), isplit(*)
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), l(*), w(*), werr(*), wgap(*)
+           real(dp), intent(in) :: gers(*)
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxitr = 10
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: eskip, needbs, stp2ii, tryrqc, usedbs, usedrq
+           integer(ilp) :: done, i, ibegin, idone, iend, ii, iindc1, iindc2, iindr, iindwk, iinfo,&
+            im, in, indeig, indld, indlld, indwrk, isupmn, isupmx, iter, itmp1, j, jblk, k, &
+            miniwsize, minwsize, nclus, ndepth, negcnt, newcls, newfst, newftt, newlst, newsiz, &
+            offset, oldcls, oldfst, oldien, oldlst, oldncl, p, parity, q, wbegin, wend, windex, &
+                      windmn, windpl, zfrom, zto, zusedl, zusedu, zusedw
+           integer(ilp) :: indin1, indin2
+           real(dp) :: bstres, bstw, eps, fudge, gap, gaptol, gl, gu, lambda, left, lgap, mingma, &
+           nrminv, resid, rgap, right, rqcorr, rqtol, savgap, sgndef, sigma, spdiam, ssigma, tau, &
+                     tmp, tol, ztz
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,min
+           intrinsic :: cmplx
+           ! Executable Statements 
+           info = 0
+           ! quick return if possible
+           if( (n<=0).or.(m<=0) ) then
+              return
+           end if
+           ! the first n entries of work are reserved for the eigenvalues
+           indld = n+1
+           indlld= 2*n+1
+           indin1 = 3*n + 1
+           indin2 = 4*n + 1
+           indwrk = 5*n + 1
+           minwsize = 12 * n
+           do i= 1,minwsize
+              work( i ) = zero
+           end do
+           ! iwork(iindr+1:iindr+n) hold the twist indices r for the
+           ! factorization used to compute the fp vector
+           iindr = 0
+           ! iwork(iindc1+1:iinc2+n) are used to store the clusters of the current
+           ! layer and the one above.
+           iindc1 = n
+           iindc2 = 2*n
+           iindwk = 3*n + 1
+           miniwsize = 7 * n
+           do i= 1,miniwsize
+              iwork( i ) = 0
+           end do
+           zusedl = 1
+           if(dol>1) then
+              ! set lower bound for use of z
+              zusedl = dol-1
+           endif
+           zusedu = m
+           if(dou<m) then
+              ! set lower bound for use of z
+              zusedu = dou+1
+           endif
+           ! the width of the part of z that is used
+           zusedw = zusedu - zusedl + 1
+           call stdlib_zlaset( 'FULL', n, zusedw, czero, czero,z(1,zusedl), ldz )
+           eps = stdlib_dlamch( 'PRECISION' )
+           rqtol = two * eps
+           ! set expert flags for standard code.
+           tryrqc = .true.
+           if((dol==1).and.(dou==m)) then
+           else
+              ! only selected eigenpairs are computed. since the other evalues
+              ! are not refined by rq iteration, bisection has to compute to full
+              ! accuracy.
+              rtol1 = four * eps
+              rtol2 = four * eps
+           endif
+           ! the entries wbegin:wend in w, werr, wgap correspond to the
+           ! desired eigenvalues. the support of the nonzero eigenvector
+           ! entries is contained in the interval ibegin:iend.
+           ! remark that if k eigenpairs are desired, then the eigenvectors
+           ! are stored in k contiguous columns of z.
+           ! done is the number of eigenvectors already computed
+           done = 0
+           ibegin = 1
+           wbegin = 1
+           loop_170: do jblk = 1, iblock( m )
+              iend = isplit( jblk )
+              sigma = l( iend )
+              ! find the eigenvectors of the submatrix indexed ibegin
+              ! through iend.
+              wend = wbegin - 1
+              15 continue
+              if( wend<m ) then
+                 if( iblock( wend+1 )==jblk ) then
+                    wend = wend + 1
+                    go to 15
+                 end if
+              end if
+              if( wend<wbegin ) then
+                 ibegin = iend + 1
+                 cycle loop_170
+              elseif( (wend<dol).or.(wbegin>dou) ) then
+                 ibegin = iend + 1
+                 wbegin = wend + 1
+                 cycle loop_170
+              end if
+              ! find local spectral diameter of the block
+              gl = gers( 2*ibegin-1 )
+              gu = gers( 2*ibegin )
+              do i = ibegin+1 , iend
+                 gl = min( gers( 2*i-1 ), gl )
+                 gu = max( gers( 2*i ), gu )
+              end do
+              spdiam = gu - gl
+              ! oldien is the last index of the previous block
+              oldien = ibegin - 1
+              ! calculate the size of the current block
+              in = iend - ibegin + 1
+              ! the number of eigenvalues in the current block
+              im = wend - wbegin + 1
+              ! this is for a 1x1 block
+              if( ibegin==iend ) then
+                 done = done+1
+                 z( ibegin, wbegin ) = cmplx( one, zero,KIND=dp)
+                 isuppz( 2*wbegin-1 ) = ibegin
+                 isuppz( 2*wbegin ) = ibegin
+                 w( wbegin ) = w( wbegin ) + sigma
+                 work( wbegin ) = w( wbegin )
+                 ibegin = iend + 1
+                 wbegin = wbegin + 1
+                 cycle loop_170
+              end if
+              ! the desired (shifted) eigenvalues are stored in w(wbegin:wend)
+              ! note that these can be approximations, in this case, the corresp.
+              ! entries of werr give the size of the uncertainty interval.
+              ! the eigenvalue approximations will be refined when necessary as
+              ! high relative accuracy is required for the computation of the
+              ! corresponding eigenvectors.
+              call stdlib_dcopy( im, w( wbegin ), 1,work( wbegin ), 1 )
+              ! we store in w the eigenvalue approximations w.r.t. the original
+              ! matrix t.
+              do i=1,im
+                 w(wbegin+i-1) = w(wbegin+i-1)+sigma
+              end do
+              ! ndepth is the current depth of the representation tree
+              ndepth = 0
+              ! parity is either 1 or 0
+              parity = 1
+              ! nclus is the number of clusters for the next level of the
+              ! representation tree, we start with nclus = 1 for the root
+              nclus = 1
+              iwork( iindc1+1 ) = 1
+              iwork( iindc1+2 ) = im
+              ! idone is the number of eigenvectors already computed in the current
+              ! block
+              idone = 0
+              ! loop while( idone<im )
+              ! generate the representation tree for the current block and
+              ! compute the eigenvectors
+              40 continue
+              if( idone<im ) then
+                 ! this is a crude protection against infinitely deep trees
+                 if( ndepth>m ) then
+                    info = -2
+                    return
+                 endif
+                 ! breadth first processing of the current level of the representation
+                 ! tree: oldncl = number of clusters on current level
+                 oldncl = nclus
+                 ! reset nclus to count the number of child clusters
+                 nclus = 0
+                 parity = 1 - parity
+                 if( parity==0 ) then
+                    oldcls = iindc1
+                    newcls = iindc2
+                 else
+                    oldcls = iindc2
+                    newcls = iindc1
+                 end if
+                 ! process the clusters on the current level
+                 loop_150: do i = 1, oldncl
+                    j = oldcls + 2*i
+                    ! oldfst, oldlst = first, last index of current cluster.
+                                     ! cluster indices start with 1 and are relative
+                                     ! to wbegin when accessing w, wgap, werr, z
+                    oldfst = iwork( j-1 )
+                    oldlst = iwork( j )
+                    if( ndepth>0 ) then
+                       ! retrieve relatively robust representation (rrr) of cluster
+                       ! that has been computed at the previous level
+                       ! the rrr is stored in z and overwritten once the eigenvectors
+                       ! have been computed or when the cluster is refined
+                       if((dol==1).and.(dou==m)) then
+                          ! get representation from location of the leftmost evalue
+                          ! of the cluster
+                          j = wbegin + oldfst - 1
+                       else
+                          if(wbegin+oldfst-1<dol) then
+                             ! get representation from the left end of z array
+                             j = dol - 1
+                          elseif(wbegin+oldfst-1>dou) then
+                             ! get representation from the right end of z array
+                             j = dou
+                          else
+                             j = wbegin + oldfst - 1
+                          endif
+                       endif
+                       do k = 1, in - 1
+                          d( ibegin+k-1 ) = real( z( ibegin+k-1,j ),KIND=dp)
+                          l( ibegin+k-1 ) = real( z( ibegin+k-1,j+1 ),KIND=dp)
+                       end do
+                       d( iend ) = real( z( iend, j ),KIND=dp)
+                       sigma = real( z( iend, j+1 ),KIND=dp)
+                       ! set the corresponding entries in z to zero
+                       call stdlib_zlaset( 'FULL', in, 2, czero, czero,z( ibegin, j), ldz )
+                                 
+                    end if
+                    ! compute dl and dll of current rrr
+                    do j = ibegin, iend-1
+                       tmp = d( j )*l( j )
+                       work( indld-1+j ) = tmp
+                       work( indlld-1+j ) = tmp*l( j )
+                    end do
+                    if( ndepth>0 ) then
+                       ! p and q are index of the first and last eigenvalue to compute
+                       ! within the current block
+                       p = indexw( wbegin-1+oldfst )
+                       q = indexw( wbegin-1+oldlst )
+                       ! offset for the arrays work, wgap and werr, i.e., the p-offset
+                       ! through the q-offset elements of these arrays are to be used.
+                        ! offset = p-oldfst
+                       offset = indexw( wbegin ) - 1
+                       ! perform limited bisection (if necessary) to get approximate
+                       ! eigenvalues to the precision needed.
+                       call stdlib_dlarrb( in, d( ibegin ),work(indlld+ibegin-1),p, q, rtol1, &
+                       rtol2, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ), iwork(&
+                                  iindwk ),pivmin, spdiam, in, iinfo )
+                       if( iinfo/=0 ) then
+                          info = -1
+                          return
+                       endif
+                       ! we also recompute the extremal gaps. w holds all eigenvalues
+                       ! of the unshifted matrix and must be used for computation
+                       ! of wgap, the entries of work might stem from rrrs with
+                       ! different shifts. the gaps from wbegin-1+oldfst to
+                       ! wbegin-1+oldlst are correctly computed in stdlib_dlarrb.
+                       ! however, we only allow the gaps to become greater since
+                       ! this is what should happen when we decrease werr
+                       if( oldfst>1) then
+                          wgap( wbegin+oldfst-2 ) =max(wgap(wbegin+oldfst-2),w(wbegin+oldfst-1)-&
+                          werr(wbegin+oldfst-1)- w(wbegin+oldfst-2)-werr(wbegin+oldfst-2) )
+                                    
+                       endif
+                       if( wbegin + oldlst -1 < wend ) then
+                          wgap( wbegin+oldlst-1 ) =max(wgap(wbegin+oldlst-1),w(wbegin+oldlst)-&
+                                    werr(wbegin+oldlst)- w(wbegin+oldlst-1)-werr(wbegin+oldlst-1) )
+                       endif
+                       ! each time the eigenvalues in work get refined, we store
+                       ! the newly found approximation with all shifts applied in w
+                       do j=oldfst,oldlst
+                          w(wbegin+j-1) = work(wbegin+j-1)+sigma
+                       end do
+                    end if
+                    ! process the current node.
+                    newfst = oldfst
+                    loop_140: do j = oldfst, oldlst
+                       if( j==oldlst ) then
+                          ! we are at the right end of the cluster, this is also the
+                          ! boundary of the child cluster
+                          newlst = j
+                       else if ( wgap( wbegin + j -1)>=minrgp* abs( work(wbegin + j -1) ) ) &
+                                 then
+                          ! the right relative gap is big enough, the child cluster
+                          ! (newfst,..,newlst) is well separated from the following
+                          newlst = j
+                        else
+                          ! inside a child cluster, the relative gap is not
+                          ! big enough.
+                          cycle loop_140
+                       end if
+                       ! compute size of child cluster found
+                       newsiz = newlst - newfst + 1
+                       ! newftt is the place in z where the new rrr or the computed
+                       ! eigenvector is to be stored
+                       if((dol==1).and.(dou==m)) then
+                          ! store representation at location of the leftmost evalue
+                          ! of the cluster
+                          newftt = wbegin + newfst - 1
+                       else
+                          if(wbegin+newfst-1<dol) then
+                             ! store representation at the left end of z array
+                             newftt = dol - 1
+                          elseif(wbegin+newfst-1>dou) then
+                             ! store representation at the right end of z array
+                             newftt = dou
+                          else
+                             newftt = wbegin + newfst - 1
+                          endif
+                       endif
+                       if( newsiz>1) then
+                          ! current child is not a singleton but a cluster.
+                          ! compute and store new representation of child.
+                          ! compute left and right cluster gap.
+                          ! lgap and rgap are not computed from work because
+                          ! the eigenvalue approximations may stem from rrrs
+                          ! different shifts. however, w hold all eigenvalues
+                          ! of the unshifted matrix. still, the entries in wgap
+                          ! have to be computed from work since the entries
+                          ! in w might be of the same order so that gaps are not
+                          ! exhibited correctly for very close eigenvalues.
+                          if( newfst==1 ) then
+                             lgap = max( zero,w(wbegin)-werr(wbegin) - vl )
+                         else
+                             lgap = wgap( wbegin+newfst-2 )
+                          endif
+                          rgap = wgap( wbegin+newlst-1 )
+                          ! compute left- and rightmost eigenvalue of child
+                          ! to high precision in order to shift as close
+                          ! as possible and obtain as large relative gaps
+                          ! as possible
+                          do k =1,2
+                             if(k==1) then
+                                p = indexw( wbegin-1+newfst )
+                             else
+                                p = indexw( wbegin-1+newlst )
+                             endif
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_dlarrb( in, d(ibegin),work( indlld+ibegin-1 ),p,p,rqtol, &
+                             rqtol, offset,work(wbegin),wgap(wbegin),werr(wbegin),work( indwrk ),&
+                                       iwork( iindwk ), pivmin, spdiam,in, iinfo )
+                          end do
+                          if((wbegin+newlst-1<dol).or.(wbegin+newfst-1>dou)) then
+                             ! if the cluster contains no desired eigenvalues
+                             ! skip the computation of that branch of the rep. tree
+                             ! we could skip before the refinement of the extremal
+                             ! eigenvalues of the child, but then the representation
+                             ! tree could be different from the one when nothing is
+                             ! skipped. for this reason we skip at this place.
+                             idone = idone + newlst - newfst + 1
+                             goto 139
+                          endif
+                          ! compute rrr of child cluster.
+                          ! note that the new rrr is stored in z
+                          ! stdlib_dlarrf needs lwork = 2*n
+                          call stdlib_dlarrf( in, d( ibegin ), l( ibegin ),work(indld+ibegin-1),&
+                          newfst, newlst, work(wbegin),wgap(wbegin), werr(wbegin),spdiam, lgap, &
+                          rgap, pivmin, tau,work( indin1 ), work( indin2 ),work( indwrk ), iinfo )
+                                    
+                          ! in the complex case, stdlib_dlarrf cannot write
+                          ! the new rrr directly into z and needs an intermediate
+                          ! workspace
+                          do k = 1, in-1
+                             z( ibegin+k-1, newftt ) =cmplx( work( indin1+k-1 ), zero,KIND=dp)
+                                       
+                             z( ibegin+k-1, newftt+1 ) =cmplx( work( indin2+k-1 ), zero,KIND=dp)
+                                       
+                          end do
+                          z( iend, newftt ) =cmplx( work( indin1+in-1 ), zero,KIND=dp)
+                          if( iinfo==0 ) then
+                             ! a new rrr for the cluster was found by stdlib_dlarrf
+                             ! update shift and store it
+                             ssigma = sigma + tau
+                             z( iend, newftt+1 ) = cmplx( ssigma, zero,KIND=dp)
+                             ! work() are the midpoints and werr() the semi-width
+                             ! note that the entries in w are unchanged.
+                             do k = newfst, newlst
+                                fudge =three*eps*abs(work(wbegin+k-1))
+                                work( wbegin + k - 1 ) =work( wbegin + k - 1) - tau
+                                fudge = fudge +four*eps*abs(work(wbegin+k-1))
+                                ! fudge errors
+                                werr( wbegin + k - 1 ) =werr( wbegin + k - 1 ) + fudge
+                                ! gaps are not fudged. provided that werr is small
+                                ! when eigenvalues are close, a zero gap indicates
+                                ! that a new representation is needed for resolving
+                                ! the cluster. a fudge could lead to a wrong decision
+                                ! of judging eigenvalues 'separated' which in
+                                ! reality are not. this could have a negative impact
+                                ! on the orthogonality of the computed eigenvectors.
+                             end do
+                             nclus = nclus + 1
+                             k = newcls + 2*nclus
+                             iwork( k-1 ) = newfst
+                             iwork( k ) = newlst
+                          else
+                             info = -2
+                             return
+                          endif
+                       else
+                          ! compute eigenvector of singleton
+                          iter = 0
+                          tol = four * log(real(in,KIND=dp)) * eps
+                          k = newfst
+                          windex = wbegin + k - 1
+                          windmn = max(windex - 1,1)
+                          windpl = min(windex + 1,m)
+                          lambda = work( windex )
+                          done = done + 1
+                          ! check if eigenvector computation is to be skipped
+                          if((windex<dol).or.(windex>dou)) then
+                             eskip = .true.
+                             goto 125
+                          else
+                             eskip = .false.
+                          endif
+                          left = work( windex ) - werr( windex )
+                          right = work( windex ) + werr( windex )
+                          indeig = indexw( windex )
+                          ! note that since we compute the eigenpairs for a child,
+                          ! all eigenvalue approximations are w.r.t the same shift.
+                          ! in this case, the entries in work should be used for
+                          ! computing the gaps since they exhibit even very small
+                          ! differences in the eigenvalues, as opposed to the
+                          ! entries in w which might "look" the same.
+                          if( k == 1) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vl, the formula
+                             ! lgap = max( zero, (sigma - vl) + lambda )
+                             ! can lead to an overestimation of the left gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small left gap.
+                             lgap = eps*max(abs(left),abs(right))
+                          else
+                             lgap = wgap(windmn)
+                          endif
+                          if( k == im) then
+                             ! in the case range='i' and with not much initial
+                             ! accuracy in lambda and vu, the formula
+                             ! can lead to an overestimation of the right gap and
+                             ! thus to inadequately early rqi 'convergence'.
+                             ! prevent this by forcing a small right gap.
+                             rgap = eps*max(abs(left),abs(right))
+                          else
+                             rgap = wgap(windex)
+                          endif
+                          gap = min( lgap, rgap )
+                          if(( k == 1).or.(k == im)) then
+                             ! the eigenvector support can become wrong
+                             ! because significant entries could be cut off due to a
+                             ! large gaptol parameter in lar1v. prevent this.
+                             gaptol = zero
+                          else
+                             gaptol = gap * eps
+                          endif
+                          isupmn = in
+                          isupmx = 1
+                          ! update wgap so that it holds the minimum gap
+                          ! to the left or the right. this is crucial in the
+                          ! case where bisection is used to ensure that the
+                          ! eigenvalue is refined up to the required precision.
+                          ! the correct value is restored afterwards.
+                          savgap = wgap(windex)
+                          wgap(windex) = gap
+                          ! we want to use the rayleigh quotient correction
+                          ! as often as possible since it converges quadratically
+                          ! when we are close enough to the desired eigenvalue.
+                          ! however, the rayleigh quotient can have the wrong sign
+                          ! and lead us away from the desired eigenvalue. in this
+                          ! case, the best we can do is to use bisection.
+                          usedbs = .false.
+                          usedrq = .false.
+                          ! bisection is initially turned off unless it is forced
+                          needbs =  .not.tryrqc
+                          120 continue
+                          ! check if bisection should be used to refine eigenvalue
+                          if(needbs) then
+                             ! take the bisection as new iterate
+                             usedbs = .true.
+                             itmp1 = iwork( iindr+windex )
+                             offset = indexw( wbegin ) - 1
+                             call stdlib_dlarrb( in, d(ibegin),work(indlld+ibegin-1),indeig,&
+                             indeig,zero, two*eps, offset,work(wbegin),wgap(wbegin),werr(wbegin),&
+                                       work( indwrk ),iwork( iindwk ), pivmin, spdiam,itmp1, iinfo )
+                             if( iinfo/=0 ) then
+                                info = -3
+                                return
+                             endif
+                             lambda = work( windex )
+                             ! reset twist index from inaccurate lambda to
+                             ! force computation of true mingma
+                             iwork( iindr+windex ) = 0
+                          endif
+                          ! given lambda, compute the eigenvector.
+                          call stdlib_zlar1v( in, 1, in, lambda, d( ibegin ),l( ibegin ), work(&
+                          indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( ibegin, windex &
+                          ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+windex ), isuppz( &
+                                    2*windex-1 ),nrminv, resid, rqcorr, work( indwrk ) )
+                          if(iter == 0) then
+                             bstres = resid
+                             bstw = lambda
+                          elseif(resid<bstres) then
+                             bstres = resid
+                             bstw = lambda
+                          endif
+                          isupmn = min(isupmn,isuppz( 2*windex-1 ))
+                          isupmx = max(isupmx,isuppz( 2*windex ))
+                          iter = iter + 1
+                          ! sin alpha <= |resid|/gap
+                          ! note that both the residual and the gap are
+                          ! proportional to the matrix, so ||t|| doesn't play
+                          ! a role in the quotient
+                          ! convergence test for rayleigh-quotient iteration
+                          ! (omitted when bisection has been used)
+                          if( resid>tol*gap .and. abs( rqcorr )>rqtol*abs( lambda ) .and. .not. &
+                                    usedbs)then
+                             ! we need to check that the rqcorr update doesn't
+                             ! move the eigenvalue away from the desired one and
+                             ! towards a neighbor. -> protection with bisection
+                             if(indeig<=negcnt) then
+                                ! the wanted eigenvalue lies to the left
+                                sgndef = -one
+                             else
+                                ! the wanted eigenvalue lies to the right
+                                sgndef = one
+                             endif
+                             ! we only use the rqcorr if it improves the
+                             ! the iterate reasonably.
+                             if( ( rqcorr*sgndef>=zero ).and.( lambda + rqcorr<= right).and.( &
+                                       lambda + rqcorr>= left)) then
+                                usedrq = .true.
+                                ! store new midpoint of bisection interval in work
+                                if(sgndef==one) then
+                                   ! the current lambda is on the left of the true
+                                   ! eigenvalue
+                                   left = lambda
+                                   ! we prefer to assume that the error estimate
+                                   ! is correct. we could make the interval not
+                                   ! as a bracket but to be modified if the rqcorr
+                                   ! chooses to. in this case, the right side should
+                                   ! be modified as follows:
+                                    ! right = max(right, lambda + rqcorr)
+                                else
+                                   ! the current lambda is on the right of the true
+                                   ! eigenvalue
+                                   right = lambda
+                                   ! see comment about assuming the error estimate is
+                                   ! correct above.
+                                    ! left = min(left, lambda + rqcorr)
+                                endif
+                                work( windex ) =half * (right + left)
+                                ! take rqcorr since it has the correct sign and
+                                ! improves the iterate reasonably
+                                lambda = lambda + rqcorr
+                                ! update width of error interval
+                                werr( windex ) =half * (right-left)
+                             else
+                                needbs = .true.
+                             endif
+                             if(right-left<rqtol*abs(lambda)) then
+                                   ! the eigenvalue is computed to bisection accuracy
+                                   ! compute eigenvector and stop
+                                usedbs = .true.
+                                goto 120
+                             elseif( iter<maxitr ) then
+                                goto 120
+                             elseif( iter==maxitr ) then
+                                needbs = .true.
+                                goto 120
+                             else
+                                info = 5
+                                return
+                             end if
+                          else
+                             stp2ii = .false.
+             if(usedrq .and. usedbs .and.bstres<=resid) then
+                                lambda = bstw
+                                stp2ii = .true.
+                             endif
+                             if (stp2ii) then
+                                ! improve error angle by second step
+                                call stdlib_zlar1v( in, 1, in, lambda,d( ibegin ), l( ibegin ),&
+                                work(indld+ibegin-1),work(indlld+ibegin-1),pivmin, gaptol, z( &
+                                ibegin, windex ),.not.usedbs, negcnt, ztz, mingma,iwork( iindr+&
+                                windex ),isuppz( 2*windex-1 ),nrminv, resid, rqcorr, work( indwrk &
+                                          ) )
+                             endif
+                             work( windex ) = lambda
+                          end if
+                          ! compute fp-vector support w.r.t. whole matrix
+                          isuppz( 2*windex-1 ) = isuppz( 2*windex-1 )+oldien
+                          isuppz( 2*windex ) = isuppz( 2*windex )+oldien
+                          zfrom = isuppz( 2*windex-1 )
+                          zto = isuppz( 2*windex )
+                          isupmn = isupmn + oldien
+                          isupmx = isupmx + oldien
+                          ! ensure vector is ok if support in the rqi has changed
+                          if(isupmn<zfrom) then
+                             do ii = isupmn,zfrom-1
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          if(isupmx>zto) then
+                             do ii = zto+1,isupmx
+                                z( ii, windex ) = zero
+                             end do
+                          endif
+                          call stdlib_zdscal( zto-zfrom+1, nrminv,z( zfrom, windex ), 1 )
+                          125 continue
+                          ! update w
+                          w( windex ) = lambda+sigma
+                          ! recompute the gaps on the left and right
+                          ! but only allow them to become larger and not
+                          ! smaller (which can only happen through "bad"
+                          ! cancellation and doesn't reflect the theory
+                          ! where the initial gaps are underestimated due
+                          ! to werr being too crude.)
+                          if(.not.eskip) then
+                             if( k>1) then
+                                wgap( windmn ) = max( wgap(windmn),w(windex)-werr(windex)- w(&
+                                          windmn)-werr(windmn) )
+                             endif
+                             if( windex<wend ) then
+                                wgap( windex ) = max( savgap,w( windpl )-werr( windpl )- w( &
+                                          windex )-werr( windex) )
+                             endif
+                          endif
+                          idone = idone + 1
+                       endif
+                       ! here ends the code for the current child
+                       139 continue
+                       ! proceed to any remaining child nodes
+                       newfst = j + 1
+                    end do loop_140
+                 end do loop_150
+                 ndepth = ndepth + 1
+                 go to 40
+              end if
+              ibegin = iend + 1
+              wbegin = wend + 1
+           end do loop_170
+           return
+     end subroutine stdlib_zlarrv
+
+     !> ZLATDF: computes the contribution to the reciprocal Dif-estimate
+     !> by solving for x in Z * x = b, where b is chosen such that the norm
+     !> of x is as large as possible. It is assumed that LU decomposition
+     !> of Z has been computed by ZGETC2. On entry RHS = f holds the
+     !> contribution from earlier solved sub-systems, and on return RHS = x.
+     !> The factorization of Z returned by ZGETC2 has the form
+     !> Z = P * L * U * Q, where P and Q are permutation matrices. L is lower
+     !> triangular with unit diagonal elements and U is upper triangular.
+
+     pure subroutine stdlib_zlatdf( ijob, n, z, ldz, rhs, rdsum, rdscal, ipiv,jpiv )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ijob, ldz, n
+           real(dp), intent(inout) :: rdscal, rdsum
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*), jpiv(*)
+           complex(dp), intent(inout) :: rhs(*), z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxdim = 2
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, info, j, k
+           real(dp) :: rtemp, scale, sminu, splus
+           complex(dp) :: bm, bp, pmone, temp
+           ! Local Arrays 
+           real(dp) :: rwork(maxdim)
+           complex(dp) :: work(4*maxdim), xm(maxdim), xp(maxdim)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,sqrt
+           ! Executable Statements 
+           if( ijob/=2 ) then
+              ! apply permutations ipiv to rhs
+              call stdlib_zlaswp( 1, rhs, ldz, 1, n-1, ipiv, 1 )
+              ! solve for l-part choosing rhs either to +1 or -1.
+              pmone = -cone
+              loop_10: do j = 1, n - 1
+                 bp = rhs( j ) + cone
+                 bm = rhs( j ) - cone
+                 splus = one
+                 ! lockahead for l- part rhs(1:n-1) = +-1
+                 ! splus and smin computed more efficiently than in bsolve[1].
+                 splus = splus + real( stdlib_zdotc( n-j, z( j+1, j ), 1, z( j+1,j ), 1 ),KIND=dp)
+                           
+                 sminu = real( stdlib_zdotc( n-j, z( j+1, j ), 1, rhs( j+1 ), 1 ),KIND=dp)
+                 splus = splus*real( rhs( j ),KIND=dp)
+                 if( splus>sminu ) then
+                    rhs( j ) = bp
+                 else if( sminu>splus ) then
+                    rhs( j ) = bm
+                 else
+                    ! in this case the updating sums are equal and we can
+                    ! choose rhs(j) +1 or -1. the first time this happens we
+                    ! choose -1, thereafter +1. this is a simple way to get
+                    ! good estimates of matrices like byers well-known example
+                    ! (see [1]). (not done in bsolve.)
+                    rhs( j ) = rhs( j ) + pmone
+                    pmone = cone
+                 end if
+                 ! compute the remaining r.h.s.
+                 temp = -rhs( j )
+                 call stdlib_zaxpy( n-j, temp, z( j+1, j ), 1, rhs( j+1 ), 1 )
+              end do loop_10
+              ! solve for u- part, lockahead for rhs(n) = +-1. this is not done
+              ! in bsolve and will hopefully give us a better estimate because
+              ! any ill-conditioning of the original matrix is transferred to u
+              ! and not to l. u(n, n) is an approximation to sigma_min(lu).
+              call stdlib_zcopy( n-1, rhs, 1, work, 1 )
+              work( n ) = rhs( n ) + cone
+              rhs( n ) = rhs( n ) - cone
+              splus = zero
+              sminu = zero
+              do i = n, 1, -1
+                 temp = cone / z( i, i )
+                 work( i ) = work( i )*temp
+                 rhs( i ) = rhs( i )*temp
+                 do k = i + 1, n
+                    work( i ) = work( i ) - work( k )*( z( i, k )*temp )
+                    rhs( i ) = rhs( i ) - rhs( k )*( z( i, k )*temp )
+                 end do
+                 splus = splus + abs( work( i ) )
+                 sminu = sminu + abs( rhs( i ) )
+              end do
+              if( splus>sminu )call stdlib_zcopy( n, work, 1, rhs, 1 )
+              ! apply the permutations jpiv to the computed solution (rhs)
+              call stdlib_zlaswp( 1, rhs, ldz, 1, n-1, jpiv, -1 )
+              ! compute the sum of squares
+              call stdlib_zlassq( n, rhs, 1, rdscal, rdsum )
+              return
+           end if
+           ! entry ijob = 2
+           ! compute approximate nullvector xm of z
+           call stdlib_zgecon( 'I', n, z, ldz, one, rtemp, work, rwork, info )
+           call stdlib_zcopy( n, work( n+1 ), 1, xm, 1 )
+           ! compute rhs
+           call stdlib_zlaswp( 1, xm, ldz, 1, n-1, ipiv, -1 )
+           temp = cone / sqrt( stdlib_zdotc( n, xm, 1, xm, 1 ) )
+           call stdlib_zscal( n, temp, xm, 1 )
+           call stdlib_zcopy( n, xm, 1, xp, 1 )
+           call stdlib_zaxpy( n, cone, rhs, 1, xp, 1 )
+           call stdlib_zaxpy( n, -cone, xm, 1, rhs, 1 )
+           call stdlib_zgesc2( n, z, ldz, rhs, ipiv, jpiv, scale )
+           call stdlib_zgesc2( n, z, ldz, xp, ipiv, jpiv, scale )
+           if( stdlib_dzasum( n, xp, 1 )>stdlib_dzasum( n, rhs, 1 ) )call stdlib_zcopy( n, xp, 1, &
+                     rhs, 1 )
+           ! compute the sum of squares
+           call stdlib_zlassq( n, rhs, 1, rdscal, rdsum )
+           return
+     end subroutine stdlib_zlatdf
+
+     !> ZLAUNHR_COL_GETRFNP: computes the modified LU factorization without
+     !> pivoting of a complex general M-by-N matrix A. The factorization has
+     !> the form:
+     !> A - S = L * U,
+     !> where:
+     !> S is a m-by-n diagonal sign matrix with the diagonal D, so that
+     !> D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+     !> as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+     !> i-1 steps of Gaussian elimination. This means that the diagonal
+     !> element at each step of "modified" Gaussian elimination is
+     !> at least one in absolute value (so that division-by-zero not
+     !> not possible during the division by the diagonal element);
+     !> L is a M-by-N lower triangular matrix with unit diagonal elements
+     !> (lower trapezoidal if M > N);
+     !> and U is a M-by-N upper triangular matrix
+     !> (upper trapezoidal if M < N).
+     !> This routine is an auxiliary routine used in the Householder
+     !> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+     !> applied to an M-by-N matrix A with orthonormal columns, where each
+     !> element is bounded by one in absolute value. With the choice of
+     !> the matrix S above, one can show that the diagonal element at each
+     !> step of Gaussian elimination is the largest (in absolute value) in
+     !> the column on or below the diagonal, so that no pivoting is required
+     !> for numerical stability [1].
+     !> For more details on the Householder reconstruction algorithm,
+     !> including the modified LU factorization, see [1].
+     !> This is the blocked right-looking version of the algorithm,
+     !> calling Level 3 BLAS to update the submatrix. To factorize a block,
+     !> this routine calls the recursive routine ZLAUNHR_COL_GETRFNP2.
+     !> [1] "Reconstructing Householder vectors from tall-skinny QR",
+     !> G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+     !> E. Solomonik, J. Parallel Distrib. Comput.,
+     !> vol. 85, pp. 3-31, 2015.
+
+     pure subroutine stdlib_zlaunhr_col_getrfnp( m, n, a, lda, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: d(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAUNHR_COL_GETRFNP', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZLAUNHR_COL_GETRFNP', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_zlaunhr_col_getrfnp2( m, n, a, lda, d, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks.
+                 call stdlib_zlaunhr_col_getrfnp2( m-j+1, jb, a( j, j ), lda,d( j ), iinfo )
+                           
+                 if( j+jb<=n ) then
+                    ! compute block row of u.
+                    call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zlaunhr_col_getrfnp
+
+     !> ZPBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and banded, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_zpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b,ldb, x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, l, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldafb<kd+1 ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( n+1, 2*kd+2 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zhbmv( uplo, n, kd, -cone, ab, ldab, x( 1, j ), 1, cone,work, 1 )
+                        
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    l = kd + 1 - k
+                    do i = max( 1, k-kd ), k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ab( kd+1, k ),KIND=dp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ab( 1, k ),KIND=dp) )*xk
+                    l = 1 - k
+                    do i = k + 1, min( n, k+kd )
+                       rwork( i ) = rwork( i ) + cabs1( ab( l+i, k ) )*xk
+                       s = s + cabs1( ab( l+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_zpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zpbtrs( uplo, n, kd, 1, afb, ldafb, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zpbrfs
+
+     !> ZPBTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite band matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+
+     pure subroutine stdlib_zpbtrf( uplo, n, kd, ab, ldab, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ab(ldab,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 32
+           integer(ilp), parameter :: ldwork = nbmax+1
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, i2, i3, ib, ii, j, jj, nb
+           ! Local Arrays 
+           complex(dp) :: work(ldwork,nbmax)
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( ( .not.stdlib_lsame( uplo, 'U' ) ) .and.( .not.stdlib_lsame( uplo, 'L' ) ) ) &
+                     then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( ldab<kd+1 ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment
+           nb = stdlib_ilaenv( 1, 'ZPBTRF', uplo, n, kd, -1, -1 )
+           ! the block size must not exceed the semi-bandwidth kd, and must not
+           ! exceed the limit set by the size of the local array work.
+           nb = min( nb, nbmax )
+           if( nb<=1 .or. nb>kd ) then
+              ! use unblocked code
+              call stdlib_zpbtf2( uplo, n, kd, ab, ldab, info )
+           else
+              ! use blocked code
+              if( stdlib_lsame( uplo, 'U' ) ) then
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the upper triangle of the matrix in band
+                 ! storage.
+                 ! zero the upper triangle of the work array.
+                 do j = 1, nb
+                    do i = 1, j - 1
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_70: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_zpotf2( uplo, ib, ab( kd+1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11   a12   a13
+                                ! a22   a23
+                                      ! a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a12, a22 and
+                       ! a23 are empty if ib = kd. the upper triangle of a13
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a12
+                          call stdlib_ztrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i2, cone,ab( kd+1, i ), ldab-1,ab( kd+1-ib, i+ib ), ldab-1 )
+                          ! update a22
+                          call stdlib_zherk( 'UPPER', 'CONJUGATE TRANSPOSE', i2, ib,-one, ab( kd+&
+                                    1-ib, i+ib ), ldab-1, one,ab( kd+1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the lower triangle of a13 into the work array.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                work( ii, jj ) = ab( ii-jj+1, jj+i+kd-1 )
+                             end do
+                          end do
+                          ! update a13 (in the work array).
+                          call stdlib_ztrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', &
+                                    ib, i3, cone,ab( kd+1, i ), ldab-1, work, ldwork )
+                          ! update a23
+                          if( i2>0 )call stdlib_zgemm( 'CONJUGATE TRANSPOSE','NO TRANSPOSE', i2, &
+                          i3, ib, -cone,ab( kd+1-ib, i+ib ), ldab-1, work,ldwork, cone, ab( 1+ib, &
+                                    i+kd ),ldab-1 )
+                          ! update a33
+                          call stdlib_zherk( 'UPPER', 'CONJUGATE TRANSPOSE', i3, ib,-one, work, &
+                                    ldwork, one,ab( kd+1, i+kd ), ldab-1 )
+                          ! copy the lower triangle of a13 back into place.
+                          do jj = 1, i3
+                             do ii = jj, ib
+                                ab( ii-jj+1, jj+i+kd-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_70
+              else
+                 ! compute the cholesky factorization of a hermitian band
+                 ! matrix, given the lower triangle of the matrix in band
+                 ! storage.
+                 ! zero the lower triangle of the work array.
+                 do j = 1, nb
+                    do i = j + 1, nb
+                       work( i, j ) = zero
+                    end do
+                 end do
+                 ! process the band matrix one diagonal block at a time.
+                 loop_140: do i = 1, n, nb
+                    ib = min( nb, n-i+1 )
+                    ! factorize the diagonal block
+                    call stdlib_zpotf2( uplo, ib, ab( 1, i ), ldab-1, ii )
+                    if( ii/=0 ) then
+                       info = i + ii - 1
+                       go to 150
+                    end if
+                    if( i+ib<=n ) then
+                       ! update the relevant part of the trailing submatrix.
+                       ! if a11 denotes the diagonal block which has just been
+                       ! factorized, then we need to update the remaining
+                       ! blocks in the diagram:
+                          ! a11
+                          ! a21   a22
+                          ! a31   a32   a33
+                       ! the numbers of rows and columns in the partitioning
+                       ! are ib, i2, i3 respectively. the blocks a21, a22 and
+                       ! a32 are empty if ib = kd. the lower triangle of a31
+                       ! lies outside the band.
+                       i2 = min( kd-ib, n-i-ib+1 )
+                       i3 = min( ib, n-i-kd+1 )
+                       if( i2>0 ) then
+                          ! update a21
+                          call stdlib_ztrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i2,ib, cone, ab( 1, i ), ldab-1,ab( 1+ib, i ), ldab-1 )
+                          ! update a22
+                          call stdlib_zherk( 'LOWER', 'NO TRANSPOSE', i2, ib, -one,ab( 1+ib, i ), &
+                                    ldab-1, one,ab( 1, i+ib ), ldab-1 )
+                       end if
+                       if( i3>0 ) then
+                          ! copy the upper triangle of a31 into the work array.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                work( ii, jj ) = ab( kd+1-jj+ii, jj+i-1 )
+                             end do
+                          end do
+                          ! update a31 (in the work array).
+                          call stdlib_ztrsm( 'RIGHT', 'LOWER','CONJUGATE TRANSPOSE', 'NON-UNIT', &
+                                    i3,ib, cone, ab( 1, i ), ldab-1, work,ldwork )
+                          ! update a32
+                          if( i2>0 )call stdlib_zgemm( 'NO TRANSPOSE','CONJUGATE TRANSPOSE', i3, &
+                          i2, ib,-cone, work, ldwork, ab( 1+ib, i ),ldab-1, cone, ab( 1+kd-ib, i+&
+                                    ib ),ldab-1 )
+                          ! update a33
+                          call stdlib_zherk( 'LOWER', 'NO TRANSPOSE', i3, ib, -one,work, ldwork, &
+                                    one, ab( 1, i+kd ),ldab-1 )
+                          ! copy the upper triangle of a31 back into place.
+                          do jj = 1, ib
+                             do ii = 1, min( jj, i3 )
+                                ab( kd+1-jj+ii, jj+i-1 ) = work( ii, jj )
+                             end do
+                          end do
+                       end if
+                    end if
+                 end do loop_140
+              end if
+           end if
+           return
+           150 continue
+           return
+     end subroutine stdlib_zpbtrf
+
+     !> ZPFTRS: solves a system of linear equations A*X = B with a Hermitian
+     !> positive definite matrix A using the Cholesky factorization
+     !> A = U**H*U or A = L*L**H computed by ZPFTRF.
+
+     pure subroutine stdlib_zpftrs( transr, uplo, n, nrhs, a, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(0:*)
+           complex(dp), intent(inout) :: b(ldb,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, normaltransr
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! start execution: there are two triangular solves
+           if( lower ) then
+              call stdlib_ztfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_ztfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+           else
+              call stdlib_ztfsm( transr, 'L', uplo, 'C', 'N', n, nrhs, cone, a, b,ldb )
+              call stdlib_ztfsm( transr, 'L', uplo, 'N', 'N', n, nrhs, cone, a, b,ldb )
+           end if
+           return
+     end subroutine stdlib_zpftrs
+
+     !> ZPORFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite,
+     !> and provides error bounds and backward error estimates for the
+     !> solution.
+
+     pure subroutine stdlib_zporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x,ldx, ferr, berr, &
+               work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPORFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zhemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=dp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=dp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_zpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zpotrs( uplo, n, 1, af, ldaf, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zporfs
+
+     !> ZPOTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zpotrf( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZPOTRF', uplo, n, -1, -1, -1 )
+           if( nb<=1 .or. nb>=n ) then
+              ! use unblocked code.
+              call stdlib_zpotrf2( uplo, n, a, lda, info )
+           else
+              ! use blocked code.
+              if( upper ) then
+                 ! compute the cholesky factorization a = u**h *u.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_zherk( 'UPPER', 'CONJUGATE TRANSPOSE', jb, j-1,-one, a( 1, j ), &
+                              lda, one, a( j, j ), lda )
+                    call stdlib_zpotrf2( 'UPPER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block row.
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'NO TRANSPOSE', jb,n-j-jb+1, j-1,&
+                                  -cone, a( 1, j ), lda,a( 1, j+jb ), lda, cone, a( j, j+jb ),lda )
+                       call stdlib_ztrsm( 'LEFT', 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT', jb, &
+                                 n-j-jb+1, cone, a( j, j ),lda, a( j, j+jb ), lda )
+                    end if
+                 end do
+              else
+                 ! compute the cholesky factorization a = l*l**h.
+                 do j = 1, n, nb
+                    ! update and factorize the current diagonal block and test
+                    ! for non-positive-definiteness.
+                    jb = min( nb, n-j+1 )
+                    call stdlib_zherk( 'LOWER', 'NO TRANSPOSE', jb, j-1, -one,a( j, 1 ), lda, one,&
+                               a( j, j ), lda )
+                    call stdlib_zpotrf2( 'LOWER', jb, a( j, j ), lda, info )
+                    if( info/=0 )go to 30
+                    if( j+jb<=n ) then
+                       ! compute the current block column.
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j-jb+1, jb, j-1,&
+                                  -cone, a( j+jb, 1 ),lda, a( j, 1 ), lda, cone, a( j+jb, j ),lda )
+                       call stdlib_ztrsm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','NON-UNIT', n-j-&
+                                 jb+1, jb, cone, a( j, j ),lda, a( j+jb, j ), lda )
+                    end if
+                 end do
+              end if
+           end if
+           go to 40
+           30 continue
+           info = info + j - 1
+           40 continue
+           return
+     end subroutine stdlib_zpotrf
+
+     !> ZPOTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPOTRF.
+
+     pure subroutine stdlib_zpotri( uplo, n, a, lda, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ztrtri( uplo, 'NON-UNIT', n, a, lda, info )
+           if( info>0 )return
+           ! form inv(u) * inv(u)**h or inv(l)**h * inv(l).
+           call stdlib_zlauum( uplo, n, a, lda, info )
+           return
+     end subroutine stdlib_zpotri
+
+     !> ZPPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_zpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr,berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! ====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zhpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=dp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=dp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zpptrs( uplo, n, 1, afp, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_zpptrs( uplo, n, 1, afp, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zpptrs( uplo, n, 1, afp, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zpprfs
+
+     !> ZPPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_zppsv( uplo, n, nrhs, ap, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_zpptrf( uplo, n, ap, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zpptrs( uplo, n, nrhs, ap, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zppsv
+
+     !> ZPPSVX: uses the Cholesky factorization A = U**H * U or A = L * L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix stored in
+     !> packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zppsvx( fact, uplo, n, nrhs, ap, afp, equed, s, b, ldb,x, ldx, rcond, ferr,&
+                berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(inout) :: s(*)
+           complex(dp), intent(inout) :: afp(*), ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -7
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -8
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -10
+                 else if( ldx<max( 1, n ) ) then
+                    info = -12
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_zppequ( uplo, n, ap, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_zlaqhp( uplo, n, ap, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h * u or a = l * l**h.
+              call stdlib_zcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_zpptrf( uplo, n, afp, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zppcon( uplo, n, afp, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zpptrs( uplo, n, nrhs, afp, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_zpprfs( uplo, n, nrhs, ap, afp, b, ldb, x, ldx, ferr, berr,work, rwork, &
+                     info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zppsvx
+
+     !> ZPPTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPPTRF.
+
+     pure subroutine stdlib_zpptri( uplo, n, ap, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: j, jc, jj, jjn
+           real(dp) :: ajj
+           ! Intrinsic Functions 
+           intrinsic :: real
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPPTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ztptri( uplo, 'NON-UNIT', n, ap, info )
+           if( info>0 )return
+           if( upper ) then
+              ! compute the product inv(u) * inv(u)**h.
+              jj = 0
+              do j = 1, n
+                 jc = jj + 1
+                 jj = jj + j
+                 if( j>1 )call stdlib_zhpr( 'UPPER', j-1, one, ap( jc ), 1, ap )
+                 ajj = real( ap( jj ),KIND=dp)
+                 call stdlib_zdscal( j, ajj, ap( jc ), 1 )
+              end do
+           else
+              ! compute the product inv(l)**h * inv(l).
+              jj = 1
+              do j = 1, n
+                 jjn = jj + n - j + 1
+                 ap( jj ) = real( stdlib_zdotc( n-j+1, ap( jj ), 1, ap( jj ), 1 ),KIND=dp)
+                 if( j<n )call stdlib_ztpmv( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',n-j, ap( &
+                           jjn ), ap( jj+1 ), 1 )
+                 jj = jjn
+              end do
+           end if
+           return
+     end subroutine stdlib_zpptri
+
+     !> ZPTEQR: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric positive definite tridiagonal matrix by first factoring the
+     !> matrix using DPTTRF and then calling ZBDSQR to compute the singular
+     !> values of the bidiagonal factor.
+     !> This routine computes the eigenvalues of the positive definite
+     !> tridiagonal matrix to high relative accuracy.  This means that if the
+     !> eigenvalues range over many orders of magnitude in size, then the
+     !> small eigenvalues and corresponding eigenvectors will be computed
+     !> more accurately than, for example, with the standard QR method.
+     !> The eigenvectors of a full or band positive definite Hermitian matrix
+     !> can also be found if ZHETRD, ZHPTRD, or ZHBTRD has been used to
+     !> reduce this matrix to tridiagonal form.  (The reduction to
+     !> tridiagonal form, however, may preclude the possibility of obtaining
+     !> high relative accuracy in the small eigenvalues of the original
+     !> matrix, if these eigenvalues range over many orders of magnitude.)
+
+     pure subroutine stdlib_zpteqr( compz, n, d, e, z, ldz, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: z(ldz,*)
+        ! ====================================================================
+           
+           ! Local Arrays 
+           complex(dp) :: c(1,1), vt(1,1)
+           ! Local Scalars 
+           integer(ilp) :: i, icompz, nru
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or. ( icompz>0 .and. ldz<max( 1,n ) ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTEQR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz>0 )z( 1, 1 ) = cone
+              return
+           end if
+           if( icompz==2 )call stdlib_zlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! call stdlib_dpttrf to factor the matrix.
+           call stdlib_dpttrf( n, d, e, info )
+           if( info/=0 )return
+           do i = 1, n
+              d( i ) = sqrt( d( i ) )
+           end do
+           do i = 1, n - 1
+              e( i ) = e( i )*d( i )
+           end do
+           ! call stdlib_zbdsqr to compute the singular values/vectors of the
+           ! bidiagonal factor.
+           if( icompz>0 ) then
+              nru = n
+           else
+              nru = 0
+           end if
+           call stdlib_zbdsqr( 'LOWER', n, 0, nru, 0, d, e, vt, 1, z, ldz, c, 1,work, info )
+                     
+           ! square the singular values.
+           if( info==0 ) then
+              do i = 1, n
+                 d( i ) = d( i )*d( i )
+              end do
+           else
+              info = n + info
+           end if
+           return
+     end subroutine stdlib_zpteqr
+
+     !> ZPTTRS: solves a tridiagonal system of the form
+     !> A * X = B
+     !> using the factorization A = U**H *D* U or A = L*D*L**H computed by ZPTTRF.
+     !> D is a diagonal matrix specified in the vector D, U (or L) is a unit
+     !> bidiagonal matrix whose superdiagonal (subdiagonal) is specified in
+     !> the vector E, and X and B are N by NRHS matrices.
+
+     pure subroutine stdlib_zpttrs( uplo, n, nrhs, d, e, b, ldb, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(in) :: d(*)
+           complex(dp), intent(inout) :: b(ldb,*)
+           complex(dp), intent(in) :: e(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: iuplo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           upper = ( uplo=='U' .or. uplo=='U' )
+           if( .not.upper .and. .not.( uplo=='L' .or. uplo=='L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTTRS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 )return
+           ! determine the number of right-hand sides to solve at a time.
+           if( nrhs==1 ) then
+              nb = 1
+           else
+              nb = max( 1, stdlib_ilaenv( 1, 'ZPTTRS', uplo, n, nrhs, -1, -1 ) )
+           end if
+           ! decode uplo
+           if( upper ) then
+              iuplo = 1
+           else
+              iuplo = 0
+           end if
+           if( nb>=nrhs ) then
+              call stdlib_zptts2( iuplo, n, nrhs, d, e, b, ldb )
+           else
+              do j = 1, nrhs, nb
+                 jb = min( nrhs-j+1, nb )
+                 call stdlib_zptts2( iuplo, n, jb, d, e, b( 1, j ), ldb )
+              end do
+           end if
+           return
+     end subroutine stdlib_zpttrs
+
+     !> ZSPCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric packed matrix A using the
+     !> factorization A = U*D*U**T or A = L*D*L**T computed by ZSPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zspcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_zsptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zspcon
+
+     !> ZSPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_zsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zspmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk+k-1 ) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( ap( kk ) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_zsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zsptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zsprfs
+
+     !> ZSPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is symmetric and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zspsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_zsptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zsptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zspsv
+
+     !> ZSPSVX: uses the diagonal pivoting factorization A = U*D*U**T or
+     !> A = L*D*L**T to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zspsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(inout) :: afp(*)
+           complex(dp), intent(in) :: ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_zcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_zsptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlansp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zspcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zsptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zsprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zspsvx
+
+     !> ZSTEMR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> Depending on the number of desired eigenvalues, these are computed either
+     !> by bisection or the dqds algorithm. Numerically orthogonal eigenvectors are
+     !> computed by the use of various suitable L D L^T factorizations near clusters
+     !> of close eigenvalues (referred to as RRRs, Relatively Robust
+     !> Representations). An informal sketch of the algorithm follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> For more details, see:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Further Details
+     !> 1.ZSTEMR works only on machines which follow IEEE-754
+     !> floating-point standard in their handling of infinities and NaNs.
+     !> This permits the use of efficient inner loops avoiding a check for
+     !> zero divisors.
+     !> 2. LAPACK routines can be used to reduce a complex Hermitean matrix to
+     !> real symmetric tridiagonal form.
+     !> (Any complex Hermitean tridiagonal matrix has real values on its diagonal
+     !> and potentially complex numbers on its off-diagonals. By applying a
+     !> similarity transform with an appropriate diagonal matrix
+     !> diag(1,e^{i \phy_1}, ... , e^{i \phy_{n-1}}), the complex Hermitean
+     !> matrix can be transformed into a real symmetric matrix and complex
+     !> arithmetic can be entirely avoided.)
+     !> While the eigenvectors of the real symmetric tridiagonal matrix are real,
+     !> the eigenvectors of original complex Hermitean matrix have complex entries
+     !> in general.
+     !> Since LAPACK drivers overwrite the matrix data with the eigenvectors,
+     !> ZSTEMR accepts complex workspace to facilitate interoperability
+     !> with ZUNMTR or ZUPMTR.
+
+     pure subroutine stdlib_zstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, nzc, &
+               isuppz, tryrac, work, lwork,iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           logical(lk), intent(inout) :: tryrac
+           integer(ilp), intent(in) :: il, iu, ldz, nzc, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*)
+           complex(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: minrgp = 1.0e-3_dp
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, valeig, wantz, zquery
+           integer(ilp) :: i, ibegin, iend, ifirst, iil, iindbl, iindw, iindwk, iinfo, iinspl, &
+           iiu, ilast, in, indd, inde2, inderr, indgp, indgrs, indwrk, itmp, itmp2, j, jblk, jj, &
+                     liwmin, lwmin, nsplit, nzcmin, offset, wbegin, wend
+           real(dp) :: bignum, cs, eps, pivmin, r1, r2, rmax, rmin, rtol1, rtol2, safmin, scale, &
+                     smlnum, sn, thresh, tmp, tnrm, wl, wu
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ).or.( liwork==-1 ) )
+           zquery = ( nzc==-1 )
+           ! stdlib_dstemr needs work of size 6*n, iwork of size 3*n.
+           ! in addition, stdlib_dlarre needs work of size 6*n, iwork of size 5*n.
+           ! furthermore, stdlib_zlarrv needs work of size 12*n, iwork of size 7*n.
+           if( wantz ) then
+              lwmin = 18*n
+              liwmin = 10*n
+           else
+              ! need less workspace if only the eigenvalues are wanted
+              lwmin = 12*n
+              liwmin = 8*n
+           endif
+           wl = zero
+           wu = zero
+           iil = 0
+           iiu = 0
+           nsplit = 0
+           if( valeig ) then
+              ! we do not reference vl, vu in the cases range = 'i','a'
+              ! the interval (wl, wu] contains all the wanted eigenvalues.
+              ! it is either given by the user or computed in stdlib_dlarre.
+              wl = vl
+              wu = vu
+           elseif( indeig ) then
+              ! we do not reference il, iu in the cases range = 'v','a'
+              iil = il
+              iiu = iu
+           endif
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( valeig .and. n>0 .and. wu<=wl ) then
+              info = -7
+           else if( indeig .and. ( iil<1 .or. iil>n ) ) then
+              info = -8
+           else if( indeig .and. ( iiu<iil .or. iiu>n ) ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -13
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -17
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -19
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              iwork( 1 ) = liwmin
+              if( wantz .and. alleig ) then
+                 nzcmin = n
+              else if( wantz .and. valeig ) then
+                 call stdlib_dlarrc( 'T', n, vl, vu, d, e, safmin,nzcmin, itmp, itmp2, info )
+                           
+              else if( wantz .and. indeig ) then
+                 nzcmin = iiu-iil+1
+              else
+                 ! wantz == false.
+                 nzcmin = 0
+              endif
+              if( zquery .and. info==0 ) then
+                 z( 1,1 ) = nzcmin
+              else if( nzc<nzcmin .and. .not.zquery ) then
+                 info = -14
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEMR', -info )
+              return
+           else if( lquery .or. zquery ) then
+              return
+           end if
+           ! handle n = 0, 1, and 2 cases immediately
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = d( 1 )
+              else
+                 if( wl<d( 1 ) .and. wu>=d( 1 ) ) then
+                    m = 1
+                    w( 1 ) = d( 1 )
+                 end if
+              end if
+              if( wantz.and.(.not.zquery) ) then
+                 z( 1, 1 ) = one
+                 isuppz(1) = 1
+                 isuppz(2) = 1
+              end if
+              return
+           end if
+           if( n==2 ) then
+              if( .not.wantz ) then
+                 call stdlib_dlae2( d(1), e(1), d(2), r1, r2 )
+              else if( wantz.and.(.not.zquery) ) then
+                 call stdlib_dlaev2( d(1), e(1), d(2), r1, r2, cs, sn )
+              end if
+              if( alleig.or.(valeig.and.(r2>wl).and.(r2<=wu)).or.(indeig.and.(iil==1)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r2
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = -sn
+                    z( 2, m ) = cs
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+              if( alleig.or.(valeig.and.(r1>wl).and.(r1<=wu)).or.(indeig.and.(iiu==2)) ) &
+                        then
+                 m = m+1
+                 w( m ) = r1
+                 if( wantz.and.(.not.zquery) ) then
+                    z( 1, m ) = cs
+                    z( 2, m ) = sn
+                    ! note: at most one of sn and cs can be zero.
+                    if (sn/=zero) then
+                       if (cs/=zero) then
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 2
+                       else
+                          isuppz(2*m-1) = 1
+                          isuppz(2*m) = 1
+                       end if
+                    else
+                       isuppz(2*m-1) = 2
+                       isuppz(2*m) = 2
+                    end if
+                 endif
+              endif
+           else
+              ! continue with general n
+              indgrs = 1
+              inderr = 2*n + 1
+              indgp = 3*n + 1
+              indd = 4*n + 1
+              inde2 = 5*n + 1
+              indwrk = 6*n + 1
+              iinspl = 1
+              iindbl = n + 1
+              iindw = 2*n + 1
+              iindwk = 3*n + 1
+              ! scale matrix to allowable range, if necessary.
+              ! the allowable range is related to the pivmin parameter; see the
+              ! comments in stdlib_dlarrd.  the preference for scaling small values
+              ! up is heuristic; we expect users' matrices not to be close to the
+              ! rmax threshold.
+              scale = one
+              tnrm = stdlib_dlanst( 'M', n, d, e )
+              if( tnrm>zero .and. tnrm<rmin ) then
+                 scale = rmin / tnrm
+              else if( tnrm>rmax ) then
+                 scale = rmax / tnrm
+              end if
+              if( scale/=one ) then
+                 call stdlib_dscal( n, scale, d, 1 )
+                 call stdlib_dscal( n-1, scale, e, 1 )
+                 tnrm = tnrm*scale
+                 if( valeig ) then
+                    ! if eigenvalues in interval have to be found,
+                    ! scale (wl, wu] accordingly
+                    wl = wl*scale
+                    wu = wu*scale
+                 endif
+              end if
+              ! compute the desired eigenvalues of the tridiagonal after splitting
+              ! into smaller subblocks if the corresponding off-diagonal elements
+              ! are small
+              ! thresh is the splitting parameter for stdlib_dlarre
+              ! a negative thresh forces the old splitting criterion based on the
+              ! size of the off-diagonal. a positive thresh switches to splitting
+              ! which preserves relative accuracy.
+              if( tryrac ) then
+                 ! test whether the matrix warrants the more expensive relative approach.
+                 call stdlib_dlarrr( n, d, e, iinfo )
+              else
+                 ! the user does not care about relative accurately eigenvalues
+                 iinfo = -1
+              endif
+              ! set the splitting criterion
+              if (iinfo==0) then
+                 thresh = eps
+              else
+                 thresh = -eps
+                 ! relative accuracy is desired but t does not guarantee it
+                 tryrac = .false.
+              endif
+              if( tryrac ) then
+                 ! copy original diagonal, needed to guarantee relative accuracy
+                 call stdlib_dcopy(n,d,1,work(indd),1)
+              endif
+              ! store the squares of the offdiagonal values of t
+              do j = 1, n-1
+                 work( inde2+j-1 ) = e(j)**2
+              end do
+              ! set the tolerance parameters for bisection
+              if( .not.wantz ) then
+                 ! stdlib_dlarre computes the eigenvalues to full precision.
+                 rtol1 = four * eps
+                 rtol2 = four * eps
+              else
+                 ! stdlib_dlarre computes the eigenvalues to less than full precision.
+                 ! stdlib_zlarrv will refine the eigenvalue approximations, and we only
+                 ! need less accurate initial bisection in stdlib_dlarre.
+                 ! note: these settings do only affect the subset case and stdlib_dlarre
+                 rtol1 = sqrt(eps)
+                 rtol2 = max( sqrt(eps)*5.0e-3_dp, four * eps )
+              endif
+              call stdlib_dlarre( range, n, wl, wu, iil, iiu, d, e,work(inde2), rtol1, rtol2, &
+              thresh, nsplit,iwork( iinspl ), m, w, work( inderr ),work( indgp ), iwork( iindbl ),&
+              iwork( iindw ), work( indgrs ), pivmin,work( indwrk ), iwork( iindwk ), iinfo )
+                        
+              if( iinfo/=0 ) then
+                 info = 10 + abs( iinfo )
+                 return
+              end if
+              ! note that if range /= 'v', stdlib_dlarre computes bounds on the desired
+              ! part of the spectrum. all desired eigenvalues are contained in
+              ! (wl,wu]
+              if( wantz ) then
+                 ! compute the desired eigenvectors corresponding to the computed
+                 ! eigenvalues
+                 call stdlib_zlarrv( n, wl, wu, d, e,pivmin, iwork( iinspl ), m,1, m, minrgp, &
+                 rtol1, rtol2,w, work( inderr ), work( indgp ), iwork( iindbl ),iwork( iindw ), &
+                           work( indgrs ), z, ldz,isuppz, work( indwrk ), iwork( iindwk ), iinfo )
+                 if( iinfo/=0 ) then
+                    info = 20 + abs( iinfo )
+                    return
+                 end if
+              else
+                 ! stdlib_dlarre computes eigenvalues of the (shifted) root representation
+                 ! stdlib_zlarrv returns the eigenvalues of the unshifted matrix.
+                 ! however, if the eigenvectors are not desired by the user, we need
+                 ! to apply the corresponding shifts from stdlib_dlarre to obtain the
+                 ! eigenvalues of the original matrix.
+                 do j = 1, m
+                    itmp = iwork( iindbl+j-1 )
+                    w( j ) = w( j ) + e( iwork( iinspl+itmp-1 ) )
+                 end do
+              end if
+              if ( tryrac ) then
+                 ! refine computed eigenvalues so that they are relatively accurate
+                 ! with respect to the original matrix t.
+                 ibegin = 1
+                 wbegin = 1
+                 loop_39: do jblk = 1, iwork( iindbl+m-1 )
+                    iend = iwork( iinspl+jblk-1 )
+                    in = iend - ibegin + 1
+                    wend = wbegin - 1
+                    ! check if any eigenvalues have to be refined in this block
+                    36 continue
+                    if( wend<m ) then
+                       if( iwork( iindbl+wend )==jblk ) then
+                          wend = wend + 1
+                          go to 36
+                       end if
+                    end if
+                    if( wend<wbegin ) then
+                       ibegin = iend + 1
+                       cycle loop_39
+                    end if
+                    offset = iwork(iindw+wbegin-1)-1
+                    ifirst = iwork(iindw+wbegin-1)
+                    ilast = iwork(iindw+wend-1)
+                    rtol2 = four * eps
+                    call stdlib_dlarrj( in,work(indd+ibegin-1), work(inde2+ibegin-1),ifirst, &
+                    ilast, rtol2, offset, w(wbegin),work( inderr+wbegin-1 ),work( indwrk ), iwork(&
+                               iindwk ), pivmin,tnrm, iinfo )
+                    ibegin = iend + 1
+                    wbegin = wend + 1
+                 end do loop_39
+              endif
+              ! if matrix was scaled, then rescale eigenvalues appropriately.
+              if( scale/=one ) then
+                 call stdlib_dscal( m, one / scale, w, 1 )
+              end if
+           end if
+           ! if eigenvalues are not in increasing order, then sort them,
+           ! possibly along with eigenvectors.
+           if( nsplit>1 .or. n==2 ) then
+              if( .not. wantz ) then
+                 call stdlib_dlasrt( 'I', m, w, iinfo )
+                 if( iinfo/=0 ) then
+                    info = 3
+                    return
+                 end if
+              else
+                 do j = 1, m - 1
+                    i = 0
+                    tmp = w( j )
+                    do jj = j + 1, m
+                       if( w( jj )<tmp ) then
+                          i = jj
+                          tmp = w( jj )
+                       end if
+                    end do
+                    if( i/=0 ) then
+                       w( i ) = w( j )
+                       w( j ) = tmp
+                       if( wantz ) then
+                          call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                          itmp = isuppz( 2*i-1 )
+                          isuppz( 2*i-1 ) = isuppz( 2*j-1 )
+                          isuppz( 2*j-1 ) = itmp
+                          itmp = isuppz( 2*i )
+                          isuppz( 2*i ) = isuppz( 2*j )
+                          isuppz( 2*j ) = itmp
+                       end if
+                    end if
+                 end do
+              end if
+           endif
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zstemr
+
+     !> ZSYCON: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSYTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zsycon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_zsytrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zsycon
+
+     !> ZSYCON_ROOK: estimates the reciprocal of the condition number (in the
+     !> 1-norm) of a complex symmetric matrix A using the factorization
+     !> A = U*D*U**T or A = L*D*L**T computed by ZSYTRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zsycon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYCON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==czero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**t) or inv(u*d*u**t).
+              call stdlib_zsytrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zsycon_rook
+
+     !> ZSYRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is symmetric indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_zsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zsymv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + cabs1( a( k, k ) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**t).
+                    call stdlib_zsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zsytrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zsyrfs
+
+     !> ZSYSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zsysv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_zsytrf( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=dp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_zsytrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_zsytrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_zsytrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsysv
+
+     !> ZSYSV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N symmetric matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**T)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**T)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**T (or L**T) is the transpose of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is symmetric and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZSYTRF_RK is called to compute the factorization of a complex
+     !> symmetric matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine ZSYTRS_3.
+
+     pure subroutine stdlib_zsysv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_zsytrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=dp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**t)*(p**t) or
+           ! a = p*u*d*(u**t)*(p**t).
+           call stdlib_zsytrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_zsytrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsysv_rk
+
+     !> ZSYSV_ROOK: computes the solution to a complex system of linear
+     !> equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is symmetric and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZSYTRF_ROOK is called to compute the factorization of a complex
+     !> symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling ZSYTRS_ROOK.
+
+     pure subroutine stdlib_zsysv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_zsytrf_rook( uplo, n, a, lda, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=dp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+           call stdlib_zsytrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs_rook ( use level 2 blas)
+              call stdlib_zsytrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsysv_rook
+
+     !> ZSYSVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zsysvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: af(ldaf,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'ZSYTRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**t or a = l*d*l**t.
+              call stdlib_zlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_zsytrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlansy( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zsycon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zsytrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zsyrfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsysvx
+
+     !> ZTBCON: estimates the reciprocal of the condition number of a
+     !> triangular band matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ztbcon( norm, uplo, diag, n, kd, ab, ldab, rcond, work,rwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTBCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( n, 1 ),KIND=dp)
+           ! compute the 1-norm of the triangular matrix a or a**h.
+           anorm = stdlib_zlantb( norm, uplo, diag, n, kd, ab, ldab, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the 1-norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_zlatbs( uplo, 'NO TRANSPOSE', diag, normin, n, kd,ab, ldab, work, &
+                              scale, rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_zlatbs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, kd, ab, ldab,&
+                               work, scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_izamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_zdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ztbcon
+
+     !> ZTFTRI: computes the inverse of a triangular matrix A stored in RFP
+     !> format.
+     !> This is a Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_ztftri( transr, uplo, diag, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo, diag
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( diag, 'N' ) .and. .not.stdlib_lsame( diag, 'U' ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_ztrtri( 'L', diag, n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'L', 'N', diag, n2, n1, -cone, a( 0 ),n, a( n1 ), n )
+                              
+                    call stdlib_ztrtri( 'U', diag, n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'U', 'C', diag, n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_ztrtri( 'L', diag, n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'L', 'C', diag, n1, n2, -cone, a( n2 ),n, a( 0 ), n )
+                              
+                    call stdlib_ztrtri( 'U', diag, n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'U', 'N', diag, n1, n2, cone, a( n1 ),n, a( 0 ), n )
+                              
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_ztrtri( 'U', diag, n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'U', 'N', diag, n1, n2, -cone, a( 0 ),n1, a( n1*n1 ), &
+                              n1 )
+                    call stdlib_ztrtri( 'L', diag, n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'L', 'C', diag, n1, n2, cone, a( 1 ),n1, a( n1*n1 ), &
+                              n1 )
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_ztrtri( 'U', diag, n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'U', 'C', diag, n2, n1, -cone,a( n2*n2 ), n2, a( 0 ), &
+                              n2 )
+                    call stdlib_ztrtri( 'L', diag, n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'L', 'N', diag, n2, n1, cone,a( n1*n2 ), n2, a( 0 ), &
+                              n2 )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_ztrtri( 'L', diag, k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'L', 'N', diag, k, k, -cone, a( 1 ),n+1, a( k+1 ), n+&
+                              1 )
+                    call stdlib_ztrtri( 'U', diag, k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'U', 'C', diag, k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 &
+                              )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_ztrtri( 'L', diag, k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'L', 'C', diag, k, k, -cone, a( k+1 ),n+1, a( 0 ), n+&
+                              1 )
+                    call stdlib_ztrtri( 'U', diag, k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'U', 'N', diag, k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_ztrtri( 'U', diag, k, a( k ), k, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'U', 'N', diag, k, k, -cone, a( k ), k,a( k*( k+1 ) ),&
+                               k )
+                    call stdlib_ztrtri( 'L', diag, k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'L', 'C', diag, k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_ztrtri( 'U', diag, k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'R', 'U', 'C', diag, k, k, -cone,a( k*( k+1 ) ), k, a( 0 ),&
+                               k )
+                    call stdlib_ztrtri( 'L', diag, k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                    if( info>0 )return
+                    call stdlib_ztrmm( 'L', 'L', 'N', diag, k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_ztftri
+
+     !> ZTGSJA: computes the generalized singular value decomposition (GSVD)
+     !> of two complex upper triangular (or trapezoidal) matrices A and B.
+     !> On entry, it is assumed that matrices A and B have the following
+     !> forms, which may be obtained by the preprocessing subroutine ZGGSVP
+     !> from a general M-by-N matrix A and P-by-N matrix B:
+     !> N-K-L  K    L
+     !> A =    K ( 0    A12  A13 ) if M-K-L >= 0;
+     !> L ( 0     0   A23 )
+     !> M-K-L ( 0     0    0  )
+     !> N-K-L  K    L
+     !> A =  K ( 0    A12  A13 ) if M-K-L < 0;
+     !> M-K ( 0     0   A23 )
+     !> N-K-L  K    L
+     !> B =  L ( 0     0   B13 )
+     !> P-L ( 0     0    0  )
+     !> where the K-by-K matrix A12 and L-by-L matrix B13 are nonsingular
+     !> upper triangular; A23 is L-by-L upper triangular if M-K-L >= 0,
+     !> otherwise A23 is (M-K)-by-L upper trapezoidal.
+     !> On exit,
+     !> U**H *A*Q = D1*( 0 R ),    V**H *B*Q = D2*( 0 R ),
+     !> where U, V and Q are unitary matrices.
+     !> R is a nonsingular upper triangular matrix, and D1
+     !> and D2 are ``diagonal'' matrices, which are of the following
+     !> structures:
+     !> If M-K-L >= 0,
+     !> K  L
+     !> D1 =     K ( I  0 )
+     !> L ( 0  C )
+     !> M-K-L ( 0  0 )
+     !> K  L
+     !> D2 = L   ( 0  S )
+     !> P-L ( 0  0 )
+     !> N-K-L  K    L
+     !> ( 0 R ) = K (  0   R11  R12 ) K
+     !> L (  0    0   R22 ) L
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(K+L) ),
+     !> S = diag( BETA(K+1),  ... , BETA(K+L) ),
+     !> C**2 + S**2 = I.
+     !> R is stored in A(1:K+L,N-K-L+1:N) on exit.
+     !> If M-K-L < 0,
+     !> K M-K K+L-M
+     !> D1 =   K ( I  0    0   )
+     !> M-K ( 0  C    0   )
+     !> K M-K K+L-M
+     !> D2 =   M-K ( 0  S    0   )
+     !> K+L-M ( 0  0    I   )
+     !> P-L ( 0  0    0   )
+     !> N-K-L  K   M-K  K+L-M
+     !> ( 0 R ) =    K ( 0    R11  R12  R13  )
+     !> M-K ( 0     0   R22  R23  )
+     !> K+L-M ( 0     0    0   R33  )
+     !> where
+     !> C = diag( ALPHA(K+1), ... , ALPHA(M) ),
+     !> S = diag( BETA(K+1),  ... , BETA(M) ),
+     !> C**2 + S**2 = I.
+     !> R = ( R11 R12 R13 ) is stored in A(1:M, N-K-L+1:N) and R33 is stored
+     !> (  0  R22 R23 )
+     !> in B(M-K+1:L,N+M-K-L+1:N) on exit.
+     !> The computation of the unitary transformation matrices U, V or Q
+     !> is optional.  These matrices may either be formed explicitly, or they
+     !> may be postmultiplied into input matrices U1, V1, or Q1.
+
+     pure subroutine stdlib_ztgsja( jobu, jobv, jobq, m, p, n, k, l, a, lda, b,ldb, tola, tolb, &
+               alpha, beta, u, ldu, v, ldv,q, ldq, work, ncycle, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobq, jobu, jobv
+           integer(ilp), intent(out) :: info, ncycle
+           integer(ilp), intent(in) :: k, l, lda, ldb, ldq, ldu, ldv, m, n, p
+           real(dp), intent(in) :: tola, tolb
+           ! Array Arguments 
+           real(dp), intent(out) :: alpha(*), beta(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), u(ldu,*), v(ldv,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: maxit = 40
+           real(dp), parameter :: hugenum = huge(zero)
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, initu, initv, upper, wantq, wantu, wantv
+           integer(ilp) :: i, j, kcycle
+           real(dp) :: a1, a3, b1, b3, csq, csu, csv, error, gamma, rwk, ssmin
+           complex(dp) :: a2, b2, snq, snu, snv
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min,huge
+           ! Executable Statements 
+           ! decode and test the input parameters
+           initu = stdlib_lsame( jobu, 'I' )
+           wantu = initu .or. stdlib_lsame( jobu, 'U' )
+           initv = stdlib_lsame( jobv, 'I' )
+           wantv = initv .or. stdlib_lsame( jobv, 'V' )
+           initq = stdlib_lsame( jobq, 'I' )
+           wantq = initq .or. stdlib_lsame( jobq, 'Q' )
+           info = 0
+           if( .not.( initu .or. wantu .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -1
+           else if( .not.( initv .or. wantv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -2
+           else if( .not.( initq .or. wantq .or. stdlib_lsame( jobq, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( p<0 ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, m ) ) then
+              info = -10
+           else if( ldb<max( 1, p ) ) then
+              info = -12
+           else if( ldu<1 .or. ( wantu .and. ldu<m ) ) then
+              info = -18
+           else if( ldv<1 .or. ( wantv .and. ldv<p ) ) then
+              info = -20
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -22
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSJA', -info )
+              return
+           end if
+           ! initialize u, v and q, if necessary
+           if( initu )call stdlib_zlaset( 'FULL', m, m, czero, cone, u, ldu )
+           if( initv )call stdlib_zlaset( 'FULL', p, p, czero, cone, v, ldv )
+           if( initq )call stdlib_zlaset( 'FULL', n, n, czero, cone, q, ldq )
+           ! loop until convergence
+           upper = .false.
+           loop_40: do kcycle = 1, maxit
+              upper = .not.upper
+              loop_20: do i = 1, l - 1
+                 loop_10: do j = i + 1, l
+                    a1 = zero
+                    a2 = czero
+                    a3 = zero
+                    if( k+i<=m )a1 = real( a( k+i, n-l+i ),KIND=dp)
+                    if( k+j<=m )a3 = real( a( k+j, n-l+j ),KIND=dp)
+                    b1 = real( b( i, n-l+i ),KIND=dp)
+                    b3 = real( b( j, n-l+j ),KIND=dp)
+                    if( upper ) then
+                       if( k+i<=m )a2 = a( k+i, n-l+j )
+                       b2 = b( i, n-l+j )
+                    else
+                       if( k+j<=m )a2 = a( k+j, n-l+i )
+                       b2 = b( j, n-l+i )
+                    end if
+                    call stdlib_zlags2( upper, a1, a2, a3, b1, b2, b3, csu, snu,csv, snv, csq, &
+                              snq )
+                    ! update (k+i)-th and (k+j)-th rows of matrix a: u**h *a
+                    if( k+j<=m )call stdlib_zrot( l, a( k+j, n-l+1 ), lda, a( k+i, n-l+1 ),lda, &
+                              csu, conjg( snu ) )
+                    ! update i-th and j-th rows of matrix b: v**h *b
+                    call stdlib_zrot( l, b( j, n-l+1 ), ldb, b( i, n-l+1 ), ldb,csv, conjg( snv ) &
+                              )
+                    ! update (n-l+i)-th and (n-l+j)-th columns of matrices
+                    ! a and b: a*q and b*q
+                    call stdlib_zrot( min( k+l, m ), a( 1, n-l+j ), 1,a( 1, n-l+i ), 1, csq, snq )
+                              
+                    call stdlib_zrot( l, b( 1, n-l+j ), 1, b( 1, n-l+i ), 1, csq,snq )
+                    if( upper ) then
+                       if( k+i<=m )a( k+i, n-l+j ) = czero
+                       b( i, n-l+j ) = czero
+                    else
+                       if( k+j<=m )a( k+j, n-l+i ) = czero
+                       b( j, n-l+i ) = czero
+                    end if
+                    ! ensure that the diagonal elements of a and b are real.
+                    if( k+i<=m )a( k+i, n-l+i ) = real( a( k+i, n-l+i ),KIND=dp)
+                    if( k+j<=m )a( k+j, n-l+j ) = real( a( k+j, n-l+j ),KIND=dp)
+                    b( i, n-l+i ) = real( b( i, n-l+i ),KIND=dp)
+                    b( j, n-l+j ) = real( b( j, n-l+j ),KIND=dp)
+                    ! update unitary matrices u, v, q, if desired.
+                    if( wantu .and. k+j<=m )call stdlib_zrot( m, u( 1, k+j ), 1, u( 1, k+i ), 1, &
+                              csu,snu )
+                    if( wantv )call stdlib_zrot( p, v( 1, j ), 1, v( 1, i ), 1, csv, snv )
+                    if( wantq )call stdlib_zrot( n, q( 1, n-l+j ), 1, q( 1, n-l+i ), 1, csq,snq )
+                              
+                 end do loop_10
+              end do loop_20
+              if( .not.upper ) then
+                 ! the matrices a13 and b13 were lower triangular at the start
+                 ! of the cycle, and are now upper triangular.
+                 ! convergence test: test the parallelism of the corresponding
+                 ! rows of a and b.
+                 error = zero
+                 do i = 1, min( l, m-k )
+                    call stdlib_zcopy( l-i+1, a( k+i, n-l+i ), lda, work, 1 )
+                    call stdlib_zcopy( l-i+1, b( i, n-l+i ), ldb, work( l+1 ), 1 )
+                    call stdlib_zlapll( l-i+1, work, 1, work( l+1 ), 1, ssmin )
+                    error = max( error, ssmin )
+                 end do
+                 if( abs( error )<=min( tola, tolb ) )go to 50
+              end if
+              ! end of cycle loop
+           end do loop_40
+           ! the algorithm has not converged after maxit cycles.
+           info = 1
+           go to 100
+           50 continue
+           ! if error <= min(tola,tolb), then the algorithm has converged.
+           ! compute the generalized singular value pairs (alpha, beta), and
+           ! set the triangular matrix r to array a.
+           do i = 1, k
+              alpha( i ) = one
+              beta( i ) = zero
+           end do
+           do i = 1, min( l, m-k )
+              a1 = real( a( k+i, n-l+i ),KIND=dp)
+              b1 = real( b( i, n-l+i ),KIND=dp)
+              gamma = b1 / a1
+              if( (gamma<=hugenum).and.(gamma>=-hugenum) ) then
+                 if( gamma<zero ) then
+                    call stdlib_zdscal( l-i+1, -one, b( i, n-l+i ), ldb )
+                    if( wantv )call stdlib_zdscal( p, -one, v( 1, i ), 1 )
+                 end if
+                 call stdlib_dlartg( abs( gamma ), one, beta( k+i ), alpha( k+i ),rwk )
+                 if( alpha( k+i )>=beta( k+i ) ) then
+                    call stdlib_zdscal( l-i+1, one / alpha( k+i ), a( k+i, n-l+i ),lda )
+                 else
+                    call stdlib_zdscal( l-i+1, one / beta( k+i ), b( i, n-l+i ),ldb )
+                    call stdlib_zcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+                 end if
+              else
+                 alpha( k+i ) = zero
+                 beta( k+i ) = one
+                 call stdlib_zcopy( l-i+1, b( i, n-l+i ), ldb, a( k+i, n-l+i ),lda )
+              end if
+           end do
+           ! post-assignment
+           do i = m + 1, k + l
+              alpha( i ) = zero
+              beta( i ) = one
+           end do
+           if( k+l<n ) then
+              do i = k + l + 1, n
+                 alpha( i ) = zero
+                 beta( i ) = zero
+              end do
+           end if
+           100 continue
+           ncycle = kcycle
+           return
+     end subroutine stdlib_ztgsja
+
+     !> ZTGSY2: solves the generalized Sylvester equation
+     !> A * R - L * B = scale * C               (1)
+     !> D * R - L * E = scale * F
+     !> using Level 1 and 2 BLAS, where R and L are unknown M-by-N matrices,
+     !> (A, D), (B, E) and (C, F) are given matrix pairs of size M-by-M,
+     !> N-by-N and M-by-N, respectively. A, B, D and E are upper triangular
+     !> (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1 is an output
+     !> scaling factor chosen to avoid overflow.
+     !> In matrix notation solving equation (1) corresponds to solve
+     !> Zx = scale * b, where Z is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]             (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X.
+     !> kron(X, Y) is the Kronecker product between the matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R  + D**H * L   = scale * C           (3)
+     !> R  * B**H + L  * E**H  = scale * -F
+     !> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
+     !> = sigma_min(Z) using reverse communication with ZLACON.
+     !> ZTGSY2 also (IJOB >= 1) contributes to the computation in ZTGSYL
+     !> of an upper bound on the separation between to matrix pairs. Then
+     !> the input (A, D), (B, E) are sub-pencils of two matrix pairs in
+     !> ZTGSYL.
+
+     pure subroutine stdlib_ztgsy2( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, rdsum, rdscal,info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, m, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(inout) :: rdscal, rdsum
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(dp), intent(inout) :: c(ldc,*), f(ldf,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ldz = 2
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           integer(ilp) :: i, ierr, j, k
+           real(dp) :: scaloc
+           complex(dp) :: alpha
+           ! Local Arrays 
+           integer(ilp) :: ipiv(ldz), jpiv(ldz)
+           complex(dp) :: rhs(ldz), z(ldz,ldz)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           ierr = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>2 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSY2', -info )
+              return
+           end if
+           if( notran ) then
+              ! solve (i, j) - system
+                 ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                 ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+              ! for i = m, m - 1, ..., 1; j = 1, 2, ..., n
+              scale = one
+              scaloc = one
+              loop_30: do j = 1, n
+                 loop_20: do i = m, 1, -1
+                    ! build 2 by 2 system
+                    z( 1, 1 ) = a( i, i )
+                    z( 2, 1 ) = d( i, i )
+                    z( 1, 2 ) = -b( j, j )
+                    z( 2, 2 ) = -e( j, j )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z * x = rhs
+                    call stdlib_zgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    if( ijob==0 ) then
+                       call stdlib_zgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                       if( scaloc/=one ) then
+                          do k = 1, n
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),c( 1, k ), 1 )
+                                       
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),f( 1, k ), 1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                    else
+                       call stdlib_zlatdf( ijob, ldz, z, ldz, rhs, rdsum, rdscal,ipiv, jpiv )
+                                 
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    if( i>1 ) then
+                       alpha = -rhs( 1 )
+                       call stdlib_zaxpy( i-1, alpha, a( 1, i ), 1, c( 1, j ), 1 )
+                       call stdlib_zaxpy( i-1, alpha, d( 1, i ), 1, f( 1, j ), 1 )
+                    end if
+                    if( j<n ) then
+                       call stdlib_zaxpy( n-j, rhs( 2 ), b( j, j+1 ), ldb,c( i, j+1 ), ldc )
+                                 
+                       call stdlib_zaxpy( n-j, rhs( 2 ), e( j, j+1 ), lde,f( i, j+1 ), ldf )
+                                 
+                    end if
+                 end do loop_20
+              end do loop_30
+           else
+              ! solve transposed (i, j) - system:
+                 ! a(i, i)**h * r(i, j) + d(i, i)**h * l(j, j) = c(i, j)
+                 ! r(i, i) * b(j, j) + l(i, j) * e(j, j)   = -f(i, j)
+              ! for i = 1, 2, ..., m, j = n, n - 1, ..., 1
+              scale = one
+              scaloc = one
+              loop_80: do i = 1, m
+                 loop_70: do j = n, 1, -1
+                    ! build 2 by 2 system z**h
+                    z( 1, 1 ) = conjg( a( i, i ) )
+                    z( 2, 1 ) = -conjg( b( j, j ) )
+                    z( 1, 2 ) = conjg( d( i, i ) )
+                    z( 2, 2 ) = -conjg( e( j, j ) )
+                    ! set up right hand side(s)
+                    rhs( 1 ) = c( i, j )
+                    rhs( 2 ) = f( i, j )
+                    ! solve z**h * x = rhs
+                    call stdlib_zgetc2( ldz, z, ldz, ipiv, jpiv, ierr )
+                    if( ierr>0 )info = ierr
+                    call stdlib_zgesc2( ldz, z, ldz, rhs, ipiv, jpiv, scaloc )
+                    if( scaloc/=one ) then
+                       do k = 1, n
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), c( 1, k ),1 )
+                                    
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! unpack solution vector(s)
+                    c( i, j ) = rhs( 1 )
+                    f( i, j ) = rhs( 2 )
+                    ! substitute r(i, j) and l(i, j) into remaining equation.
+                    do k = 1, j - 1
+                       f( i, k ) = f( i, k ) + rhs( 1 )*conjg( b( k, j ) ) +rhs( 2 )*conjg( e( k, &
+                                 j ) )
+                    end do
+                    do k = i + 1, m
+                       c( k, j ) = c( k, j ) - conjg( a( i, k ) )*rhs( 1 ) -conjg( d( i, k ) )&
+                                 *rhs( 2 )
+                    end do
+                 end do loop_70
+              end do loop_80
+           end if
+           return
+     end subroutine stdlib_ztgsy2
+
+     !> ZTGSYL: solves the generalized Sylvester equation:
+     !> A * R - L * B = scale * C            (1)
+     !> D * R - L * E = scale * F
+     !> where R and L are unknown m-by-n matrices, (A, D), (B, E) and
+     !> (C, F) are given matrix pairs of size m-by-m, n-by-n and m-by-n,
+     !> respectively, with complex entries. A, B, D and E are upper
+     !> triangular (i.e., (A,D) and (B,E) in generalized Schur form).
+     !> The solution (R, L) overwrites (C, F). 0 <= SCALE <= 1
+     !> is an output scaling factor chosen to avoid overflow.
+     !> In matrix notation (1) is equivalent to solve Zx = scale*b, where Z
+     !> is defined as
+     !> Z = [ kron(In, A)  -kron(B**H, Im) ]        (2)
+     !> [ kron(In, D)  -kron(E**H, Im) ],
+     !> Here Ix is the identity matrix of size x and X**H is the conjugate
+     !> transpose of X. Kron(X, Y) is the Kronecker product between the
+     !> matrices X and Y.
+     !> If TRANS = 'C', y in the conjugate transposed system Z**H *y = scale*b
+     !> is solved for, which is equivalent to solve for R and L in
+     !> A**H * R + D**H * L = scale * C           (3)
+     !> R * B**H + L * E**H = scale * -F
+     !> This case (TRANS = 'C') is used to compute an one-norm-based estimate
+     !> of Dif[(A,D), (B,E)], the separation between the matrix pairs (A,D)
+     !> and (B,E), using ZLACON.
+     !> If IJOB >= 1, ZTGSYL computes a Frobenius norm-based estimate of
+     !> Dif[(A,D),(B,E)]. That is, the reciprocal of a lower bound on the
+     !> reciprocal of the smallest singular value of Z.
+     !> This is a level-3 BLAS algorithm.
+
+     pure subroutine stdlib_ztgsyl( trans, ijob, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f, &
+               ldf, scale, dif, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldc, ldd, lde, ldf, lwork, m, n
+           integer(ilp), intent(out) :: info
+           real(dp), intent(out) :: dif, scale
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*), d(ldd,*), e(lde,*)
+           complex(dp), intent(inout) :: c(ldc,*), f(ldf,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+        ! replaced various illegal calls to stdlib_ccopy by calls to stdlib_claset.
+        ! sven hammarling, 1/5/02.
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, notran
+           integer(ilp) :: i, ie, ifunc, iround, is, isolve, j, je, js, k, linfo, lwmin, mb, nb, &
+                     p, pq, q
+           real(dp) :: dscale, dsum, scale2, scaloc
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max,sqrt
+           ! Executable Statements 
+           ! decode and test input parameters
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -1
+           else if( notran ) then
+              if( ( ijob<0 ) .or. ( ijob>4 ) ) then
+                 info = -2
+              end if
+           end if
+           if( info==0 ) then
+              if( m<=0 ) then
+                 info = -3
+              else if( n<=0 ) then
+                 info = -4
+              else if( lda<max( 1, m ) ) then
+                 info = -6
+              else if( ldb<max( 1, n ) ) then
+                 info = -8
+              else if( ldc<max( 1, m ) ) then
+                 info = -10
+              else if( ldd<max( 1, m ) ) then
+                 info = -12
+              else if( lde<max( 1, n ) ) then
+                 info = -14
+              else if( ldf<max( 1, m ) ) then
+                 info = -16
+              end if
+           end if
+           if( info==0 ) then
+              if( notran ) then
+                 if( ijob==1 .or. ijob==2 ) then
+                    lwmin = max( 1, 2*m*n )
+                 else
+                    lwmin = 1
+                 end if
+              else
+                 lwmin = 1
+              end if
+              work( 1 ) = lwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSYL', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              scale = 1
+              if( notran ) then
+                 if( ijob/=0 ) then
+                    dif = 0
+                 end if
+              end if
+              return
+           end if
+           ! determine  optimal block sizes mb and nb
+           mb = stdlib_ilaenv( 2, 'ZTGSYL', trans, m, n, -1, -1 )
+           nb = stdlib_ilaenv( 5, 'ZTGSYL', trans, m, n, -1, -1 )
+           isolve = 1
+           ifunc = 0
+           if( notran ) then
+              if( ijob>=3 ) then
+                 ifunc = ijob - 2
+                 call stdlib_zlaset( 'F', m, n, czero, czero, c, ldc )
+                 call stdlib_zlaset( 'F', m, n, czero, czero, f, ldf )
+              else if( ijob>=1 .and. notran ) then
+                 isolve = 2
+              end if
+           end if
+           if( ( mb<=1 .and. nb<=1 ) .or. ( mb>=m .and. nb>=n ) )then
+              ! use unblocked level 2 solver
+              loop_30: do iround = 1, isolve
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 pq = m*n
+                 call stdlib_ztgsy2( trans, ifunc, m, n, a, lda, b, ldb, c, ldc, d,ldd, e, lde, f,&
+                            ldf, scale, dsum, dscale,info )
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_zlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_zlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_zlaset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_zlaset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_zlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_zlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_30
+              return
+           end if
+           ! determine block structure of a
+           p = 0
+           i = 1
+           40 continue
+           if( i>m )go to 50
+           p = p + 1
+           iwork( p ) = i
+           i = i + mb
+           if( i>=m )go to 50
+           go to 40
+           50 continue
+           iwork( p+1 ) = m + 1
+           if( iwork( p )==iwork( p+1 ) )p = p - 1
+           ! determine block structure of b
+           q = p + 1
+           j = 1
+           60 continue
+           if( j>n )go to 70
+           q = q + 1
+           iwork( q ) = j
+           j = j + nb
+           if( j>=n )go to 70
+           go to 60
+           70 continue
+           iwork( q+1 ) = n + 1
+           if( iwork( q )==iwork( q+1 ) )q = q - 1
+           if( notran ) then
+              loop_150: do iround = 1, isolve
+                 ! solve (i, j) - subsystem
+                     ! a(i, i) * r(i, j) - l(i, j) * b(j, j) = c(i, j)
+                     ! d(i, i) * r(i, j) - l(i, j) * e(j, j) = f(i, j)
+                 ! for i = p, p - 1, ..., 1; j = 1, 2, ..., q
+                 pq = 0
+                 scale = one
+                 dscale = zero
+                 dsum = one
+                 loop_130: do j = p + 2, q
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    loop_120: do i = p, 1, -1
+                       is = iwork( i )
+                       ie = iwork( i+1 ) - 1
+                       mb = ie - is + 1
+                       call stdlib_ztgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), &
+                       ldb, c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, &
+                                 scaloc, dsum, dscale,linfo )
+                       if( linfo>0 )info = linfo
+                       pq = pq + mb*nb
+                       if( scaloc/=one ) then
+                          do k = 1, js - 1
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),c( 1, k ), 1 )
+                                       
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),f( 1, k ), 1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_zscal( is-1, cmplx( scaloc, zero,KIND=dp),c( 1, k ), 1 )
+                                       
+                             call stdlib_zscal( is-1, cmplx( scaloc, zero,KIND=dp),f( 1, k ), 1 )
+                                       
+                          end do
+                          do k = js, je
+                             call stdlib_zscal( m-ie, cmplx( scaloc, zero,KIND=dp),c( ie+1, k ), &
+                                       1 )
+                             call stdlib_zscal( m-ie, cmplx( scaloc, zero,KIND=dp),f( ie+1, k ), &
+                                       1 )
+                          end do
+                          do k = je + 1, n
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),c( 1, k ), 1 )
+                                       
+                             call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp),f( 1, k ), 1 )
+                                       
+                          end do
+                          scale = scale*scaloc
+                       end if
+                       ! substitute r(i,j) and l(i,j) into remaining equation.
+                       if( i>1 ) then
+                          call stdlib_zgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=dp), a(&
+                           1, is ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=dp),c( 1, js ), &
+                                     ldc )
+                          call stdlib_zgemm( 'N', 'N', is-1, nb, mb,cmplx( -one, zero,KIND=dp), d(&
+                           1, is ), ldd,c( is, js ), ldc, cmplx( one, zero,KIND=dp),f( 1, js ), &
+                                     ldf )
+                       end if
+                       if( j<q ) then
+                          call stdlib_zgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=dp), f( &
+                          is, js ), ldf,b( js, je+1 ), ldb,cmplx( one, zero,KIND=dp), c( is, je+1 &
+                                    ),ldc )
+                          call stdlib_zgemm( 'N', 'N', mb, n-je, nb,cmplx( one, zero,KIND=dp), f( &
+                          is, js ), ldf,e( js, je+1 ), lde,cmplx( one, zero,KIND=dp), f( is, je+1 &
+                                    ),ldf )
+                       end if
+                    end do loop_120
+                 end do loop_130
+                 if( dscale/=zero ) then
+                    if( ijob==1 .or. ijob==3 ) then
+                       dif = sqrt( real( 2*m*n,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    else
+                       dif = sqrt( real( pq,KIND=dp) ) / ( dscale*sqrt( dsum ) )
+                    end if
+                 end if
+                 if( isolve==2 .and. iround==1 ) then
+                    if( notran ) then
+                       ifunc = ijob
+                    end if
+                    scale2 = scale
+                    call stdlib_zlacpy( 'F', m, n, c, ldc, work, m )
+                    call stdlib_zlacpy( 'F', m, n, f, ldf, work( m*n+1 ), m )
+                    call stdlib_zlaset( 'F', m, n, czero, czero, c, ldc )
+                    call stdlib_zlaset( 'F', m, n, czero, czero, f, ldf )
+                 else if( isolve==2 .and. iround==2 ) then
+                    call stdlib_zlacpy( 'F', m, n, work, m, c, ldc )
+                    call stdlib_zlacpy( 'F', m, n, work( m*n+1 ), m, f, ldf )
+                    scale = scale2
+                 end if
+              end do loop_150
+           else
+              ! solve transposed (i, j)-subsystem
+                  ! a(i, i)**h * r(i, j) + d(i, i)**h * l(i, j) = c(i, j)
+                  ! r(i, j) * b(j, j)  + l(i, j) * e(j, j) = -f(i, j)
+              ! for i = 1,2,..., p; j = q, q-1,..., 1
+              scale = one
+              loop_210: do i = 1, p
+                 is = iwork( i )
+                 ie = iwork( i+1 ) - 1
+                 mb = ie - is + 1
+                 loop_200: do j = q, p + 2, -1
+                    js = iwork( j )
+                    je = iwork( j+1 ) - 1
+                    nb = je - js + 1
+                    call stdlib_ztgsy2( trans, ifunc, mb, nb, a( is, is ), lda,b( js, js ), ldb, &
+                    c( is, js ), ldc,d( is, is ), ldd, e( js, js ), lde,f( is, js ), ldf, scaloc, &
+                              dsum, dscale,linfo )
+                    if( linfo>0 )info = linfo
+                    if( scaloc/=one ) then
+                       do k = 1, js - 1
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), c( 1, k ),1 )
+                                    
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), f( 1, k ),1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_zscal( is-1, cmplx( scaloc, zero,KIND=dp),c( 1, k ), 1 )
+                                    
+                          call stdlib_zscal( is-1, cmplx( scaloc, zero,KIND=dp),f( 1, k ), 1 )
+                                    
+                       end do
+                       do k = js, je
+                          call stdlib_zscal( m-ie, cmplx( scaloc, zero,KIND=dp),c( ie+1, k ), 1 )
+                                    
+                          call stdlib_zscal( m-ie, cmplx( scaloc, zero,KIND=dp),f( ie+1, k ), 1 )
+                                    
+                       end do
+                       do k = je + 1, n
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), c( 1, k ),1 )
+                                    
+                          call stdlib_zscal( m, cmplx( scaloc, zero,KIND=dp), f( 1, k ),1 )
+                                    
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    ! substitute r(i,j) and l(i,j) into remaining equation.
+                    if( j>p+2 ) then
+                       call stdlib_zgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=dp), c( is,&
+                        js ), ldc,b( 1, js ), ldb, cmplx( one, zero,KIND=dp),f( is, 1 ), ldf )
+                                  
+                       call stdlib_zgemm( 'N', 'C', mb, js-1, nb,cmplx( one, zero,KIND=dp), f( is,&
+                        js ), ldf,e( 1, js ), lde, cmplx( one, zero,KIND=dp),f( is, 1 ), ldf )
+                                  
+                    end if
+                    if( i<p ) then
+                       call stdlib_zgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=dp), a( &
+                       is, ie+1 ), lda,c( is, js ), ldc, cmplx( one, zero,KIND=dp),c( ie+1, js ), &
+                                 ldc )
+                       call stdlib_zgemm( 'C', 'N', m-ie, nb, mb,cmplx( -one, zero,KIND=dp), d( &
+                       is, ie+1 ), ldd,f( is, js ), ldf, cmplx( one, zero,KIND=dp),c( ie+1, js ), &
+                                 ldc )
+                    end if
+                 end do loop_200
+              end do loop_210
+           end if
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ztgsyl
+
+     !> ZTPCON: estimates the reciprocal of the condition number of a packed
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ztpcon( norm, uplo, diag, n, ap, rcond, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=dp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_zlantp( norm, uplo, diag, n, ap, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_zlatps( uplo, 'NO TRANSPOSE', diag, normin, n, ap,work, scale, &
+                              rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_zlatps( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, ap, work, &
+                              scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_izamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_zdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ztpcon
+
+     !> ZTPLQT: computes a blocked LQ factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_ztplqt( m, n, l, mb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, mb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, nb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( mb<1 .or. (mb>m .and. m>0)) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<mb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPLQT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, m, mb
+           ! compute the qr factorization of the current block
+              ib = min( m-i+1, mb )
+              nb = min( n-l+i+ib-1, n )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = nb-n+l-i+1
+              end if
+              call stdlib_ztplqt2( ib, nb, lb, a(i,i), lda, b( i, 1 ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**t to b(i+ib:m,:) from the right
+              if( i+ib<=m ) then
+                 call stdlib_ztprfb( 'R', 'N', 'F', 'R', m-i-ib+1, nb, ib, lb,b( i, 1 ), ldb, t( &
+                           1, i ), ldt,a( i+ib, i ), lda, b( i+ib, 1 ), ldb,work, m-i-ib+1)
+              end if
+           end do
+           return
+     end subroutine stdlib_ztplqt
+
+     !> ZTPMLQT: applies a complex unitary matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_ztpmlqt( side, trans, m, n, k, l, mb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, mb, ldt
+           ! Array Arguments 
+           complex(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, nb, lb, kf, ldaq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( mb<1 .or. (mb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<k ) then
+              info = -9
+           else if( ldt<mb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPMLQT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. notran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_ztprfb( 'L', 'C', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              do i = 1, k, mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'R', 'N', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. tran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = 0
+                 end if
+                 call stdlib_ztprfb( 'L', 'N', 'F', 'R', nb, n, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              kf = ((k-1)/mb)*mb+1
+              do i = kf, 1, -mb
+                 ib = min( mb, k-i+1 )
+                 nb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = nb-n+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'R', 'C', 'F', 'R', m, nb, ib, lb,v( i, 1 ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_ztpmlqt
+
+     !> ZTPMQRT: applies a complex orthogonal matrix Q obtained from a
+     !> "triangular-pentagonal" complex block reflector H to a general
+     !> complex matrix C, which consists of two blocks A and B.
+
+     pure subroutine stdlib_ztpmqrt( side, trans, m, n, k, l, nb, v, ldv, t, ldt,a, lda, b, ldb, &
+               work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, ldv, lda, ldb, m, n, l, nb, ldt
+           ! Array Arguments 
+           complex(dp), intent(in) :: v(ldv,*), t(ldt,*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran
+           integer(ilp) :: i, ib, mb, lb, kf, ldaq, ldvq
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! Test The Input Arguments 
+           info   = 0
+           left   = stdlib_lsame( side,  'L' )
+           right  = stdlib_lsame( side,  'R' )
+           tran   = stdlib_lsame( trans, 'C' )
+           notran = stdlib_lsame( trans, 'N' )
+           if ( left ) then
+              ldvq = max( 1, m )
+              ldaq = max( 1, k )
+           else if ( right ) then
+              ldvq = max( 1, n )
+              ldaq = max( 1, m )
+           end if
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( k<0 ) then
+              info = -5
+           else if( l<0 .or. l>k ) then
+              info = -6
+           else if( nb<1 .or. (nb>k .and. k>0) ) then
+              info = -7
+           else if( ldv<ldvq ) then
+              info = -9
+           else if( ldt<nb ) then
+              info = -11
+           else if( lda<ldaq ) then
+              info = -13
+           else if( ldb<max( 1, m ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPMQRT', -info )
+              return
+           end if
+           ! Quick Return If Possible 
+           if( m==0 .or. n==0 .or. k==0 ) return
+           if( left .and. tran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'L', 'C', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. notran ) then
+              do i = 1, k, nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'R', 'N', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           else if( left .and. notran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( m-l+i+ib-1, m )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-m+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'L', 'N', 'F', 'C', mb, n, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( i, 1 ), lda, b, ldb, work, ib )
+              end do
+           else if( right .and. tran ) then
+              kf = ((k-1)/nb)*nb+1
+              do i = kf, 1, -nb
+                 ib = min( nb, k-i+1 )
+                 mb = min( n-l+i+ib-1, n )
+                 if( i>=l ) then
+                    lb = 0
+                 else
+                    lb = mb-n+l-i+1
+                 end if
+                 call stdlib_ztprfb( 'R', 'C', 'F', 'C', m, mb, ib, lb,v( 1, i ), ldv, t( 1, i ), &
+                           ldt,a( 1, i ), lda, b, ldb, work, m )
+              end do
+           end if
+           return
+     end subroutine stdlib_ztpmqrt
+
+     !> ZTPQRT: computes a blocked QR factorization of a complex
+     !> "triangular-pentagonal" matrix C, which is composed of a
+     !> triangular block A and pentagonal block B, using the compact
+     !> WY representation for Q.
+
+     pure subroutine stdlib_ztpqrt( m, n, l, nb, a, lda, b, ldb, t, ldt, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldt, n, m, l, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, lb, mb, iinfo
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( l<0 .or. (l>min(m,n) .and. min(m,n)>=0)) then
+              info = -3
+           else if( nb<1 .or. (nb>n .and. n>0)) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, m ) ) then
+              info = -8
+           else if( ldt<nb ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTPQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) return
+           do i = 1, n, nb
+           ! compute the qr factorization of the current block
+              ib = min( n-i+1, nb )
+              mb = min( m-l+i+ib-1, m )
+              if( i>=l ) then
+                 lb = 0
+              else
+                 lb = mb-m+l-i+1
+              end if
+              call stdlib_ztpqrt2( mb, ib, lb, a(i,i), lda, b( 1, i ), ldb,t(1, i ), ldt, iinfo )
+                        
+           ! update by applying h**h to b(:,i+ib:n) from the left
+              if( i+ib<=n ) then
+                 call stdlib_ztprfb( 'L', 'C', 'F', 'C', mb, n-i-ib+1, ib, lb,b( 1, i ), ldb, t( &
+                           1, i ), ldt,a( i, i+ib ), lda, b( 1, i+ib ), ldb,work, ib )
+              end if
+           end do
+           return
+     end subroutine stdlib_ztpqrt
+
+     !> ZTRCON: estimates the reciprocal of the condition number of a
+     !> triangular matrix A, in either the 1-norm or the infinity-norm.
+     !> The norm of A is computed and an estimate is obtained for
+     !> norm(inv(A)), then the reciprocal of the condition number is
+     !> computed as
+     !> RCOND = 1 / ( norm(A) * norm(inv(A)) ).
+
+     subroutine stdlib_ztrcon( norm, uplo, diag, n, a, lda, rcond, work,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: diag, norm, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nounit, onenrm, upper
+           character :: normin
+           integer(ilp) :: ix, kase, kase1
+           real(dp) :: ainvnm, anorm, scale, smlnum, xnorm
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           nounit = stdlib_lsame( diag, 'N' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.nounit .and. .not.stdlib_lsame( diag, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRCON', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              rcond = one
+              return
+           end if
+           rcond = zero
+           smlnum = stdlib_dlamch( 'SAFE MINIMUM' )*real( max( 1, n ),KIND=dp)
+           ! compute the norm of the triangular matrix a.
+           anorm = stdlib_zlantr( norm, uplo, diag, n, n, a, lda, rwork )
+           ! continue only if anorm > 0.
+           if( anorm>zero ) then
+              ! estimate the norm of the inverse of a.
+              ainvnm = zero
+              normin = 'N'
+              if( onenrm ) then
+                 kase1 = 1
+              else
+                 kase1 = 2
+              end if
+              kase = 0
+              10 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==kase1 ) then
+                    ! multiply by inv(a).
+                    call stdlib_zlatrs( uplo, 'NO TRANSPOSE', diag, normin, n, a,lda, work, scale,&
+                               rwork, info )
+                 else
+                    ! multiply by inv(a**h).
+                    call stdlib_zlatrs( uplo, 'CONJUGATE TRANSPOSE', diag, normin,n, a, lda, work,&
+                               scale, rwork, info )
+                 end if
+                 normin = 'Y'
+                 ! multiply by 1/scale if doing so will not cause overflow.
+                 if( scale/=one ) then
+                    ix = stdlib_izamax( n, work, 1 )
+                    xnorm = cabs1( work( ix ) )
+                    if( scale<xnorm*smlnum .or. scale==zero )go to 20
+                    call stdlib_zdrscl( n, scale, work, 1 )
+                 end if
+                 go to 10
+              end if
+              ! compute the estimate of the reciprocal condition number.
+              if( ainvnm/=zero )rcond = ( one / anorm ) / ainvnm
+           end if
+           20 continue
+           return
+     end subroutine stdlib_ztrcon
+
+     !> ZTRSYL: solves the complex Sylvester matrix equation:
+     !> op(A)*X + X*op(B) = scale*C or
+     !> op(A)*X - X*op(B) = scale*C,
+     !> where op(A) = A or A**H, and A and B are both upper triangular. A is
+     !> M-by-M and B is N-by-N; the right hand side C and the solution X are
+     !> M-by-N; and scale is an output scale factor, set <= 1 to avoid
+     !> overflow in X.
+
+     subroutine stdlib_ztrsyl( trana, tranb, isgn, m, n, a, lda, b, ldb, c,ldc, scale, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trana, tranb
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: isgn, lda, ldb, ldc, m, n
+           real(dp), intent(out) :: scale
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: c(ldc,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: notrna, notrnb
+           integer(ilp) :: j, k, l
+           real(dp) :: bignum, da11, db, eps, scaloc, sgn, smin, smlnum
+           complex(dp) :: a11, suml, sumr, vec, x11
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min
+           ! Executable Statements 
+           ! decode and test input parameters
+           notrna = stdlib_lsame( trana, 'N' )
+           notrnb = stdlib_lsame( tranb, 'N' )
+           info = 0
+           if( .not.notrna .and. .not.stdlib_lsame( trana, 'C' ) ) then
+              info = -1
+           else if( .not.notrnb .and. .not.stdlib_lsame( tranb, 'C' ) ) then
+              info = -2
+           else if( isgn/=1 .and. isgn/=-1 ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSYL', -info )
+              return
+           end if
+           ! quick return if possible
+           scale = one
+           if( m==0 .or. n==0 )return
+           ! set constants to control overflow
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = smlnum*real( m*n,KIND=dp) / eps
+           bignum = one / smlnum
+           smin = max( smlnum, eps*stdlib_zlange( 'M', m, m, a, lda, dum ),eps*stdlib_zlange( 'M',&
+                      n, n, b, ldb, dum ) )
+           sgn = isgn
+           if( notrna .and. notrnb ) then
+              ! solve    a*x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                  ! a(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                        l-1
+                ! r(k,l) = sum [a(k,i)*x(i,l)] +isgn*sum [x(k,j)*b(j,l)].
+                        ! i=k+1                      j=1
+              loop_30: do l = 1, n
+                 do k = m, 1, -1
+                    suml = stdlib_zdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_zdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=dp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=dp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_zladiv( vec*cmplx( scaloc,KIND=dp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_zdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_30
+           else if( .not.notrna .and. notrnb ) then
+              ! solve    a**h *x + isgn*x*b = scale*c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-left corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b(l,l) = c(k,l) - r(k,l)
+              ! where
+                         ! k-1                           l-1
+                ! r(k,l) = sum [a**h(i,k)*x(i,l)] + isgn*sum [x(k,j)*b(j,l)]
+                         ! i=1                           j=1
+              loop_60: do l = 1, n
+                 do k = 1, m
+                    suml = stdlib_zdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_zdotu( l-1, c( k, 1 ), ldc, b( 1, l ), 1 )
+                    vec = c( k, l ) - ( suml+sgn*sumr )
+                    scaloc = one
+                    a11 = conjg( a( k, k ) ) + sgn*b( l, l )
+                    da11 = abs( real( a11,KIND=dp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=dp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_zladiv( vec*cmplx( scaloc,KIND=dp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_zdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_60
+           else if( .not.notrna .and. .not.notrnb ) then
+              ! solve    a**h*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! upper-right corner column by column by
+                  ! a**h(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! k-1
+                 ! r(k,l) = sum [a**h(i,k)*x(i,l)] +
+                          ! i=1
+                                 ! n
+                           ! isgn*sum [x(k,j)*b**h(l,j)].
+                                ! j=l+1
+              loop_90: do l = n, 1, -1
+                 do k = 1, m
+                    suml = stdlib_zdotc( k-1, a( 1, k ), 1, c( 1, l ), 1 )
+                    sumr = stdlib_zdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = conjg( a( k, k )+sgn*b( l, l ) )
+                    da11 = abs( real( a11,KIND=dp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=dp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_zladiv( vec*cmplx( scaloc,KIND=dp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_zdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_90
+           else if( notrna .and. .not.notrnb ) then
+              ! solve    a*x + isgn*x*b**h = c.
+              ! the (k,l)th block of x is determined starting from
+              ! bottom-left corner column by column by
+                 ! a(k,k)*x(k,l) + isgn*x(k,l)*b**h(l,l) = c(k,l) - r(k,l)
+              ! where
+                          ! m                          n
+                ! r(k,l) = sum [a(k,i)*x(i,l)] + isgn*sum [x(k,j)*b**h(l,j)]
+                        ! i=k+1                      j=l+1
+              loop_120: do l = n, 1, -1
+                 do k = m, 1, -1
+                    suml = stdlib_zdotu( m-k, a( k, min( k+1, m ) ), lda,c( min( k+1, m ), l ), 1 &
+                              )
+                    sumr = stdlib_zdotc( n-l, c( k, min( l+1, n ) ), ldc,b( l, min( l+1, n ) ), &
+                              ldb )
+                    vec = c( k, l ) - ( suml+sgn*conjg( sumr ) )
+                    scaloc = one
+                    a11 = a( k, k ) + sgn*conjg( b( l, l ) )
+                    da11 = abs( real( a11,KIND=dp) ) + abs( aimag( a11 ) )
+                    if( da11<=smin ) then
+                       a11 = smin
+                       da11 = smin
+                       info = 1
+                    end if
+                    db = abs( real( vec,KIND=dp) ) + abs( aimag( vec ) )
+                    if( da11<one .and. db>one ) then
+                       if( db>bignum*da11 )scaloc = one / db
+                    end if
+                    x11 = stdlib_zladiv( vec*cmplx( scaloc,KIND=dp), a11 )
+                    if( scaloc/=one ) then
+                       do j = 1, n
+                          call stdlib_zdscal( m, scaloc, c( 1, j ), 1 )
+                       end do
+                       scale = scale*scaloc
+                    end if
+                    c( k, l ) = x11
+                 end do
+              end do loop_120
+           end if
+           return
+     end subroutine stdlib_ztrsyl
+
+     !> ZUNBDB5: orthogonalizes the column vector
+     !> X = [ X1 ]
+     !> [ X2 ]
+     !> with respect to the columns of
+     !> Q = [ Q1 ] .
+     !> [ Q2 ]
+     !> The columns of Q must be orthonormal.
+     !> If the projection is zero according to Kahan's "twice is enough"
+     !> criterion, then some other vector from the orthogonal complement
+     !> is returned. This vector is chosen in an arbitrary but deterministic
+     !> way.
+
+     pure subroutine stdlib_zunbdb5( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: incx1, incx2, ldq1, ldq2, lwork, m1, m2, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(in) :: q1(ldq1,*), q2(ldq2,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x1(*), x2(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, j
+           ! Intrinsic Function 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           if( m1 < 0 ) then
+              info = -1
+           else if( m2 < 0 ) then
+              info = -2
+           else if( n < 0 ) then
+              info = -3
+           else if( incx1 < 1 ) then
+              info = -5
+           else if( incx2 < 1 ) then
+              info = -7
+           else if( ldq1 < max( 1, m1 ) ) then
+              info = -9
+           else if( ldq2 < max( 1, m2 ) ) then
+              info = -11
+           else if( lwork < n ) then
+              info = -13
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB5', -info )
+              return
+           end if
+           ! project x onto the orthogonal complement of q
+           call stdlib_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2, ldq2,work, lwork, &
+                     childinfo )
+           ! if the projection is nonzero, then return
+           if( stdlib_dznrm2(m1,x1,incx1) /= czero.or. stdlib_dznrm2(m2,x2,incx2) /= czero ) &
+                     then
+              return
+           end if
+           ! project each standard basis vector e_1,...,e_m1 in turn, stopping
+           ! when a nonzero projection is found
+           do i = 1, m1
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              x1(i) = cone
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              call stdlib_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_dznrm2(m1,x1,incx1) /= czero.or. stdlib_dznrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           ! project each standard basis vector e_(m1+1),...,e_(m1+m2) in turn,
+           ! stopping when a nonzero projection is found
+           do i = 1, m2
+              do j = 1, m1
+                 x1(j) = czero
+              end do
+              do j = 1, m2
+                 x2(j) = czero
+              end do
+              x2(i) = cone
+              call stdlib_zunbdb6( m1, m2, n, x1, incx1, x2, incx2, q1, ldq1, q2,ldq2, work, &
+                        lwork, childinfo )
+              if( stdlib_dznrm2(m1,x1,incx1) /= czero.or. stdlib_dznrm2(m2,x2,incx2) /= czero ) &
+                        then
+                 return
+              end if
+           end do
+           return
+     end subroutine stdlib_zunbdb5
+
+     !> ZUNCSD: computes the CS decomposition of an M-by-M partitioned
+     !> unitary matrix X:
+     !> [  I  0  0 |  0  0  0 ]
+     !> [  0  C  0 |  0 -S  0 ]
+     !> [ X11 | X12 ]   [ U1 |    ] [  0  0  0 |  0  0 -I ] [ V1 |    ]**H
+     !> X = [-----------] = [---------] [---------------------] [---------]   .
+     !> [ X21 | X22 ]   [    | U2 ] [  0  0  0 |  I  0  0 ] [    | V2 ]
+     !> [  0  S  0 |  0  C  0 ]
+     !> [  0  0  I |  0  0  0 ]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, V1, and V2 are P-by-P,
+     !> (M-P)-by-(M-P), Q-by-Q, and (M-Q)-by-(M-Q), respectively. C and S are
+     !> R-by-R nonnegative diagonal matrices satisfying C^2 + S^2 = I, in
+     !> which R = MIN(P,M-P,Q,M-Q).
+
+     recursive subroutine stdlib_zuncsd( jobu1, jobu2, jobv1t, jobv2t, trans,signs, m, p, q, x11, &
+     ldx11, x12,ldx12, x21, ldx21, x22, ldx22, theta,u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, &
+               work, lwork, rwork, lrwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t, jobv2t, signs, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, ldv2t, ldx11, ldx12, ldx21, ldx22, &
+                     lrwork, lwork, m, p, q
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: theta(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), v2t(ldv2t,*), work(*)
+                     
+           complex(dp), intent(inout) :: x11(ldx11,*), x12(ldx12,*), x21(ldx21,*), x22(ldx22,*)
+                     
+        ! ===================================================================
+           
+           ! Local Scalars 
+           character :: transt, signst
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, itauq2, j, lbbcsdwork, &
+           lbbcsdworkmin, lbbcsdworkopt, lorbdbwork, lorbdbworkmin, lorbdbworkopt, lorglqwork, &
+           lorglqworkmin, lorglqworkopt, lorgqrwork, lorgqrworkmin, lorgqrworkopt, lworkmin, &
+                     lworkopt, p1, q1
+           logical(lk) :: colmajor, defaultsigns, lquery, wantu1, wantu2, wantv1t, wantv2t
+           integer(ilp) :: lrworkmin, lrworkopt
+           logical(lk) :: lrquery
+           ! Intrinsic Functions
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           wantv2t = stdlib_lsame( jobv2t, 'Y' )
+           colmajor = .not. stdlib_lsame( trans, 'T' )
+           defaultsigns = .not. stdlib_lsame( signs, 'O' )
+           lquery = lwork == -1
+           lrquery = lrwork == -1
+           if( m < 0 ) then
+              info = -7
+           else if( p < 0 .or. p > m ) then
+              info = -8
+           else if( q < 0 .or. q > m ) then
+              info = -9
+           else if ( colmajor .and.  ldx11 < max( 1, p ) ) then
+             info = -11
+           else if (.not. colmajor .and. ldx11 < max( 1, q ) ) then
+             info = -11
+           else if (colmajor .and. ldx12 < max( 1, p ) ) then
+             info = -13
+           else if (.not. colmajor .and. ldx12 < max( 1, m-q ) ) then
+             info = -13
+           else if (colmajor .and. ldx21 < max( 1, m-p ) ) then
+             info = -15
+           else if (.not. colmajor .and. ldx21 < max( 1, q ) ) then
+             info = -15
+           else if (colmajor .and. ldx22 < max( 1, m-p ) ) then
+             info = -17
+           else if (.not. colmajor .and. ldx22 < max( 1, m-q ) ) then
+             info = -17
+           else if( wantu1 .and. ldu1 < p ) then
+              info = -20
+           else if( wantu2 .and. ldu2 < m-p ) then
+              info = -22
+           else if( wantv1t .and. ldv1t < q ) then
+              info = -24
+           else if( wantv2t .and. ldv2t < m-q ) then
+              info = -26
+           end if
+           ! work with transpose if convenient
+           if( info == 0 .and. min( p, m-p ) < min( q, m-q ) ) then
+              if( colmajor ) then
+                 transt = 'T'
+              else
+                 transt = 'N'
+              end if
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_zuncsd( jobv1t, jobv2t, jobu1, jobu2, transt, signst, m,q, p, x11, &
+              ldx11, x21, ldx21, x12, ldx12, x22,ldx22, theta, v1t, ldv1t, v2t, ldv2t, u1, ldu1,&
+                        u2, ldu2, work, lwork, rwork, lrwork, iwork,info )
+              return
+           end if
+           ! work with permutation [ 0 i; i 0 ] * x * [ 0 i; i 0 ] if
+           ! convenient
+           if( info == 0 .and. m-q < q ) then
+              if( defaultsigns ) then
+                 signst = 'O'
+              else
+                 signst = 'D'
+              end if
+              call stdlib_zuncsd( jobu2, jobu1, jobv2t, jobv1t, trans, signst, m,m-p, m-q, x22, &
+              ldx22, x21, ldx21, x12, ldx12, x11,ldx11, theta, u2, ldu2, u1, ldu1, v2t, ldv2t, &
+                        v1t,ldv1t, work, lwork, rwork, lrwork, iwork, info )
+              return
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ! real workspace
+              iphi = 2
+              ib11d = iphi + max( 1, q - 1 )
+              ib11e = ib11d + max( 1, q )
+              ib12d = ib11e + max( 1, q - 1 )
+              ib12e = ib12d + max( 1, q )
+              ib21d = ib12e + max( 1, q - 1 )
+              ib21e = ib21d + max( 1, q )
+              ib22d = ib21e + max( 1, q - 1 )
+              ib22e = ib22d + max( 1, q )
+              ibbcsd = ib22e + max( 1, q - 1 )
+              call stdlib_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q,theta, theta, u1, &
+              ldu1, u2, ldu2, v1t, ldv1t,v2t, ldv2t, theta, theta, theta, theta, theta,theta, &
+                        theta, theta, rwork, -1, childinfo )
+              lbbcsdworkopt = int( rwork(1),KIND=ilp)
+              lbbcsdworkmin = lbbcsdworkopt
+              lrworkopt = ibbcsd + lbbcsdworkopt - 1
+              lrworkmin = ibbcsd + lbbcsdworkmin - 1
+              rwork(1) = lrworkopt
+              ! complex workspace
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m - p )
+              itauq2 = itauq1 + max( 1, q )
+              iorgqr = itauq2 + max( 1, m - q )
+              call stdlib_zungqr( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorgqrworkopt = int( work(1),KIND=ilp)
+              lorgqrworkmin = max( 1, m - q )
+              iorglq = itauq2 + max( 1, m - q )
+              call stdlib_zunglq( m-q, m-q, m-q, u1, max(1,m-q), u1, work, -1,childinfo )
+              lorglqworkopt = int( work(1),KIND=ilp)
+              lorglqworkmin = max( 1, m - q )
+              iorbdb = itauq2 + max( 1, m - q )
+              call stdlib_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12,x21, ldx21, x22, &
+                        ldx22, theta, theta, u1, u2,v1t, v2t, work, -1, childinfo )
+              lorbdbworkopt = int( work(1),KIND=ilp)
+              lorbdbworkmin = lorbdbworkopt
+              lworkopt = max( iorgqr + lorgqrworkopt, iorglq + lorglqworkopt,iorbdb + &
+                        lorbdbworkopt ) - 1
+              lworkmin = max( iorgqr + lorgqrworkmin, iorglq + lorglqworkmin,iorbdb + &
+                        lorbdbworkmin ) - 1
+              work(1) = max(lworkopt,lworkmin)
+              if( lwork < lworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -22
+              else if( lrwork < lrworkmin.and. .not. ( lquery .or. lrquery ) ) then
+                 info = -24
+              else
+                 lorgqrwork = lwork - iorgqr + 1
+                 lorglqwork = lwork - iorglq + 1
+                 lorbdbwork = lwork - iorbdb + 1
+                 lbbcsdwork = lrwork - ibbcsd + 1
+              end if
+           end if
+           ! abort if any illegal arguments
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNCSD', -info )
+              return
+           else if( lquery .or. lrquery ) then
+              return
+           end if
+           ! transform to bidiagonal block form
+           call stdlib_zunbdb( trans, signs, m, p, q, x11, ldx11, x12, ldx12, x21,ldx21, x22, &
+           ldx22, theta, rwork(iphi), work(itaup1),work(itaup2), work(itauq1), work(itauq2),work(&
+                     iorbdb), lorbdbwork, childinfo )
+           ! accumulate householder reflectors
+           if( colmajor ) then
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqrwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqrwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_zlacpy( 'U', q-1, q-1, x11(1,2), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_zunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglqwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 call stdlib_zlacpy( 'U', p, m-q, x12, ldx12, v2t, ldv2t )
+                 if( m-p > q) then
+                    call stdlib_zlacpy( 'U', m-p-q, m-p-q, x22(q+1,p+1), ldx22,v2t(p+1,p+1), &
+                              ldv2t )
+                 end if
+                 if( m > q ) then
+                    call stdlib_zunglq( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorglq), &
+                              lorglqwork, info )
+                 end if
+              end if
+           else
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_zlacpy( 'U', q, p, x11, ldx11, u1, ldu1 )
+                 call stdlib_zunglq( p, p, q, u1, ldu1, work(itaup1), work(iorglq),lorglqwork, &
+                           info)
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_zlacpy( 'U', q, m-p, x21, ldx21, u2, ldu2 )
+                 call stdlib_zunglq( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorglq), lorglqwork,&
+                            info )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_zlacpy( 'L', q-1, q-1, x11(2,1), ldx11, v1t(2,2),ldv1t )
+                 v1t(1, 1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_zungqr( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+              if( wantv2t .and. m-q > 0 ) then
+                 p1 = min( p+1, m )
+                 q1 = min( q+1, m )
+                 call stdlib_zlacpy( 'L', m-q, p, x12, ldx12, v2t, ldv2t )
+                 if( m > p+q ) then
+                    call stdlib_zlacpy( 'L', m-p-q, m-p-q, x22(p1,q1), ldx22,v2t(p+1,p+1), ldv2t )
+                              
+                 end if
+                 call stdlib_zungqr( m-q, m-q, m-q, v2t, ldv2t, work(itauq2),work(iorgqr), &
+                           lorgqrwork, info )
+              end if
+           end if
+           ! compute the csd of the matrix in bidiagonal-block form
+           call stdlib_zbbcsd( jobu1, jobu2, jobv1t, jobv2t, trans, m, p, q, theta,rwork(iphi), &
+           u1, ldu1, u2, ldu2, v1t, ldv1t, v2t,ldv2t, rwork(ib11d), rwork(ib11e), rwork(ib12d),&
+           rwork(ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),&
+                     lbbcsdwork, info )
+           ! permute rows and columns to place identity submatrices in top-
+           ! left corner of (1,1)-block and/or bottom-right corner of (1,2)-
+           ! block and/or bottom-right corner of (2,1)-block and/or top-left
+           ! corner of (2,2)-block
+           if( q > 0 .and. wantu2 ) then
+              do i = 1, q
+                 iwork(i) = m - p - q + i
+              end do
+              do i = q + 1, m - p
+                 iwork(i) = i - q
+              end do
+              if( colmajor ) then
+                 call stdlib_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              else
+                 call stdlib_zlapmr( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           end if
+           if( m > 0 .and. wantv2t ) then
+              do i = 1, p
+                 iwork(i) = m - p - q + i
+              end do
+              do i = p + 1, m - q
+                 iwork(i) = i - p
+              end do
+              if( .not. colmajor ) then
+                 call stdlib_zlapmt( .false., m-q, m-q, v2t, ldv2t, iwork )
+              else
+                 call stdlib_zlapmr( .false., m-q, m-q, v2t, ldv2t, iwork )
+              end if
+           end if
+           return
+           ! end stdlib_zuncsd
+     end subroutine stdlib_zuncsd
+
+     !> ZUNGHR: generates a complex unitary matrix Q which is defined as the
+     !> product of IHI-ILO elementary reflectors of order N, as returned by
+     !> ZGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_zunghr( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lwkopt, nb, nh
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, nh ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', nh, nh, nh, -1 )
+              lwkopt = max( 1, nh )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! shift the vectors which define the elementary reflectors cone
+           ! column to the right, and set the first ilo and the last n-ihi
+           ! rows and columns to those of the unit matrix
+           do j = ihi, ilo + 1, -1
+              do i = 1, j - 1
+                 a( i, j ) = czero
+              end do
+              do i = j + 1, ihi
+                 a( i, j ) = a( i, j-1 )
+              end do
+              do i = ihi + 1, n
+                 a( i, j ) = czero
+              end do
+           end do
+           do j = 1, ilo
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           do j = ihi + 1, n
+              do i = 1, n
+                 a( i, j ) = czero
+              end do
+              a( j, j ) = cone
+           end do
+           if( nh>0 ) then
+              ! generate q(ilo+1:ihi,ilo+1:ihi)
+              call stdlib_zungqr( nh, nh, nh, a( ilo+1, ilo+1 ), lda, tau( ilo ),work, lwork, &
+                        iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunghr
+
+     !> ZUNGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors of order N, as returned by
+     !> ZHETRD:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_zungtr( uplo, n, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: i, iinfo, j, lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, n-1 ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 nb = stdlib_ilaenv( 1, 'ZUNGQL', ' ', n-1, n-1, n-1, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNGQR', ' ', n-1, n-1, n-1, -1 )
+              end if
+              lwkopt = max( 1, n-1 )*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_zhetrd with uplo = 'u'
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the left, and set the last row and column of q to
+              ! those of the unit matrix
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    a( i, j ) = a( i, j+1 )
+                 end do
+                 a( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 a( i, n ) = czero
+              end do
+              a( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_zungql( n-1, n-1, n-1, a, lda, tau, work, lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_zhetrd with uplo = 'l'.
+              ! shift the vectors which define the elementary reflectors cone
+              ! column to the right, and set the first row and column of q to
+              ! those of the unit matrix
+              do j = n, 2, -1
+                 a( 1, j ) = czero
+                 do i = j + 1, n
+                    a( i, j ) = a( i, j-1 )
+                 end do
+              end do
+              a( 1, 1 ) = cone
+              do i = 2, n
+                 a( i, 1 ) = czero
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_zungqr( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                           
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zungtr
+
+     !> ZUNHR_COL: takes an M-by-N complex matrix Q_in with orthonormal columns
+     !> as input, stored in A, and performs Householder Reconstruction (HR),
+     !> i.e. reconstructs Householder vectors V(i) implicitly representing
+     !> another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+     !> where S is an N-by-N diagonal matrix with diagonal entries
+     !> equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+     !> stored in A on output, and the diagonal entries of S are stored in D.
+     !> Block reflectors are also returned in T
+     !> (same output format as ZGEQRT).
+
+     pure subroutine stdlib_zunhr_col( m, n, nb, a, lda, t, ldt, d, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: d(*), t(ldt,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, jbtemp1, jbtemp2, jnb, nplusone
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. n>m ) then
+              info = -2
+           else if( nb<1 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -7
+           end if
+           ! handle error in the input parameters.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNHR_COL', -info )
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              return
+           end if
+           ! on input, the m-by-n matrix a contains the unitary
+           ! m-by-n matrix q_in.
+           ! (1) compute the unit lower-trapezoidal v (ones on the diagonal
+           ! are not stored) by performing the "modified" lu-decomposition.
+           ! q_in - ( s ) = v * u = ( v1 ) * u,
+                  ! ( 0 )           ( v2 )
+           ! where 0 is an (m-n)-by-n zero matrix.
+           ! (1-1) factor v1 and u.
+           call stdlib_zlaunhr_col_getrfnp( n, n, a, lda, d, iinfo )
+           ! (1-2) solve for v2.
+           if( m>n ) then
+              call stdlib_ztrsm( 'R', 'U', 'N', 'N', m-n, n, cone, a, lda,a( n+1, 1 ), lda )
+                        
+           end if
+           ! (2) reconstruct the block reflector t stored in t(1:nb, 1:n)
+           ! as a sequence of upper-triangular blocks with nb-size column
+           ! blocking.
+           ! loop over the column blocks of size nb of the array a(1:m,1:n)
+           ! and the array t(1:nb,1:n), jb is the column index of a column
+           ! block, jnb is the column block size at each step jb.
+           nplusone = n + 1
+           do jb = 1, n, nb
+              ! (2-0) determine the column block size jnb.
+              jnb = min( nplusone-jb, nb )
+              ! (2-1) copy the upper-triangular part of the current jnb-by-jnb
+              ! diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in a(jb:jb+jnb-1,jb:jb+jnb-1) into the upper-triangular part
+              ! of the current jnb-by-jnb block t(1:jnb,jb:jb+jnb-1)
+              ! column-by-column, total jnb*(jnb+1)/2 elements.
+              jbtemp1 = jb - 1
+              do j = jb, jb+jnb-1
+                 call stdlib_zcopy( j-jbtemp1, a( jb, j ), 1, t( 1, j ), 1 )
+              end do
+              ! (2-2) perform on the upper-triangular part of the current
+              ! jnb-by-jnb diagonal block u(jb) (of the n-by-n matrix u) stored
+              ! in t(1:jnb,jb:jb+jnb-1) the following operation in place:
+              ! (-1)*u(jb)*s(jb), i.e the result will be stored in the upper-
+              ! triangular part of t(1:jnb,jb:jb+jnb-1). this multiplication
+              ! of the jnb-by-jnb diagonal block u(jb) by the jnb-by-jnb
+              ! diagonal block s(jb) of the n-by-n sign matrix s from the
+              ! right means changing the sign of each j-th column of the block
+              ! u(jb) according to the sign of the diagonal element of the block
+              ! s(jb), i.e. s(j,j) that is stored in the array element d(j).
+              do j = jb, jb+jnb-1
+                 if( d( j )==cone ) then
+                    call stdlib_zscal( j-jbtemp1, -cone, t( 1, j ), 1 )
+                 end if
+              end do
+              ! (2-3) perform the triangular solve for the current block
+              ! matrix x(jb):
+                     ! x(jb) * (a(jb)**t) = b(jb), where:
+                     ! a(jb)**t  is a jnb-by-jnb unit upper-triangular
+                               ! coefficient block, and a(jb)=v1(jb), which
+                               ! is a jnb-by-jnb unit lower-triangular block
+                               ! stored in a(jb:jb+jnb-1,jb:jb+jnb-1).
+                               ! the n-by-n matrix v1 is the upper part
+                               ! of the m-by-n lower-trapezoidal matrix v
+                               ! stored in a(1:m,1:n);
+                     ! b(jb)     is a jnb-by-jnb  upper-triangular right-hand
+                               ! side block, b(jb) = (-1)*u(jb)*s(jb), and
+                               ! b(jb) is stored in t(1:jnb,jb:jb+jnb-1);
+                     ! x(jb)     is a jnb-by-jnb upper-triangular solution
+                               ! block, x(jb) is the upper-triangular block
+                               ! reflector t(jb), and x(jb) is stored
+                               ! in t(1:jnb,jb:jb+jnb-1).
+                   ! in other words, we perform the triangular solve for the
+                   ! upper-triangular block t(jb):
+                     ! t(jb) * (v1(jb)**t) = (-1)*u(jb)*s(jb).
+                   ! even though the blocks x(jb) and b(jb) are upper-
+                   ! triangular, the routine stdlib_ztrsm will access all jnb**2
+                   ! elements of the square t(1:jnb,jb:jb+jnb-1). therefore,
+                   ! we need to set to zero the elements of the block
+                   ! t(1:jnb,jb:jb+jnb-1) below the diagonal before the call
+                   ! to stdlib_ztrsm.
+              ! (2-3a) set the elements to zero.
+              jbtemp2 = jb - 2
+              do j = jb, jb+jnb-2
+                 do i = j-jbtemp2, nb
+                    t( i, j ) = czero
+                 end do
+              end do
+              ! (2-3b) perform the triangular solve.
+              call stdlib_ztrsm( 'R', 'L', 'C', 'U', jnb, jnb, cone,a( jb, jb ), lda, t( 1, jb ), &
+                        ldt )
+           end do
+           return
+     end subroutine stdlib_zunhr_col
+
+     !> ZUNMHR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> IHI-ILO elementary reflectors, as returned by ZGEHRD:
+     !> Q = H(ilo) H(ilo+1) . . . H(ihi-1).
+
+     pure subroutine stdlib_zunmhr( side, trans, m, n, ilo, ihi, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldc, lwork, m, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, nh, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           nh = ihi - ilo
+           left = stdlib_lsame( side, 'L' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 .or. ilo>max( 1, nq ) ) then
+              info = -5
+           else if( ihi<min( ilo, nq ) .or. ihi>nq ) then
+              info = -6
+           else if( lda<max( 1, nq ) ) then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( left ) then
+                 nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, nh, n, nh, -1 )
+              else
+                 nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, nh, nh, -1 )
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMHR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nh==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = nh
+              ni = n
+              i1 = ilo + 1
+              i2 = 1
+           else
+              mi = m
+              ni = nh
+              i1 = 1
+              i2 = ilo + 1
+           end if
+           call stdlib_zunmqr( side, trans, mi, ni, nh, a( ilo+1, ilo ), lda,tau( ilo ), c( i1, &
+                     i2 ), ldc, work, lwork, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmhr
+
+     !> ZUNMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by ZHETRD:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_zunmtr( side, uplo, trans, m, n, a, lda, tau, c, ldc,work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, lquery, upper
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.stdlib_lsame( trans, 'N' ) .and. .not.stdlib_lsame( trans, 'C' ) )&
+                     then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, nq ) ) then
+              info = -7
+           else if( ldc<max( 1, m ) ) then
+              info = -10
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              if( upper ) then
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'ZUNMQL', side // trans, m, n-1, n-1,-1 )
+                 end if
+              else
+                 if( left ) then
+                    nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m-1, n, m-1,-1 )
+                 else
+                    nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n-1, n-1,-1 )
+                 end if
+              end if
+              lwkopt = nw*nb
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMTR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 .or. nq==1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( left ) then
+              mi = m - 1
+              ni = n
+           else
+              mi = m
+              ni = n - 1
+           end if
+           if( upper ) then
+              ! q was determined by a call to stdlib_zhetrd with uplo = 'u'
+              call stdlib_zunmql( side, trans, mi, ni, nq-1, a( 1, 2 ), lda, tau, c,ldc, work, &
+                        lwork, iinfo )
+           else
+              ! q was determined by a call to stdlib_zhetrd with uplo = 'l'
+              if( left ) then
+                 i1 = 2
+                 i2 = 1
+              else
+                 i1 = 1
+                 i2 = 2
+              end if
+              call stdlib_zunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), ldc,&
+                         work, lwork, iinfo )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmtr
+
+     !> ZUPGTR: generates a complex unitary matrix Q which is defined as the
+     !> product of n-1 elementary reflectors H(i) of order n, as returned by
+     !> ZHPTRD using packed storage:
+     !> if UPLO = 'U', Q = H(n-1) . . . H(2) H(1),
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(n-1).
+
+     pure subroutine stdlib_zupgtr( uplo, n, ap, tau, q, ldq, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, n
+           ! Array Arguments 
+           complex(dp), intent(in) :: ap(*), tau(*)
+           complex(dp), intent(out) :: q(ldq,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, iinfo, ij, j
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUPGTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_zhptrd with uplo = 'u'
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the last row and column of q equal to those of the unit
+              ! matrix
+              ij = 2
+              do j = 1, n - 1
+                 do i = 1, j - 1
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+                 q( n, j ) = czero
+              end do
+              do i = 1, n - 1
+                 q( i, n ) = czero
+              end do
+              q( n, n ) = cone
+              ! generate q(1:n-1,1:n-1)
+              call stdlib_zung2l( n-1, n-1, n-1, q, ldq, tau, work, iinfo )
+           else
+              ! q was determined by a call to stdlib_zhptrd with uplo = 'l'.
+              ! unpack the vectors which define the elementary reflectors and
+              ! set the first row and column of q equal to those of the unit
+              ! matrix
+              q( 1, 1 ) = cone
+              do i = 2, n
+                 q( i, 1 ) = czero
+              end do
+              ij = 3
+              do j = 2, n
+                 q( 1, j ) = czero
+                 do i = j + 1, n
+                    q( i, j ) = ap( ij )
+                    ij = ij + 1
+                 end do
+                 ij = ij + 2
+              end do
+              if( n>1 ) then
+                 ! generate q(2:n,2:n)
+                 call stdlib_zung2r( n-1, n-1, n-1, q( 2, 2 ), ldq, tau, work,iinfo )
+              end if
+           end if
+           return
+     end subroutine stdlib_zupgtr
+
+     !> ZUPMTR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix of order nq, with nq = m if
+     !> SIDE = 'L' and nq = n if SIDE = 'R'. Q is defined as the product of
+     !> nq-1 elementary reflectors, as returned by ZHPTRD using packed
+     !> storage:
+     !> if UPLO = 'U', Q = H(nq-1) . . . H(2) H(1);
+     !> if UPLO = 'L', Q = H(1) H(2) . . . H(nq-1).
+
+     pure subroutine stdlib_zupmtr( side, uplo, trans, m, n, ap, tau, c, ldc, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldc, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ap(*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: forwrd, left, notran, upper
+           integer(ilp) :: i, i1, i2, i3, ic, ii, jc, mi, ni, nq
+           complex(dp) :: aii, taui
+           ! Intrinsic Functions 
+           intrinsic :: conjg,max
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ! nq is the order of q
+           if( left ) then
+              nq = m
+           else
+              nq = n
+           end if
+           if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( ldc<max( 1, m ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUPMTR', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( upper ) then
+              ! q was determined by a call to stdlib_zhptrd with uplo = 'u'
+              forwrd = ( left .and. notran ) .or.( .not.left .and. .not.notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+              else
+                 mi = m
+              end if
+              do i = i1, i2, i3
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(1:i,1:n)
+                    mi = i
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,1:i)
+                    ni = i
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 call stdlib_zlarf( side, mi, ni, ap( ii-i+1 ), 1, taui, c, ldc,work )
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + i + 2
+                 else
+                    ii = ii - i - 1
+                 end if
+              end do
+           else
+              ! q was determined by a call to stdlib_zhptrd with uplo = 'l'.
+              forwrd = ( left .and. .not.notran ) .or.( .not.left .and. notran )
+              if( forwrd ) then
+                 i1 = 1
+                 i2 = nq - 1
+                 i3 = 1
+                 ii = 2
+              else
+                 i1 = nq - 1
+                 i2 = 1
+                 i3 = -1
+                 ii = nq*( nq+1 ) / 2 - 1
+              end if
+              if( left ) then
+                 ni = n
+                 jc = 1
+              else
+                 mi = m
+                 ic = 1
+              end if
+              loop_20: do i = i1, i2, i3
+                 aii = ap( ii )
+                 ap( ii ) = cone
+                 if( left ) then
+                    ! h(i) or h(i)**h is applied to c(i+1:m,1:n)
+                    mi = m - i
+                    ic = i + 1
+                 else
+                    ! h(i) or h(i)**h is applied to c(1:m,i+1:n)
+                    ni = n - i
+                    jc = i + 1
+                 end if
+                 ! apply h(i) or h(i)**h
+                 if( notran ) then
+                    taui = tau( i )
+                 else
+                    taui = conjg( tau( i ) )
+                 end if
+                 call stdlib_zlarf( side, mi, ni, ap( ii ), 1, taui, c( ic, jc ),ldc, work )
+                           
+                 ap( ii ) = aii
+                 if( forwrd ) then
+                    ii = ii + nq - i + 1
+                 else
+                    ii = ii - nq + i - 2
+                 end if
+              end do loop_20
+           end if
+           return
+     end subroutine stdlib_zupmtr
+
+     !> ZCPOSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> ZCPOSV first attempts to factorize the matrix in COMPLEX and use this
+     !> factorization within an iterative refinement procedure to produce a
+     !> solution with COMPLEX*16 normwise backward error quality (see below).
+     !> If the approach fails the method switches to a COMPLEX*16
+     !> factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio COMPLEX performance over COMPLEX*16 performance is too
+     !> small. A reasonable strategy should take the number of right-hand
+     !> sides and the size of the matrix into account. This might be done
+     !> with a call to ILAENV in the future. Up to now, we always try
+     !> iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_zcposv( uplo, n, nrhs, a, lda, b, ldb, x, ldx, work,swork, rwork, iter, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: swork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: b(ldb,*)
+           complex(dp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(dp), parameter :: bwdmax = 1.0e+00_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(dp) :: anrm, cte, eps, rnrm, xnrm
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZCPOSV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip single precision iterative refinement if a priori slower
+           ! than double precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_zlanhe( 'I', uplo, n, a, lda, rwork )
+           eps = stdlib_dlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=dp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from double precision to single precision and store the
+           ! result in sx.
+           call stdlib_zlag2c( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from double precision to single precision and store the
+           ! result in sa.
+           call stdlib_zlat2c( uplo, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the cholesky factorization of sa.
+           call stdlib_cpotrf( uplo, n, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_cpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+           ! convert sx back to complex*16
+           call stdlib_clag2z( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_zhemm( 'LEFT', uplo, n, nrhs, cnegone, a, lda, x, ldx, cone,work, n )
+                     
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = cabs1( x( stdlib_izamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = cabs1( work( stdlib_izamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from double precision to single precision
+              ! and store the result in sx.
+              call stdlib_zlag2c( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_cpotrs( uplo, n, nrhs, swork( ptsa ), n, swork( ptsx ), n,info )
+              ! convert sx back to double precision and update the current
+              ! iterate.
+              call stdlib_clag2z( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_zaxpy( n, cone, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_zhemm( 'L', uplo, n, nrhs, cnegone, a, lda, x, ldx, cone,work, n )
+                        
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = cabs1( x( stdlib_izamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = cabs1( work( stdlib_izamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the
+           ! stopping criterion, set up the iter flag accordingly and follow
+           ! up on double precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to double precision.
+           call stdlib_zpotrf( uplo, n, a, lda, info )
+           if( info/=0 )return
+           call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zpotrs( uplo, n, nrhs, a, lda, x, ldx, info )
+           return
+     end subroutine stdlib_zcposv
+
+     !> ZGBBRD: reduces a complex general m-by-n band matrix A to real upper
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> The routine computes B, and optionally forms Q or P**H, or computes
+     !> Q**H*C for a given matrix C.
+
+     pure subroutine stdlib_zgbbrd( vect, m, n, ncc, kl, ku, ab, ldab, d, e, q,ldq, pt, ldpt, c, &
+               ldc, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldc, ldpt, ldq, m, n, ncc
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*), rwork(*)
+           complex(dp), intent(inout) :: ab(ldab,*), c(ldc,*)
+           complex(dp), intent(out) :: pt(ldpt,*), q(ldq,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: wantb, wantc, wantpt, wantq
+           integer(ilp) :: i, inca, j, j1, j2, kb, kb1, kk, klm, klu1, kun, l, minmn, ml, ml0, mu,&
+                      mu0, nr, nrt
+           real(dp) :: abst, rc
+           complex(dp) :: ra, rb, rs, t
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantb = stdlib_lsame( vect, 'B' )
+           wantq = stdlib_lsame( vect, 'Q' ) .or. wantb
+           wantpt = stdlib_lsame( vect, 'P' ) .or. wantb
+           wantc = ncc>0
+           klu1 = kl + ku + 1
+           info = 0
+           if( .not.wantq .and. .not.wantpt .and. .not.stdlib_lsame( vect, 'N' ) )then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ncc<0 ) then
+              info = -4
+           else if( kl<0 ) then
+              info = -5
+           else if( ku<0 ) then
+              info = -6
+           else if( ldab<klu1 ) then
+              info = -8
+           else if( ldq<1 .or. wantq .and. ldq<max( 1, m ) ) then
+              info = -12
+           else if( ldpt<1 .or. wantpt .and. ldpt<max( 1, n ) ) then
+              info = -14
+           else if( ldc<1 .or. wantc .and. ldc<max( 1, m ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBBRD', -info )
+              return
+           end if
+           ! initialize q and p**h to the unit matrix, if needed
+           if( wantq )call stdlib_zlaset( 'FULL', m, m, czero, cone, q, ldq )
+           if( wantpt )call stdlib_zlaset( 'FULL', n, n, czero, cone, pt, ldpt )
+           ! quick return if possible.
+           if( m==0 .or. n==0 )return
+           minmn = min( m, n )
+           if( kl+ku>1 ) then
+              ! reduce to upper bidiagonal form if ku > 0; if ku = 0, reduce
+              ! first to lower bidiagonal form and then transform to upper
+              ! bidiagonal
+              if( ku>0 ) then
+                 ml0 = 1
+                 mu0 = 2
+              else
+                 ml0 = 2
+                 mu0 = 1
+              end if
+              ! wherever possible, plane rotations are generated and applied in
+              ! vector operations of length nr over the index set j1:j2:klu1.
+              ! the complex sines of the plane rotations are stored in work,
+              ! and the real cosines in rwork.
+              klm = min( m-1, kl )
+              kun = min( n-1, ku )
+              kb = klm + kun
+              kb1 = kb + 1
+              inca = kb1*ldab
+              nr = 0
+              j1 = klm + 2
+              j2 = 1 - kun
+              loop_90: do i = 1, minmn
+                 ! reduce i-th column and i-th row of matrix to bidiagonal form
+                 ml = klm + 1
+                 mu = kun + 1
+                 loop_80: do kk = 1, kb
+                    j1 = j1 + kb
+                    j2 = j2 + kb
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been created below the band
+                    if( nr>0 )call stdlib_zlargv( nr, ab( klu1, j1-klm-1 ), inca,work( j1 ), kb1, &
+                              rwork( j1 ), kb1 )
+                    ! apply plane rotations from the left
+                    do l = 1, kb
+                       if( j2-klm+l-1>n ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_zlartv( nrt, ab( klu1-l, j1-klm+l-1 ), inca,ab( &
+                                 klu1-l+1, j1-klm+l-1 ), inca,rwork( j1 ), work( j1 ), kb1 )
+                    end do
+                    if( ml>ml0 ) then
+                       if( ml<=m-i+1 ) then
+                          ! generate plane rotation to annihilate a(i+ml-1,i)
+                          ! within the band, and apply rotation from the left
+                          call stdlib_zlartg( ab( ku+ml-1, i ), ab( ku+ml, i ),rwork( i+ml-1 ), &
+                                    work( i+ml-1 ), ra )
+                          ab( ku+ml-1, i ) = ra
+                          if( i<n )call stdlib_zrot( min( ku+ml-2, n-i ),ab( ku+ml-2, i+1 ), ldab-&
+                                    1,ab( ku+ml-1, i+1 ), ldab-1,rwork( i+ml-1 ), work( i+ml-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantq ) then
+                       ! accumulate product of plane rotations in q
+                       do j = j1, j2, kb1
+                          call stdlib_zrot( m, q( 1, j-1 ), 1, q( 1, j ), 1,rwork( j ), conjg( &
+                                    work( j ) ) )
+                       end do
+                    end if
+                    if( wantc ) then
+                       ! apply plane rotations to c
+                       do j = j1, j2, kb1
+                          call stdlib_zrot( ncc, c( j-1, 1 ), ldc, c( j, 1 ), ldc,rwork( j ), &
+                                    work( j ) )
+                       end do
+                    end if
+                    if( j2+kun>n ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j-1,j+ku) above the band
+                       ! and store it in work(n+1:2*n)
+                       work( j+kun ) = work( j )*ab( 1, j+kun )
+                       ab( 1, j+kun ) = rwork( j )*ab( 1, j+kun )
+                    end do
+                    ! generate plane rotations to annihilate nonzero elements
+                    ! which have been generated above the band
+                    if( nr>0 )call stdlib_zlargv( nr, ab( 1, j1+kun-1 ), inca,work( j1+kun ), kb1,&
+                               rwork( j1+kun ),kb1 )
+                    ! apply plane rotations from the right
+                    do l = 1, kb
+                       if( j2+l-1>m ) then
+                          nrt = nr - 1
+                       else
+                          nrt = nr
+                       end if
+                       if( nrt>0 )call stdlib_zlartv( nrt, ab( l+1, j1+kun-1 ), inca,ab( l, j1+&
+                                 kun ), inca,rwork( j1+kun ), work( j1+kun ), kb1 )
+                    end do
+                    if( ml==ml0 .and. mu>mu0 ) then
+                       if( mu<=n-i+1 ) then
+                          ! generate plane rotation to annihilate a(i,i+mu-1)
+                          ! within the band, and apply rotation from the right
+                          call stdlib_zlartg( ab( ku-mu+3, i+mu-2 ),ab( ku-mu+2, i+mu-1 ),rwork( &
+                                    i+mu-1 ), work( i+mu-1 ), ra )
+                          ab( ku-mu+3, i+mu-2 ) = ra
+                          call stdlib_zrot( min( kl+mu-2, m-i ),ab( ku-mu+4, i+mu-2 ), 1,ab( ku-&
+                                    mu+3, i+mu-1 ), 1,rwork( i+mu-1 ), work( i+mu-1 ) )
+                       end if
+                       nr = nr + 1
+                       j1 = j1 - kb1
+                    end if
+                    if( wantpt ) then
+                       ! accumulate product of plane rotations in p**h
+                       do j = j1, j2, kb1
+                          call stdlib_zrot( n, pt( j+kun-1, 1 ), ldpt,pt( j+kun, 1 ), ldpt, rwork(&
+                                     j+kun ),conjg( work( j+kun ) ) )
+                       end do
+                    end if
+                    if( j2+kb>m ) then
+                       ! adjust j2 to keep within the bounds of the matrix
+                       nr = nr - 1
+                       j2 = j2 - kb1
+                    end if
+                    do j = j1, j2, kb1
+                       ! create nonzero element a(j+kl+ku,j+ku-1) below the
+                       ! band and store it in work(1:n)
+                       work( j+kb ) = work( j+kun )*ab( klu1, j+kun )
+                       ab( klu1, j+kun ) = rwork( j+kun )*ab( klu1, j+kun )
+                    end do
+                    if( ml>ml0 ) then
+                       ml = ml - 1
+                    else
+                       mu = mu - 1
+                    end if
+                 end do loop_80
+              end do loop_90
+           end if
+           if( ku==0 .and. kl>0 ) then
+              ! a has been reduced to complex lower bidiagonal form
+              ! transform lower bidiagonal form to upper bidiagonal by applying
+              ! plane rotations from the left, overwriting superdiagonal
+              ! elements on subdiagonal elements
+              do i = 1, min( m-1, n )
+                 call stdlib_zlartg( ab( 1, i ), ab( 2, i ), rc, rs, ra )
+                 ab( 1, i ) = ra
+                 if( i<n ) then
+                    ab( 2, i ) = rs*ab( 1, i+1 )
+                    ab( 1, i+1 ) = rc*ab( 1, i+1 )
+                 end if
+                 if( wantq )call stdlib_zrot( m, q( 1, i ), 1, q( 1, i+1 ), 1, rc,conjg( rs ) )
+                           
+                 if( wantc )call stdlib_zrot( ncc, c( i, 1 ), ldc, c( i+1, 1 ), ldc, rc,rs )
+                           
+              end do
+           else
+              ! a has been reduced to complex upper bidiagonal form or is
+              ! diagonal
+              if( ku>0 .and. m<n ) then
+                 ! annihilate a(m,m+1) by applying plane rotations from the
+                 ! right
+                 rb = ab( ku, m+1 )
+                 do i = m, 1, -1
+                    call stdlib_zlartg( ab( ku+1, i ), rb, rc, rs, ra )
+                    ab( ku+1, i ) = ra
+                    if( i>1 ) then
+                       rb = -conjg( rs )*ab( ku, i )
+                       ab( ku, i ) = rc*ab( ku, i )
+                    end if
+                    if( wantpt )call stdlib_zrot( n, pt( i, 1 ), ldpt, pt( m+1, 1 ), ldpt,rc, &
+                              conjg( rs ) )
+                 end do
+              end if
+           end if
+           ! make diagonal and superdiagonal elements real, storing them in d
+           ! and e
+           t = ab( ku+1, 1 )
+           loop_120: do i = 1, minmn
+              abst = abs( t )
+              d( i ) = abst
+              if( abst/=zero ) then
+                 t = t / abst
+              else
+                 t = cone
+              end if
+              if( wantq )call stdlib_zscal( m, t, q( 1, i ), 1 )
+              if( wantc )call stdlib_zscal( ncc, conjg( t ), c( i, 1 ), ldc )
+              if( i<minmn ) then
+                 if( ku==0 .and. kl==0 ) then
+                    e( i ) = zero
+                    t = ab( 1, i+1 )
+                 else
+                    if( ku==0 ) then
+                       t = ab( 2, i )*conjg( t )
+                    else
+                       t = ab( ku, i+1 )*conjg( t )
+                    end if
+                    abst = abs( t )
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( wantpt )call stdlib_zscal( n, t, pt( i+1, 1 ), ldpt )
+                    t = ab( ku+1, i+1 )*conjg( t )
+                 end if
+              end if
+           end do loop_120
+           return
+     end subroutine stdlib_zgbbrd
+
+     !> ZGBRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is banded, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_zgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kl<0 ) then
+              info = -3
+           else if( ku<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kl+ku+1 ) then
+              info = -7
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -9
+           else if( ldb<max( 1, n ) ) then
+              info = -12
+           else if( ldx<max( 1, n ) ) then
+              info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = min( kl+ku+2, n+1 )
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zgbmv( trans, n, n, kl, ku, -cone, ab, ldab, x( 1, j ), 1,cone, work, 1 &
+                        )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    kk = ku + 1 - k
+                    xk = cabs1( x( k, j ) )
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       rwork( i ) = rwork( i ) + cabs1( ab( kk+i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    kk = ku + 1 - k
+                    do i = max( 1, k-ku ), min( n, k+kl )
+                       s = s + cabs1( ab( kk+i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zgbtrs( trans, n, kl, ku, 1, afb, ldafb, ipiv, work, n,info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_zgbtrs( transt, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zgbtrs( transn, n, kl, ku, 1, afb, ldafb, ipiv,work, n, info )
+                              
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zgbrfs
+
+     !> ZGBSV: computes the solution to a complex system of linear equations
+     !> A * X = B, where A is a band matrix of order N with KL subdiagonals
+     !> and KU superdiagonals, and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as A = L * U, where L is a product of permutation
+     !> and unit lower triangular matrices with KL subdiagonals, and U is
+     !> upper triangular with KL+KU superdiagonals.  The factored form of A
+     !> is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zgbsv( n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( kl<0 ) then
+              info = -2
+           else if( ku<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<2*kl+ku+1 ) then
+              info = -6
+           else if( ldb<max( n, 1 ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBSV ', -info )
+              return
+           end if
+           ! compute the lu factorization of the band matrix a.
+           call stdlib_zgbtrf( n, n, kl, ku, ab, ldab, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zgbtrs( 'NO TRANSPOSE', n, kl, ku, nrhs, ab, ldab, ipiv,b, ldb, info )
+                        
+           end if
+           return
+     end subroutine stdlib_zgbsv
+
+     !> ZGBSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a band matrix of order N with KL subdiagonals and KU
+     !> superdiagonals, and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zgbsvx( fact, trans, n, kl, ku, nrhs, ab, ldab, afb,ldafb, ipiv, equed, r, &
+               c, b, ldb, x, ldx,rcond, ferr, berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kl, ku, ldab, ldafb, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(inout) :: c(*), r(*)
+           complex(dp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+        ! moved setting of info = n+1 so info does not subsequently get
+        ! overwritten.  sven, 17 mar 05.
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(dp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kl<0 ) then
+              info = -4
+           else if( ku<0 ) then
+              info = -5
+           else if( nrhs<0 ) then
+              info = -6
+           else if( ldab<kl+ku+1 ) then
+              info = -8
+           else if( ldafb<2*kl+ku+1 ) then
+              info = -10
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -12
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -13
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -14
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -16
+                 else if( ldx<max( 1, n ) ) then
+                    info = -18
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_zgbequ( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, infequ )
+                        
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_zlaqgb( n, n, kl, ku, ab, ldab, r, c, rowcnd, colcnd,amax, equed )
+                           
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of the band matrix a.
+              do j = 1, n
+                 j1 = max( j-ku, 1 )
+                 j2 = min( j+kl, n )
+                 call stdlib_zcopy( j2-j1+1, ab( ku+1-j+j1, j ), 1,afb( kl+ku+1-j+j1, j ), 1 )
+                           
+              end do
+              call stdlib_zgbtrf( n, n, kl, ku, afb, ldafb, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 anorm = zero
+                 do j = 1, info
+                    do i = max( ku+2-j, 1 ), min( n+ku+1-j, kl+ku+1 )
+                       anorm = max( anorm, abs( ab( i, j ) ) )
+                    end do
+                 end do
+                 rpvgrw = stdlib_zlantb( 'M', 'U', 'N', info, min( info-1, kl+ku ),afb( max( 1, &
+                           kl+ku+2-info ), 1 ), ldafb,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = anorm / rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_zlangb( norm, n, kl, ku, ab, ldab, rwork )
+           rpvgrw = stdlib_zlantb( 'M', 'U', 'N', n, kl+ku, afb, ldafb, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_zlangb( 'M', n, kl, ku, ab, ldab, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zgbcon( norm, n, kl, ku, afb, ldafb, ipiv, anorm, rcond,work, rwork, info )
+                     
+           ! compute the solution matrix x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zgbtrs( trans, n, kl, ku, nrhs, afb, ldafb, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_zgbrfs( trans, n, kl, ku, nrhs, ab, ldab, afb, ldafb, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_zgbsvx
+
+     !> ZGEBRD: reduces a general complex M-by-N matrix A to upper or lower
+     !> bidiagonal form B by a unitary transformation: Q**H * A * P = B.
+     !> If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal.
+
+     pure subroutine stdlib_zgebrd( m, n, a, lda, d, e, tauq, taup, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: taup(*), tauq(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, ldwrkx, ldwrky, lwkopt, minmn, nb, nbmin, nx, ws
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           nb = max( 1, stdlib_ilaenv( 1, 'ZGEBRD', ' ', m, n, -1, -1 ) )
+           lwkopt = ( m+n )*nb
+           work( 1 ) = real( lwkopt,KIND=dp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           else if( lwork<max( 1, m, n ) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info<0 ) then
+              call stdlib_xerbla( 'ZGEBRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           minmn = min( m, n )
+           if( minmn==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ws = max( m, n )
+           ldwrkx = m
+           ldwrky = n
+           if( nb>1 .and. nb<minmn ) then
+              ! set the crossover point nx.
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGEBRD', ' ', m, n, -1, -1 ) )
+              ! determine when to switch from blocked to unblocked code.
+              if( nx<minmn ) then
+                 ws = ( m+n )*nb
+                 if( lwork<ws ) then
+                    ! not enough work space for the optimal nb, consider using
+                    ! a smaller block size.
+                    nbmin = stdlib_ilaenv( 2, 'ZGEBRD', ' ', m, n, -1, -1 )
+                    if( lwork>=( m+n )*nbmin ) then
+                       nb = lwork / ( m+n )
+                    else
+                       nb = 1
+                       nx = minmn
+                    end if
+                 end if
+              end if
+           else
+              nx = minmn
+           end if
+           do i = 1, minmn - nx, nb
+              ! reduce rows and columns i:i+ib-1 to bidiagonal form and return
+              ! the matrices x and y which are needed to update the unreduced
+              ! part of the matrix
+              call stdlib_zlabrd( m-i+1, n-i+1, nb, a( i, i ), lda, d( i ), e( i ),tauq( i ), &
+                        taup( i ), work, ldwrkx,work( ldwrkx*nb+1 ), ldwrky )
+              ! update the trailing submatrix a(i+ib:m,i+ib:n), using
+              ! an update of the form  a := a - v*y**h - x*u**h
+              call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE', m-i-nb+1,n-i-nb+1, nb, -&
+              cone, a( i+nb, i ), lda,work( ldwrkx*nb+nb+1 ), ldwrky, cone,a( i+nb, i+nb ), lda )
+                        
+              call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-i-nb+1, n-i-nb+1,nb, -cone, &
+                        work( nb+1 ), ldwrkx, a( i, i+nb ), lda,cone, a( i+nb, i+nb ), lda )
+              ! copy diagonal and off-diagonal elements of b back into a
+              if( m>=n ) then
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j, j+1 ) = e( j )
+                 end do
+              else
+                 do j = i, i + nb - 1
+                    a( j, j ) = d( j )
+                    a( j+1, j ) = e( j )
+                 end do
+              end if
+           end do
+           ! use unblocked code to reduce the remainder of the matrix
+           call stdlib_zgebd2( m-i+1, n-i+1, a( i, i ), lda, d( i ), e( i ),tauq( i ), taup( i ), &
+                     work, iinfo )
+           work( 1 ) = ws
+           return
+     end subroutine stdlib_zgebrd
+
+     !> ZGEHRD: reduces a complex general matrix A to upper Hessenberg form H by
+     !> an unitary similarity transformation:  Q**H * A * Q = H .
+
+     pure subroutine stdlib_zgehrd( n, ilo, ihi, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, lda, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: nbmax = 64
+           integer(ilp), parameter :: ldt = nbmax+1
+           integer(ilp), parameter :: tsize = ldt*nbmax
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ib, iinfo, iwt, j, ldwork, lwkopt, nb, nbmin, nh, nx
+           complex(dp) :: ei
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -2
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              nb = min( nbmax, stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+              lwkopt = n*nb + tsize
+              work( 1 ) = lwkopt
+           endif
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEHRD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! set elements 1:ilo-1 and ihi:n-1 of tau to czero
+           do i = 1, ilo - 1
+              tau( i ) = czero
+           end do
+           do i = max( 1, ihi ), n - 1
+              tau( i ) = czero
+           end do
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = 1
+              return
+           end if
+           ! determine the block size
+           nb = min( nbmax, stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+           nbmin = 2
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to cross over from blocked to unblocked code
+              ! (last block is always handled by unblocked code)
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGEHRD', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code
+                 if( lwork<n*nb+tsize ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGEHRD', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=(n*nbmin + tsize) ) then
+                       nb = (lwork-tsize) / n
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           ldwork = n
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              i = ilo
+           else
+              ! use blocked code
+              iwt = 1 + n*nb
+              do i = ilo, ihi - 1 - nx, nb
+                 ib = min( nb, ihi-i )
+                 ! reduce columns i:i+ib-1 to hessenberg form, returning the
+                 ! matrices v and t of the block reflector h = i - v*t*v**h
+                 ! which performs the reduction, and also the matrix y = a*v*t
+                 call stdlib_zlahr2( ihi, i, ib, a( 1, i ), lda, tau( i ),work( iwt ), ldt, work, &
+                           ldwork )
+                 ! apply the block reflector h to a(1:ihi,i+ib:ihi) from the
+                 ! right, computing  a := a - y * v**h. v(i+ib,ib-1) must be set
+                 ! to 1
+                 ei = a( i+ib, i+ib-1 )
+                 a( i+ib, i+ib-1 ) = cone
+                 call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',ihi, ihi-i-ib+1,ib, -&
+                           cone, work, ldwork, a( i+ib, i ), lda, cone,a( 1, i+ib ), lda )
+                 a( i+ib, i+ib-1 ) = ei
+                 ! apply the block reflector h to a(1:i,i+1:i+ib-1) from the
+                 ! right
+                 call stdlib_ztrmm( 'RIGHT', 'LOWER', 'CONJUGATE TRANSPOSE','UNIT', i, ib-1,cone, &
+                           a( i+1, i ), lda, work, ldwork )
+                 do j = 0, ib-2
+                    call stdlib_zaxpy( i, -cone, work( ldwork*j+1 ), 1,a( 1, i+j+1 ), 1 )
+                 end do
+                 ! apply the block reflector h to a(i+1:ihi,i+ib:n) from the
+                 ! left
+                 call stdlib_zlarfb( 'LEFT', 'CONJUGATE TRANSPOSE', 'FORWARD','COLUMNWISE',ihi-i, &
+                 n-i-ib+1, ib, a( i+1, i ), lda,work( iwt ), ldt, a( i+1, i+ib ), lda,work, &
+                           ldwork )
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           call stdlib_zgehd2( n, i, ihi, a, lda, tau, work, iinfo )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zgehrd
+
+     !> ZGELQT: computes a blocked LQ factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_zgelqt( m, n, mb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, mb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           integer(ilp) :: i, ib, iinfo, k
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( mb<1 .or. (mb>min(m,n) .and. min(m,n)>0 ))then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<mb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELQT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  mb
+              ib = min( k-i+1, mb )
+           ! compute the lq factorization of the current block a(i:m,i:i+ib-1)
+              call stdlib_zgelqt3( ib, n-i+1, a(i,i), lda, t(1,i), ldt, iinfo )
+              if( i+ib<=m ) then
+           ! update by applying h**t to a(i:m,i+ib:n) from the right
+              call stdlib_zlarfb( 'R', 'N', 'F', 'R', m-i-ib+1, n-i+1, ib,a( i, i ), lda, t( 1, i &
+                        ), ldt,a( i+ib, i ), lda, work , m-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_zgelqt
+
+     !> ZGELS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, or its conjugate-transpose, using a QR
+     !> or LQ factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an underdetermined system A**H * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**H * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_zgels( trans, m, n, nrhs, a, lda, b, ldb, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tpsd
+           integer(ilp) :: brow, i, iascl, ibscl, j, mn, nb, scllen, wsize
+           real(dp) :: anrm, bignum, bnrm, smlnum
+           ! Local Arrays 
+           real(dp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or. stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           else if( lwork<max( 1, mn+max( mn, nrhs ) ) .and. .not.lquery )then
+              info = -10
+           end if
+           ! figure out optimal block size
+           if( info==0 .or. info==-10 ) then
+              tpsd = .true.
+              if( stdlib_lsame( trans, 'N' ) )tpsd = .false.
+              if( m>=n ) then
+                 nb = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMQR', 'LN', m, nrhs, n,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMQR', 'LC', m, nrhs, n,-1 ) )
+                 end if
+              else
+                 nb = stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1, -1 )
+                 if( tpsd ) then
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMLQ', 'LC', n, nrhs, m,-1 ) )
+                 else
+                    nb = max( nb, stdlib_ilaenv( 1, 'ZUNMLQ', 'LN', n, nrhs, m,-1 ) )
+                 end if
+              end if
+              wsize = max( 1, mn+max( mn, nrhs )*nb )
+              work( 1 ) = real( wsize,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELS ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              call stdlib_zlaset( 'FULL', max( m, n ), nrhs, czero, czero, b, ldb )
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_zlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if( tpsd )brow = n
+           bnrm = stdlib_zlange( 'M', brow, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if( m>=n ) then
+              ! compute qr factorization of a
+              call stdlib_zgeqrf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least n, optimally n*nb
+              if( .not.tpsd ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+                 call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, n, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+                 call stdlib_ztrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = n
+              else
+                 ! underdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**h) * b(1:n,1:nrhs)
+                 call stdlib_ztrtrs( 'UPPER', 'CONJUGATE TRANSPOSE','NON-UNIT',n, nrhs, a, lda, b,&
+                            ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = zero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_zunmqr( 'LEFT', 'NO TRANSPOSE', m, nrhs, n, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_zgelqf( m, n, a, lda, work( 1 ), work( mn+1 ), lwork-mn,info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tpsd ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_ztrtrs( 'LOWER', 'NO TRANSPOSE', 'NON-UNIT', m, nrhs,a, lda, b, ldb, &
+                           info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**h * b(1:m,1:nrhs)
+                 call stdlib_zunmlq( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, m, a,lda, work( 1 ), &
+                           b, ldb, work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**h * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_zunmlq( 'LEFT', 'NO TRANSPOSE', n, nrhs, m, a, lda,work( 1 ), b, ldb,&
+                            work( mn+1 ), lwork-mn,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**h) * b(1:m,1:nrhs)
+                 call stdlib_ztrtrs( 'LOWER', 'CONJUGATE TRANSPOSE', 'NON-UNIT',m, nrhs, a, lda, &
+                           b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( wsize,KIND=dp)
+           return
+     end subroutine stdlib_zgels
+
+     !> ZGEQP3: computes a QR factorization with column pivoting of a
+     !> matrix A:  A*P = Q*R  using Level 3 BLAS.
+
+     pure subroutine stdlib_zgeqp3( m, n, a, lda, jpvt, tau, work, lwork, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: tau(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: inb = 1
+           integer(ilp), parameter :: inbmin = 2
+           integer(ilp), parameter :: ixover = 3
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: fjb, iws, j, jb, lwkopt, minmn, minws, na, nb, nbmin, nfxd, nx, sm, &
+                     sminmn, sn, topbmn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+        ! ====================
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info==0 ) then
+              minmn = min( m, n )
+              if( minmn==0 ) then
+                 iws = 1
+                 lwkopt = 1
+              else
+                 iws = n + 1
+                 nb = stdlib_ilaenv( inb, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 lwkopt = ( n + 1 )*nb
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=dp)
+              if( ( lwork<iws ) .and. .not.lquery ) then
+                 info = -8
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQP3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! move initial columns up front.
+           nfxd = 1
+           do j = 1, n
+              if( jpvt( j )/=0 ) then
+                 if( j/=nfxd ) then
+                    call stdlib_zswap( m, a( 1, j ), 1, a( 1, nfxd ), 1 )
+                    jpvt( j ) = jpvt( nfxd )
+                    jpvt( nfxd ) = j
+                 else
+                    jpvt( j ) = j
+                 end if
+                 nfxd = nfxd + 1
+              else
+                 jpvt( j ) = j
+              end if
+           end do
+           nfxd = nfxd - 1
+           ! factorize fixed columns
+        ! =======================
+           ! compute the qr factorization of fixed columns and update
+           ! remaining columns.
+           if( nfxd>0 ) then
+              na = min( m, nfxd )
+      ! cc      call stdlib_zgeqr2( m, na, a, lda, tau, work, info )
+              call stdlib_zgeqrf( m, na, a, lda, tau, work, lwork, info )
+              iws = max( iws, int( work( 1 ),KIND=ilp) )
+              if( na<n ) then
+      ! cc         call stdlib_zunm2r( 'left', 'conjugate transpose', m, n-na,
+      ! cc  $                   na, a, lda, tau, a( 1, na+1 ), lda, work,
+      ! cc  $                   info )
+                 call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, n-na, na, a,lda, tau, a( 1,&
+                            na+1 ), lda, work, lwork,info )
+                 iws = max( iws, int( work( 1 ),KIND=ilp) )
+              end if
+           end if
+           ! factorize free columns
+        ! ======================
+           if( nfxd<minmn ) then
+              sm = m - nfxd
+              sn = n - nfxd
+              sminmn = minmn - nfxd
+              ! determine the block size.
+              nb = stdlib_ilaenv( inb, 'ZGEQRF', ' ', sm, sn, -1, -1 )
+              nbmin = 2
+              nx = 0
+              if( ( nb>1 ) .and. ( nb<sminmn ) ) then
+                 ! determine when to cross over from blocked to unblocked code.
+                 nx = max( 0, stdlib_ilaenv( ixover, 'ZGEQRF', ' ', sm, sn, -1,-1 ) )
+                 if( nx<sminmn ) then
+                    ! determine if workspace is large enough for blocked code.
+                    minws = ( sn+1 )*nb
+                    iws = max( iws, minws )
+                    if( lwork<minws ) then
+                       ! not enough workspace to use optimal nb: reduce nb and
+                       ! determine the minimum value of nb.
+                       nb = lwork / ( sn+1 )
+                       nbmin = max( 2, stdlib_ilaenv( inbmin, 'ZGEQRF', ' ', sm, sn,-1, -1 ) )
+                                 
+                    end if
+                 end if
+              end if
+              ! initialize partial column norms. the first n elements of work
+              ! store the exact column norms.
+              do j = nfxd + 1, n
+                 rwork( j ) = stdlib_dznrm2( sm, a( nfxd+1, j ), 1 )
+                 rwork( n+j ) = rwork( j )
+              end do
+              if( ( nb>=nbmin ) .and. ( nb<sminmn ) .and.( nx<sminmn ) ) then
+                 ! use blocked code initially.
+                 j = nfxd + 1
+                 ! compute factorization: while loop.
+                 topbmn = minmn - nx
+                 30 continue
+                 if( j<=topbmn ) then
+                    jb = min( nb, topbmn-j+1 )
+                    ! factorize jb columns among columns j:n.
+                    call stdlib_zlaqps( m, n-j+1, j-1, jb, fjb, a( 1, j ), lda,jpvt( j ), tau( j )&
+                              , rwork( j ),rwork( n+j ), work( 1 ), work( jb+1 ),n-j+1 )
+                    j = j + fjb
+                    go to 30
+                 end if
+              else
+                 j = nfxd + 1
+              end if
+              ! use unblocked code to factor the last or only block.
+              if( j<=minmn )call stdlib_zlaqp2( m, n-j+1, j-1, a( 1, j ), lda, jpvt( j ),tau( j ),&
+                         rwork( j ), rwork( n+j ), work( 1 ) )
+           end if
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zgeqp3
+
+     !> ZGEQRT: computes a blocked QR factorization of a complex M-by-N matrix A
+     !> using the compact WY representation of Q.
+
+     pure subroutine stdlib_zgeqrt( m, n, nb, a, lda, t, ldt, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, m, n, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*), work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk), parameter :: use_recursive_qr = .true.
+           integer(ilp) :: i, ib, iinfo, k
+           
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nb<1 .or. ( nb>min(m,n) .and. min(m,n)>0 ) )then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldt<nb ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEQRT', -info )
+              return
+           end if
+           ! quick return if possible
+           k = min( m, n )
+           if( k==0 ) return
+           ! blocked loop of length k
+           do i = 1, k,  nb
+              ib = min( k-i+1, nb )
+           ! compute the qr factorization of the current block a(i:m,i:i+ib-1)
+              if( use_recursive_qr ) then
+                 call stdlib_zgeqrt3( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              else
+                 call stdlib_zgeqrt2( m-i+1, ib, a(i,i), lda, t(1,i), ldt, iinfo )
+              end if
+              if( i+ib<=n ) then
+           ! update by applying h**h to a(i:m,i+ib:n) from the left
+                 call stdlib_zlarfb( 'L', 'C', 'F', 'C', m-i+1, n-i-ib+1, ib,a( i, i ), lda, t( 1,&
+                            i ), ldt,a( i, i+ib ), lda, work , n-i-ib+1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_zgeqrt
+
+     !> ZGERFS: improves the computed solution to a system of linear
+     !> equations and provides error bounds and backward error estimates for
+     !> the solution.
+
+     pure subroutine stdlib_zgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zgemv( trans, n, n, -cone, a, lda, x( 1, j ), 1, cone, work,1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(op(a))*abs(x) + abs(b).
+              if( notran ) then
+                 do k = 1, n
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                    end do
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    do i = 1, n
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zgetrs( trans, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_zgetrs( transt, n, 1, af, ldaf, ipiv, work, n,info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zgetrs( transn, n, 1, af, ldaf, ipiv, work, n,info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zgerfs
+
+     !> ZGETRF: computes an LU factorization of a general M-by-N matrix A
+     !> using partial pivoting with row interchanges.
+     !> The factorization has the form
+     !> A = P * L * U
+     !> where P is a permutation matrix, L is lower triangular with unit
+     !> diagonal elements (lower trapezoidal if m > n), and U is upper
+     !> triangular (upper trapezoidal if m < n).
+     !> This is the right-looking Level 3 BLAS version of the algorithm.
+
+     pure subroutine stdlib_zgetrf( m, n, a, lda, ipiv, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iinfo, j, jb, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, m ) ) then
+              info = -4
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           ! determine the block size for this environment.
+           nb = stdlib_ilaenv( 1, 'ZGETRF', ' ', m, n, -1, -1 )
+           if( nb<=1 .or. nb>=min( m, n ) ) then
+              ! use unblocked code.
+              call stdlib_zgetrf2( m, n, a, lda, ipiv, info )
+           else
+              ! use blocked code.
+              do j = 1, min( m, n ), nb
+                 jb = min( min( m, n )-j+1, nb )
+                 ! factor diagonal and subdiagonal blocks and test for exact
+                 ! singularity.
+                 call stdlib_zgetrf2( m-j+1, jb, a( j, j ), lda, ipiv( j ), iinfo )
+                 ! adjust info and the pivot indices.
+                 if( info==0 .and. iinfo>0 )info = iinfo + j - 1
+                 do i = j, min( m, j+jb-1 )
+                    ipiv( i ) = j - 1 + ipiv( i )
+                 end do
+                 ! apply interchanges to columns 1:j-1.
+                 call stdlib_zlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 )
+                 if( j+jb<=n ) then
+                    ! apply interchanges to columns j+jb:n.
+                    call stdlib_zlaswp( n-j-jb+1, a( 1, j+jb ), lda, j, j+jb-1,ipiv, 1 )
+                    ! compute block row of u.
+                    call stdlib_ztrsm( 'LEFT', 'LOWER', 'NO TRANSPOSE', 'UNIT', jb,n-j-jb+1, cone,&
+                               a( j, j ), lda, a( j, j+jb ),lda )
+                    if( j+jb<=m ) then
+                       ! update trailing submatrix.
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', m-j-jb+1,n-j-jb+1, jb, -&
+                       cone, a( j+jb, j ), lda,a( j, j+jb ), lda, cone, a( j+jb, j+jb ),lda )
+                                 
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zgetrf
+
+     !> ZGGGLM: solves a general Gauss-Markov linear model (GLM) problem:
+     !> minimize || y ||_2   subject to   d = A*x + B*y
+     !> x
+     !> where A is an N-by-M matrix, B is an N-by-P matrix, and d is a
+     !> given N-vector. It is assumed that M <= N <= M+P, and
+     !> rank(A) = M    and    rank( A B ) = N.
+     !> Under these assumptions, the constrained equation is always
+     !> consistent, and there is a unique solution x and a minimal 2-norm
+     !> solution y, which is obtained using a generalized QR factorization
+     !> of the matrices (A, B) given by
+     !> A = Q*(R),   B = Q*T*Z.
+     !> (0)
+     !> In particular, if matrix B is square nonsingular, then the problem
+     !> GLM is equivalent to the following weighted linear least squares
+     !> problem
+     !> minimize || inv(B)*(d-A*x) ||_2
+     !> x
+     !> where inv(B) denotes the inverse of B.
+
+     pure subroutine stdlib_zggglm( n, m, p, a, lda, b, ldb, d, x, y, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), d(*)
+           complex(dp), intent(out) :: work(*), x(*), y(*)
+        ! ===================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, lopt, lwkmin, lwkopt, nb, nb1, nb2, nb3, nb4, np
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           np = min( n, p )
+           lquery = ( lwork==-1 )
+           if( n<0 ) then
+              info = -1
+           else if( m<0 .or. m>n ) then
+              info = -2
+           else if( p<0 .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, m, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', n, m, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, m, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', n, m, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = m + np + max( n, p )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGGLM', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              do i = 1, m
+                 x(i) = czero
+              end do
+              do i = 1, p
+                 y(i) = czero
+              end do
+              return
+           end if
+           ! compute the gqr factorization of matrices a and b:
+                ! q**h*a = ( r11 ) m,    q**h*b*z**h = ( t11   t12 ) m
+                         ! (  0  ) n-m                 (  0    t22 ) n-m
+                            ! m                         m+p-n  n-m
+           ! where r11 and t22 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_zggqrf( n, m, p, a, lda, work, b, ldb, work( m+1 ),work( m+np+1 ), lwork-m-&
+                     np, info )
+           lopt = real( work( m+np+1 ),KIND=dp)
+           ! update left-hand-side vector d = q**h*d = ( d1 ) m
+                                                     ! ( d2 ) n-m
+           call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, m, a, lda, work,d, max( 1, n )&
+                     , work( m+np+1 ), lwork-m-np, info )
+           lopt = max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           ! solve t22*y2 = d2 for y2
+           if( n>m ) then
+              call stdlib_ztrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', n-m, 1,b( m+1, m+p-n+1 ), &
+                        ldb, d( m+1 ), n-m, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              call stdlib_zcopy( n-m, d( m+1 ), 1, y( m+p-n+1 ), 1 )
+           end if
+           ! set y1 = 0
+           do i = 1, m + p - n
+              y( i ) = czero
+           end do
+           ! update d1 = d1 - t12*y2
+           call stdlib_zgemv( 'NO TRANSPOSE', m, n-m, -cone, b( 1, m+p-n+1 ), ldb,y( m+p-n+1 ), 1,&
+                      cone, d, 1 )
+           ! solve triangular system: r11*x = d1
+           if( m>0 ) then
+              call stdlib_ztrtrs( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', m, 1, a, lda,d, m, info )
+                        
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! copy d to x
+              call stdlib_zcopy( m, d, 1, x, 1 )
+           end if
+           ! backward transformation y = z**h *y
+           call stdlib_zunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', p, 1, np,b( max( 1, n-p+1 ), 1 ), &
+                     ldb, work( m+1 ), y,max( 1, p ), work( m+np+1 ), lwork-m-np, info )
+           work( 1 ) = m + np + max( lopt, int( work( m+np+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_zggglm
+
+     !> ZGGHD3: reduces a pair of complex matrices (A,B) to generalized upper
+     !> Hessenberg form using unitary transformations, where A is a
+     !> general matrix and B is upper triangular.  The form of the
+     !> generalized eigenvalue problem is
+     !> A*x = lambda*B*x,
+     !> and B is typically made upper triangular by computing its QR
+     !> factorization and moving the unitary matrix Q to the left side
+     !> of the equation.
+     !> This subroutine simultaneously reduces A to a Hessenberg matrix H:
+     !> Q**H*A*Z = H
+     !> and transforms B to another upper triangular matrix T:
+     !> Q**H*B*Z = T
+     !> in order to reduce the problem to its standard form
+     !> H*y = lambda*T*y
+     !> where y = Z**H*x.
+     !> The unitary matrices Q and Z are determined as products of Givens
+     !> rotations.  They may either be formed explicitly, or they may be
+     !> postmultiplied into input matrices Q1 and Z1, so that
+     !> Q1 * A * Z1**H = (Q1*Q) * H * (Z1*Z)**H
+     !> Q1 * B * Z1**H = (Q1*Q) * T * (Z1*Z)**H
+     !> If Q1 is the unitary matrix from the QR factorization of B in the
+     !> original equation A*x = lambda*B*x, then ZGGHD3 reduces the original
+     !> problem to generalized Hessenberg form.
+     !> This is a blocked variant of CGGHRD, using matrix-matrix
+     !> multiplications for parts of the computation to enhance performance.
+
+     pure subroutine stdlib_zgghd3( compq, compz, n, ilo, ihi, a, lda, b, ldb, q,ldq, z, ldz, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz
+           integer(ilp), intent(in) :: ihi, ilo, lda, ldb, ldq, ldz, n, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: blk22, initq, initz, lquery, wantq, wantz
+           character :: compq2, compz2
+           integer(ilp) :: cola, i, ierr, j, j0, jcol, jj, jrow, k, kacc22, len, lwkopt, n2nb, nb,&
+                      nblst, nbmin, nh, nnb, nx, ppw, ppwo, pw, top, topq
+           real(dp) :: c
+           complex(dp) :: c1, c2, ctemp, s, s1, s2, temp, temp1, temp2, temp3
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,max
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           info = 0
+           nb = stdlib_ilaenv( 1, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+           lwkopt = max( 6*n*nb, 1 )
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           initq = stdlib_lsame( compq, 'I' )
+           wantq = initq .or. stdlib_lsame( compq, 'V' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 ) then
+              info = -4
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ( wantq .and. ldq<n ) .or. ldq<1 ) then
+              info = -11
+           else if( ( wantz .and. ldz<n ) .or. ldz<1 ) then
+              info = -13
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGHD3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! initialize q and z if desired.
+           if( initq )call stdlib_zlaset( 'ALL', n, n, czero, cone, q, ldq )
+           if( initz )call stdlib_zlaset( 'ALL', n, n, czero, cone, z, ldz )
+           ! zero out lower triangle of b.
+           if( n>1 )call stdlib_zlaset( 'LOWER', n-1, n-1, czero, czero, b(2, 1), ldb )
+           ! quick return if possible
+           nh = ihi - ilo + 1
+           if( nh<=1 ) then
+              work( 1 ) = cone
+              return
+           end if
+           ! determine the blocksize.
+           nbmin = stdlib_ilaenv( 2, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+           if( nb>1 .and. nb<nh ) then
+              ! determine when to use unblocked instead of blocked code.
+              nx = max( nb, stdlib_ilaenv( 3, 'ZGGHD3', ' ', n, ilo, ihi, -1 ) )
+              if( nx<nh ) then
+                 ! determine if workspace is large enough for blocked code.
+                 if( lwork<lwkopt ) then
+                    ! not enough workspace to use optimal nb:  determine the
+                    ! minimum value of nb, and reduce nb or force use of
+                    ! unblocked code.
+                    nbmin = max( 2, stdlib_ilaenv( 2, 'ZGGHD3', ' ', n, ilo, ihi,-1 ) )
+                    if( lwork>=6*n*nbmin ) then
+                       nb = lwork / ( 6*n )
+                    else
+                       nb = 1
+                    end if
+                 end if
+              end if
+           end if
+           if( nb<nbmin .or. nb>=nh ) then
+              ! use unblocked code below
+              jcol = ilo
+           else
+              ! use blocked code
+              kacc22 = stdlib_ilaenv( 16, 'ZGGHD3', ' ', n, ilo, ihi, -1 )
+              blk22 = kacc22==2
+              do jcol = ilo, ihi-2, nb
+                 nnb = min( nb, ihi-jcol-1 )
+                 ! initialize small unitary factors that will hold the
+                 ! accumulated givens rotations in workspace.
+                 ! n2nb   denotes the number of 2*nnb-by-2*nnb factors
+                 ! nblst  denotes the (possibly smaller) order of the last
+                        ! factor.
+                 n2nb = ( ihi-jcol-1 ) / nnb - 1
+                 nblst = ihi - jcol - n2nb*nnb
+                 call stdlib_zlaset( 'ALL', nblst, nblst, czero, cone, work, nblst )
+                 pw = nblst * nblst + 1
+                 do i = 1, n2nb
+                    call stdlib_zlaset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                              
+                    pw = pw + 4*nnb*nnb
+                 end do
+                 ! reduce columns jcol:jcol+nnb-1 of a to hessenberg form.
+                 do j = jcol, jcol+nnb-1
+                    ! reduce jth column of a. store cosines and sines in jth
+                    ! column of a and b, respectively.
+                    do i = ihi, j+2, -1
+                       temp = a( i-1, j )
+                       call stdlib_zlartg( temp, a( i, j ), c, s, a( i-1, j ) )
+                       a( i, j ) = cmplx( c,KIND=dp)
+                       b( i, j ) = s
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                    len  = 2 + j - jcol
+                    jrow = j + n2nb*nnb + 2
+                    do i = ihi, jrow, -1
+                       ctemp = a( i, j )
+                       s = b( i, j )
+                       do jj = ppw, ppw+len-1
+                          temp = work( jj + nblst )
+                          work( jj + nblst ) = ctemp*temp - s*work( jj )
+                          work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                       end do
+                       len = len + 1
+                       ppw = ppw - nblst - 1
+                    end do
+                    ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                    j0 = jrow - nnb
+                    do jrow = j0, j+2, -nnb
+                       ppw = ppwo
+                       len  = 2 + j - jcol
+                       do i = jrow+nnb-1, jrow, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + 2*nnb )
+                             work( jj + 2*nnb ) = ctemp*temp - s*work( jj )
+                             work( jj ) = conjg( s )*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - 2*nnb - 1
+                       end do
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    ! top denotes the number of top rows in a and b that will
+                    ! not be updated during the next steps.
+                    if( jcol<=2 ) then
+                       top = 0
+                    else
+                       top = jcol
+                    end if
+                    ! propagate transformations through b and replace stored
+                    ! left sines/cosines by right sines/cosines.
+                    do jj = n, j+1, -1
+                       ! update jjth column of b.
+                       do i = min( jj+1, ihi ), j+2, -1
+                          ctemp = a( i, j )
+                          s = b( i, j )
+                          temp = b( i, jj )
+                          b( i, jj ) = ctemp*temp - conjg( s )*b( i-1, jj )
+                          b( i-1, jj ) = s*temp + ctemp*b( i-1, jj )
+                       end do
+                       ! annihilate b( jj+1, jj ).
+                       if( jj<ihi ) then
+                          temp = b( jj+1, jj+1 )
+                          call stdlib_zlartg( temp, b( jj+1, jj ), c, s,b( jj+1, jj+1 ) )
+                          b( jj+1, jj ) = czero
+                          call stdlib_zrot( jj-top, b( top+1, jj+1 ), 1,b( top+1, jj ), 1, c, s )
+                                    
+                          a( jj+1, j ) = cmplx( c,KIND=dp)
+                          b( jj+1, j ) = -conjg( s )
+                       end if
+                    end do
+                    ! update a by transformations from right.
+                    jj = mod( ihi-j-1, 3 )
+                    do i = ihi-j-3, jj+1, -3
+                       ctemp = a( j+1+i, j )
+                       s = -b( j+1+i, j )
+                       c1 = a( j+2+i, j )
+                       s1 = -b( j+2+i, j )
+                       c2 = a( j+3+i, j )
+                       s2 = -b( j+3+i, j )
+                       do k = top+1, ihi
+                          temp = a( k, j+i  )
+                          temp1 = a( k, j+i+1 )
+                          temp2 = a( k, j+i+2 )
+                          temp3 = a( k, j+i+3 )
+                          a( k, j+i+3 ) = c2*temp3 + conjg( s2 )*temp2
+                          temp2 = -s2*temp3 + c2*temp2
+                          a( k, j+i+2 ) = c1*temp2 + conjg( s1 )*temp1
+                          temp1 = -s1*temp2 + c1*temp1
+                          a( k, j+i+1 ) = ctemp*temp1 + conjg( s )*temp
+                          a( k, j+i ) = -s*temp1 + ctemp*temp
+                       end do
+                    end do
+                    if( jj>0 ) then
+                       do i = jj, 1, -1
+                          c = real( a( j+1+i, j ),KIND=dp)
+                          call stdlib_zrot( ihi-top, a( top+1, j+i+1 ), 1,a( top+1, j+i ), 1, c,-&
+                                    conjg( b( j+1+i, j ) ) )
+                       end do
+                    end if
+                    ! update (j+1)th column of a by transformations from left.
+                    if ( j < jcol + nnb - 1 ) then
+                       len  = 1 + j - jcol
+                       ! multiply with the trailing accumulated unitary
+                       ! matrix, which takes the form
+                              ! [  u11  u12  ]
+                          ! u = [            ],
+                              ! [  u21  u22  ]
+                       ! where u21 is a len-by-len matrix and u12 is lower
+                       ! triangular.
+                       jrow = ihi - nblst + 1
+                       call stdlib_zgemv( 'CONJUGATE', nblst, len, cone, work,nblst, a( jrow, j+1 &
+                                 ), 1, czero,work( pw ), 1 )
+                       ppw = pw + len
+                       do i = jrow, jrow+nblst-len-1
+                          work( ppw ) = a( i, j+1 )
+                          ppw = ppw + 1
+                       end do
+                       call stdlib_ztrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT',nblst-len, work( &
+                                 len*nblst + 1 ), nblst,work( pw+len ), 1 )
+                       call stdlib_zgemv( 'CONJUGATE', len, nblst-len, cone,work( (len+1)*nblst - &
+                       len + 1 ), nblst,a( jrow+nblst-len, j+1 ), 1, cone,work( pw+len ), 1 )
+                                 
+                       ppw = pw
+                       do i = jrow, jrow+nblst-1
+                          a( i, j+1 ) = work( ppw )
+                          ppw = ppw + 1
+                       end do
+                       ! multiply with the other accumulated unitary
+                       ! matrices, which take the form
+                              ! [  u11  u12   0  ]
+                              ! [                ]
+                          ! u = [  u21  u22   0  ],
+                              ! [                ]
+                              ! [   0    0    i  ]
+                       ! where i denotes the (nnb-len)-by-(nnb-len) identity
+                       ! matrix, u21 is a len-by-len upper triangular matrix
+                       ! and u12 is an nnb-by-nnb lower triangular matrix.
+                       ppwo = 1 + nblst*nblst
+                       j0 = jrow - nnb
+                       do jrow = j0, jcol+1, -nnb
+                          ppw = pw + len
+                          do i = jrow, jrow+nnb-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          ppw = pw
+                          do i = jrow+nnb, jrow+nnb+len-1
+                             work( ppw ) = a( i, j+1 )
+                             ppw = ppw + 1
+                          end do
+                          call stdlib_ztrmv( 'UPPER', 'CONJUGATE', 'NON-UNIT', len,work( ppwo + &
+                                    nnb ), 2*nnb, work( pw ),1 )
+                          call stdlib_ztrmv( 'LOWER', 'CONJUGATE', 'NON-UNIT', nnb,work( ppwo + &
+                                    2*len*nnb ),2*nnb, work( pw + len ), 1 )
+                          call stdlib_zgemv( 'CONJUGATE', nnb, len, cone,work( ppwo ), 2*nnb, a( &
+                                    jrow, j+1 ), 1,cone, work( pw ), 1 )
+                          call stdlib_zgemv( 'CONJUGATE', len, nnb, cone,work( ppwo + 2*len*nnb + &
+                                    nnb ), 2*nnb,a( jrow+nnb, j+1 ), 1, cone,work( pw+len ), 1 )
+                          ppw = pw
+                          do i = jrow, jrow+len+nnb-1
+                             a( i, j+1 ) = work( ppw )
+                             ppw = ppw + 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end if
+                 end do
+                 ! apply accumulated unitary matrices to a.
+                 cola = n - jcol - nnb + 1
+                 j = ihi - nblst + 1
+                 call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', nblst,cola, nblst, cone, work, &
+                           nblst,a( j, jcol+nnb ), lda, czero, work( pw ),nblst )
+                 call stdlib_zlacpy( 'ALL', nblst, cola, work( pw ), nblst,a( j, jcol+nnb ), lda )
+                           
+                 ppwo = nblst*nblst + 1
+                 j0 = j - nnb
+                 do j = j0, jcol+1, -nnb
+                    if ( blk22 ) then
+                       ! exploit the structure of
+                              ! [  u11  u12  ]
+                          ! u = [            ]
+                              ! [  u21  u22  ],
+                       ! where all blocks are nnb-by-nnb, u21 is upper
+                       ! triangular and u12 is lower triangular.
+                       call stdlib_zunm22( 'LEFT', 'CONJUGATE', 2*nnb, cola, nnb,nnb, work( ppwo )&
+                                 , 2*nnb,a( j, jcol+nnb ), lda, work( pw ),lwork-pw+1, ierr )
+                    else
+                       ! ignore the structure of u.
+                       call stdlib_zgemm( 'CONJUGATE', 'NO TRANSPOSE', 2*nnb,cola, 2*nnb, cone, &
+                       work( ppwo ), 2*nnb,a( j, jcol+nnb ), lda, czero, work( pw ),2*nnb )
+                                 
+                       call stdlib_zlacpy( 'ALL', 2*nnb, cola, work( pw ), 2*nnb,a( j, jcol+nnb ),&
+                                  lda )
+                    end if
+                    ppwo = ppwo + 4*nnb*nnb
+                 end do
+                 ! apply accumulated unitary matrices to q.
+                 if( wantq ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, q( &
+                              topq, j ), ldq,work, nblst, czero, work( pw ), nh )
+                    call stdlib_zlacpy( 'ALL', nh, nblst, work( pw ), nh,q( topq, j ), ldq )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_zunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,q( topq, j ), ldq, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, q( topq, j ), ldq,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_zlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,q( topq, j ), ldq )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! accumulate right givens rotations if required.
+                 if ( wantz .or. top>0 ) then
+                    ! initialize small unitary factors that will hold the
+                    ! accumulated givens rotations in workspace.
+                    call stdlib_zlaset( 'ALL', nblst, nblst, czero, cone, work,nblst )
+                    pw = nblst * nblst + 1
+                    do i = 1, n2nb
+                       call stdlib_zlaset( 'ALL', 2*nnb, 2*nnb, czero, cone,work( pw ), 2*nnb )
+                                 
+                       pw = pw + 4*nnb*nnb
+                    end do
+                    ! accumulate givens rotations into workspace array.
+                    do j = jcol, jcol+nnb-1
+                       ppw  = ( nblst + 1 )*( nblst - 2 ) - j + jcol + 1
+                       len  = 2 + j - jcol
+                       jrow = j + n2nb*nnb + 2
+                       do i = ihi, jrow, -1
+                          ctemp = a( i, j )
+                          a( i, j ) = czero
+                          s = b( i, j )
+                          b( i, j ) = czero
+                          do jj = ppw, ppw+len-1
+                             temp = work( jj + nblst )
+                             work( jj + nblst ) = ctemp*temp -conjg( s )*work( jj )
+                             work( jj ) = s*temp + ctemp*work( jj )
+                          end do
+                          len = len + 1
+                          ppw = ppw - nblst - 1
+                       end do
+                       ppwo = nblst*nblst + ( nnb+j-jcol-1 )*2*nnb + nnb
+                       j0 = jrow - nnb
+                       do jrow = j0, j+2, -nnb
+                          ppw = ppwo
+                          len  = 2 + j - jcol
+                          do i = jrow+nnb-1, jrow, -1
+                             ctemp = a( i, j )
+                             a( i, j ) = czero
+                             s = b( i, j )
+                             b( i, j ) = czero
+                             do jj = ppw, ppw+len-1
+                                temp = work( jj + 2*nnb )
+                                work( jj + 2*nnb ) = ctemp*temp -conjg( s )*work( jj )
+                                work( jj ) = s*temp + ctemp*work( jj )
+                             end do
+                             len = len + 1
+                             ppw = ppw - 2*nnb - 1
+                          end do
+                          ppwo = ppwo + 4*nnb*nnb
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,a( jcol + 2, &
+                              jcol ), lda )
+                    call stdlib_zlaset( 'LOWER', ihi - jcol - 1, nnb, czero, czero,b( jcol + 2, &
+                              jcol ), ldb )
+                 end if
+                 ! apply accumulated unitary matrices to a and b.
+                 if ( top>0 ) then
+                    j = ihi - nblst + 1
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, a( &
+                              1, j ), lda,work, nblst, czero, work( pw ), top )
+                    call stdlib_zlacpy( 'ALL', top, nblst, work( pw ), top,a( 1, j ), lda )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_zunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,a( 1, j ), lda, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, a( 1, j ), lda,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_zlacpy( 'ALL', top, 2*nnb, work( pw ), top,a( 1, j ), lda )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                    j = ihi - nblst + 1
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,nblst, nblst, cone, b( &
+                              1, j ), ldb,work, nblst, czero, work( pw ), top )
+                    call stdlib_zlacpy( 'ALL', top, nblst, work( pw ), top,b( 1, j ), ldb )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_zunm22( 'RIGHT', 'NO TRANSPOSE', top, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,b( 1, j ), ldb, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', top,2*nnb, 2*nnb, &
+                          cone, b( 1, j ), ldb,work( ppwo ), 2*nnb, czero,work( pw ), top )
+                                    
+                          call stdlib_zlacpy( 'ALL', top, 2*nnb, work( pw ), top,b( 1, j ), ldb )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+                 ! apply accumulated unitary matrices to z.
+                 if( wantz ) then
+                    j = ihi - nblst + 1
+                    if ( initq ) then
+                       topq = max( 2, j - jcol + 1 )
+                       nh  = ihi - topq + 1
+                    else
+                       topq = 1
+                       nh = n
+                    end if
+                    call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,nblst, nblst, cone, z( &
+                              topq, j ), ldz,work, nblst, czero, work( pw ), nh )
+                    call stdlib_zlacpy( 'ALL', nh, nblst, work( pw ), nh,z( topq, j ), ldz )
+                              
+                    ppwo = nblst*nblst + 1
+                    j0 = j - nnb
+                    do j = j0, jcol+1, -nnb
+                          if ( initq ) then
+                          topq = max( 2, j - jcol + 1 )
+                          nh  = ihi - topq + 1
+                       end if
+                       if ( blk22 ) then
+                          ! exploit the structure of u.
+                          call stdlib_zunm22( 'RIGHT', 'NO TRANSPOSE', nh, 2*nnb,nnb, nnb, work( &
+                                    ppwo ), 2*nnb,z( topq, j ), ldz, work( pw ),lwork-pw+1, ierr )
+                       else
+                          ! ignore the structure of u.
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', nh,2*nnb, 2*nnb, &
+                          cone, z( topq, j ), ldz,work( ppwo ), 2*nnb, czero, work( pw ),nh )
+                                    
+                          call stdlib_zlacpy( 'ALL', nh, 2*nnb, work( pw ), nh,z( topq, j ), ldz )
+                                    
+                       end if
+                       ppwo = ppwo + 4*nnb*nnb
+                    end do
+                 end if
+              end do
+           end if
+           ! use unblocked code to reduce the rest of the matrix
+           ! avoid re-initialization of modified q and z.
+           compq2 = compq
+           compz2 = compz
+           if ( jcol/=ilo ) then
+              if ( wantq )compq2 = 'V'
+              if ( wantz )compz2 = 'V'
+           end if
+           if ( jcol<ihi )call stdlib_zgghrd( compq2, compz2, n, jcol, ihi, a, lda, b, ldb, q,ldq,&
+                      z, ldz, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zgghd3
+
+     !> ZGGLSE: solves the linear equality-constrained least squares (LSE)
+     !> problem:
+     !> minimize || c - A*x ||_2   subject to   B*x = d
+     !> where A is an M-by-N matrix, B is a P-by-N matrix, c is a given
+     !> M-vector, and d is a given P-vector. It is assumed that
+     !> P <= N <= M+P, and
+     !> rank(B) = P and  rank( (A) ) = N.
+     !> ( (B) )
+     !> These conditions ensure that the LSE problem has a unique solution,
+     !> which is obtained using a generalized RQ factorization of the
+     !> matrices (B, A) given by
+     !> B = (0 R)*Q,   A = Z*T*Q.
+
+     pure subroutine stdlib_zgglse( m, n, p, a, lda, b, ldb, c, d, x, work, lwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, p
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), c(*), d(*)
+           complex(dp), intent(out) :: work(*), x(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lopt, lwkmin, lwkopt, mn, nb, nb1, nb2, nb3, nb4, nr
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           info = 0
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( p<0 .or. p>n .or. p<n-m ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, p ) ) then
+              info = -7
+           end if
+           ! calculate workspace
+           if( info==0) then
+              if( n==0 ) then
+                 lwkmin = 1
+                 lwkopt = 1
+              else
+                 nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+                 nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+                 nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', m, n, p, -1 )
+                 nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, p, -1 )
+                 nb = max( nb1, nb2, nb3, nb4 )
+                 lwkmin = m + n + p
+                 lwkopt = p + mn + max( m, n )*nb
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGLSE', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! compute the grq factorization of matrices b and a:
+                  ! b*q**h = (  0  t12 ) p   z**h*a*q**h = ( r11 r12 ) n-p
+                              ! n-p  p                     (  0  r22 ) m+p-n
+                                                            ! n-p  p
+           ! where t12 and r11 are upper triangular, and q and z are
+           ! unitary.
+           call stdlib_zggrqf( p, m, n, b, ldb, work, a, lda, work( p+1 ),work( p+mn+1 ), lwork-p-&
+                     mn, info )
+           lopt = real( work( p+mn+1 ),KIND=dp)
+           ! update c = z**h *c = ( c1 ) n-p
+                             ! ( c2 ) m+p-n
+           call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, 1, mn, a, lda,work( p+1 ), c, &
+                     max( 1, m ), work( p+mn+1 ),lwork-p-mn, info )
+           lopt = max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           ! solve t12*x2 = d for x2
+           if( p>0 ) then
+              call stdlib_ztrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', p, 1,b( 1, n-p+1 ), ldb, d,&
+                         p, info )
+              if( info>0 ) then
+                 info = 1
+                 return
+              end if
+              ! put the solution in x
+              call stdlib_zcopy( p, d, 1, x( n-p+1 ), 1 )
+              ! update c1
+              call stdlib_zgemv( 'NO TRANSPOSE', n-p, p, -cone, a( 1, n-p+1 ), lda,d, 1, cone, c, &
+                        1 )
+           end if
+           ! solve r11*x1 = c1 for x1
+           if( n>p ) then
+              call stdlib_ztrtrs( 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', n-p, 1,a, lda, c, n-p, &
+                        info )
+              if( info>0 ) then
+                 info = 2
+                 return
+              end if
+              ! put the solutions in x
+              call stdlib_zcopy( n-p, c, 1, x, 1 )
+           end if
+           ! compute the residual vector:
+           if( m<n ) then
+              nr = m + p - n
+              if( nr>0 )call stdlib_zgemv( 'NO TRANSPOSE', nr, n-m, -cone, a( n-p+1, m+1 ),lda, d(&
+                         nr+1 ), 1, cone, c( n-p+1 ), 1 )
+           else
+              nr = p
+           end if
+           if( nr>0 ) then
+              call stdlib_ztrmv( 'UPPER', 'NO TRANSPOSE', 'NON UNIT', nr,a( n-p+1, n-p+1 ), lda, &
+                        d, 1 )
+              call stdlib_zaxpy( nr, -cone, d, 1, c( n-p+1 ), 1 )
+           end if
+           ! backward transformation x = q**h*x
+           call stdlib_zunmrq( 'LEFT', 'CONJUGATE TRANSPOSE', n, 1, p, b, ldb,work( 1 ), x, n, &
+                     work( p+mn+1 ), lwork-p-mn, info )
+           work( 1 ) = p + mn + max( lopt, int( work( p+mn+1 ),KIND=ilp) )
+           return
+     end subroutine stdlib_zgglse
+
+     !> ZGTCON: estimates the reciprocal of the condition number of a complex
+     !> tridiagonal matrix A using the LU factorization as computed by
+     !> ZGTTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zgtcon( norm, n, dl, d, du, du2, ipiv, anorm, rcond,work, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: norm
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: d(*), dl(*), du(*), du2(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: onenrm
+           integer(ilp) :: i, kase, kase1
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: cmplx
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           onenrm = norm=='1' .or. stdlib_lsame( norm, 'O' )
+           if( .not.onenrm .and. .not.stdlib_lsame( norm, 'I' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm==zero ) then
+              return
+           end if
+           ! check that d(1:n) is non-zero.
+           do i = 1, n
+              if( d( i )==cmplx( zero,KIND=dp) )return
+           end do
+           ainvnm = zero
+           if( onenrm ) then
+              kase1 = 1
+           else
+              kase1 = 2
+           end if
+           kase = 0
+           20 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              if( kase==kase1 ) then
+                 ! multiply by inv(u)*inv(l).
+                 call stdlib_zgttrs( 'NO TRANSPOSE', n, 1, dl, d, du, du2, ipiv,work, n, info )
+                           
+              else
+                 ! multiply by inv(l**h)*inv(u**h).
+                 call stdlib_zgttrs( 'CONJUGATE TRANSPOSE', n, 1, dl, d, du, du2,ipiv, work, n, &
+                           info )
+              end if
+              go to 20
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zgtcon
+
+     !> ZGTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is tridiagonal, and provides
+     !> error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_zgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2,ipiv, b, ldb, x, &
+               ldx, ferr, berr, work, rwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: b(ldb,*), d(*), df(*), dl(*), dlf(*), du(*), du2(*), duf(*)
+                     
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: notran
+           character :: transn, transt
+           integer(ilp) :: count, i, j, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( trans, &
+                     'C' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -13
+           else if( ldx<max( 1, n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           if( notran ) then
+              transn = 'N'
+              transt = 'C'
+           else
+              transn = 'C'
+              transt = 'N'
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_110: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - op(a) * x,
+              ! where op(a) = a, a**t, or a**h, depending on trans.
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zlagtm( trans, n, 1, -one, dl, d, du, x( 1, j ), ldx, one,work, n )
+                        
+              ! compute abs(op(a))*abs(x) + abs(b) for use in the backward
+              ! error bound.
+              if( notran ) then
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              du( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( dl( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( du( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( dl( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              else
+                 if( n==1 ) then
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) )
+                 else
+                    rwork( 1 ) = cabs1( b( 1, j ) ) +cabs1( d( 1 ) )*cabs1( x( 1, j ) ) +cabs1( &
+                              dl( 1 ) )*cabs1( x( 2, j ) )
+                    do i = 2, n - 1
+                       rwork( i ) = cabs1( b( i, j ) ) +cabs1( du( i-1 ) )*cabs1( x( i-1, j ) ) +&
+                       cabs1( d( i ) )*cabs1( x( i, j ) ) +cabs1( dl( i ) )*cabs1( x( i+1, j ) )
+                                 
+                    end do
+                    rwork( n ) = cabs1( b( n, j ) ) +cabs1( du( n-1 ) )*cabs1( x( n-1, j ) ) +&
+                              cabs1( d( n ) )*cabs1( x( n, j ) )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(op(a))*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zgttrs( trans, n, 1, dlf, df, duf, du2, ipiv, work, n,info )
+                 call stdlib_zaxpy( n, cmplx( one,KIND=dp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(op(a)))*
+                 ! ( abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(op(a)) is the inverse of op(a)
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(op(a))*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(op(a))*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(op(a)) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(op(a))*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              70 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(op(a)**h).
+                    call stdlib_zgttrs( transt, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else
+                    ! multiply by inv(op(a))*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zgttrs( transn, n, 1, dlf, df, duf, du2, ipiv, work,n, info )
+                              
+                 end if
+                 go to 70
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_110
+           return
+     end subroutine stdlib_zgtrfs
+
+     !> ZGTSVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+     !> where A is a tridiagonal matrix of order N and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_zgtsvx( fact, trans, n, nrhs, dl, d, du, dlf, df, duf,du2, ipiv, b, &
+               ldb, x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: b(ldb,*), d(*), dl(*), du(*)
+           complex(dp), intent(inout) :: df(*), dlf(*), du2(*), duf(*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact, notran
+           character :: norm
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -14
+           else if( ldx<max( 1, n ) ) then
+              info = -16
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the lu factorization of a.
+              call stdlib_zcopy( n, d, 1, df, 1 )
+              if( n>1 ) then
+                 call stdlib_zcopy( n-1, dl, 1, dlf, 1 )
+                 call stdlib_zcopy( n-1, du, 1, duf, 1 )
+              end if
+              call stdlib_zgttrf( n, dlf, df, duf, du2, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_zlangt( norm, n, dl, d, du )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zgtcon( norm, n, dlf, df, duf, du2, ipiv, anorm, rcond, work,info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zgttrs( trans, n, nrhs, dlf, df, duf, du2, ipiv, x, ldx,info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zgtrfs( trans, n, nrhs, dl, d, du, dlf, df, duf, du2, ipiv,b, ldb, x, ldx, &
+                     ferr, berr, work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zgtsvx
+
+     !> ZHBGST: reduces a complex Hermitian-definite banded generalized
+     !> eigenproblem  A*x = lambda*B*x  to standard form  C*y = lambda*y,
+     !> such that C has the same bandwidth as A.
+     !> B must have been previously factorized as S**H*S by ZPBSTF, using a
+     !> split Cholesky factorization. A is overwritten by C = X**H*A*X, where
+     !> X = S**(-1)*Q and Q is a unitary matrix chosen to preserve the
+     !> bandwidth of A.
+
+     pure subroutine stdlib_zhbgst( vect, uplo, n, ka, kb, ab, ldab, bb, ldbb, x,ldx, work, rwork,&
+                info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldx, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+           complex(dp), intent(in) :: bb(ldbb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: update, upper, wantx
+           integer(ilp) :: i, i0, i1, i2, inca, j, j1, j1t, j2, j2t, k, ka1, kb1, kbt, l, m, nr, &
+                     nrt, nx
+           real(dp) :: bii
+           complex(dp) :: ra, ra1, t
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           wantx = stdlib_lsame( vect, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           ka1 = ka + 1
+           kb1 = kb + 1
+           info = 0
+           if( .not.wantx .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldx<1 .or. wantx .and. ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGST', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           inca = ldab*ka1
+           ! initialize x to the unit matrix, if needed
+           if( wantx )call stdlib_zlaset( 'FULL', n, n, czero, cone, x, ldx )
+           ! set m to the splitting point m. it must be the same value as is
+           ! used in stdlib_zpbstf. the chosen value allows the arrays work and rwork
+           ! to be of dimension (n).
+           m = ( n+kb ) / 2
+           ! the routine works in two phases, corresponding to the two halves
+           ! of the split cholesky factorization of b as s**h*s where
+           ! s = ( u    )
+               ! ( m  l )
+           ! with u upper triangular of order m, and l lower triangular of
+           ! order n-m. s has the same bandwidth as b.
+           ! s is treated as a product of elementary matrices:
+           ! s = s(m)*s(m-1)*...*s(2)*s(1)*s(m+1)*s(m+2)*...*s(n-1)*s(n)
+           ! where s(i) is determined by the i-th row of s.
+           ! in phase 1, the index i takes the values n, n-1, ... , m+1;
+           ! in phase 2, it takes the values 1, 2, ... , m.
+           ! for each value of i, the current matrix a is updated by forming
+           ! inv(s(i))**h*a*inv(s(i)). this creates a triangular bulge outside
+           ! the band of a. the bulge is then pushed down toward the bottom of
+           ! a in phase 1, and up toward the top of a in phase 2, by applying
+           ! plane rotations.
+           ! there are kb*(kb+1)/2 elements in the bulge, but at most 2*kb-1
+           ! of them are linearly independent, so annihilating a bulge requires
+           ! only 2*kb-1 plane rotations. the rotations are divided into a 1st
+           ! set of kb-1 rotations, and a 2nd set of kb rotations.
+           ! wherever possible, rotations are generated and applied in vector
+           ! operations of length nr between the indices j1 and j2 (sometimes
+           ! replaced by modified values nrt, j1t or j2t).
+           ! the real cosines and complex sines of the rotations are stored in
+           ! the arrays rwork and work, those of the 1st set in elements
+           ! 2:m-kb-1, and those of the 2nd set in elements m-kb+1:n.
+           ! the bulges are not formed explicitly; nonzero elements outside the
+           ! band are created only when they are required for generating new
+           ! rotations; they are stored in the array work, in positions where
+           ! they are later overwritten by the sines of the rotations which
+           ! annihilate them.
+           ! **************************** phase 1 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = n, m + 1, -1
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! update = .false.
+           ! do i = m + ka + 1, n - 1
+              ! apply rotations to push all bulges ka positions downward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = n + 1
+           10 continue
+           if( update ) then
+              i = i - 1
+              kbt = min( kb, i-1 )
+              i0 = i - 1
+              i1 = min( n, i+ka )
+              i2 = i - kbt + ka1
+              if( i<m+1 ) then
+                 update = .false.
+                 i = i + 1
+                 i0 = m
+                 if( ka==0 )go to 480
+                 go to 10
+              end if
+           else
+              i = i + ka
+              if( i>n-1 )go to 480
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=dp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=dp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( j-i+kb1, i )*conjg( ab( k-i+ka1, &
+                       i ) ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1, i ) +real( ab( ka1, i ),&
+                                 KIND=dp)*bb( j-i+kb1, i )*conjg( bb( k-i+kb1, i ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -conjg( bb( k-i+kb1, i ) )*ab( j-i+ka1,&
+                                  i )
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( k-i+kb1, i )*ab( i-j+ka1, j )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_zdscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_zgerc( n-m, kbt, -cone, x( m+1, i ), 1,bb( kb1-kbt, i )&
+                              , 1, x( m+1, i-kbt ),ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+ka1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_130: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i,i-k+ka+1)
+                       call stdlib_zlartg( ab( k+1, i-k+ka ), ra1,rwork( i-k+ka-m ), work( i-k+ka-&
+                                 m ), ra )
+                       ! create nonzero element a(i-k,i-k+ka+1) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( kb1-k, i )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( 1, i-k+ka &
+                                 )
+                       ab( 1, i-k+ka ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( 1, i-k+ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j-m )*ab( 1, j+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_zlargv( nrt, ab( 1, j2t ), inca, work( j2t-m ), ka1,rwork(&
+                            j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2-m ),work( j2-m ), ka1 )
+                    call stdlib_zlacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_zrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), &
+                                 conjg( work( j-m ) ) )
+                    end do
+                 end if
+              end do loop_130
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt,i-kbt+ka+1) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kb1-kbt, i )*ra1
+                 end if
+              end if
+              loop_170: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j2-l+1 ), inca,ab( l+1, j2-l+1 ), &
+                              inca, rwork( j2-ka ),work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j-ka,j+1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+1 )
+                    ab( 1, j+1 ) = rwork( j )*ab( 1, j+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_170
+              loop_210: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_zlargv( nr, ab( 1, j2 ), inca, work( j2 ), ka1,rwork( j2 ), ka1 )
+                              
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( ka1-l, j2 ), inca,ab( ka-l, j2+1 ), inca, &
+                                 rwork( j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( ka1, j2 ), ab( ka1, j2+1 ),ab( ka, j2+1 ), inca, &
+                              rwork( j2 ),work( j2 ), ka1 )
+                    call stdlib_zlacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_zrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), conjg( &
+                                 work( j ) ) )
+                    end do
+                 end if
+              end do loop_210
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j2+ka1-l ), inca,ab( l+1, j2+ka1-l &
+                              ), inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=dp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=dp) / bii ) / bii
+                 do j = i + 1, i1
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do j = max( 1, i-ka ), i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do k = i - kbt, i - 1
+                    do j = i - kbt, k
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( i-j+1, j )*conjg( ab( i-k+1,k ) ) - &
+                       conjg( bb( i-k+1, k ) )*ab( i-j+1, j ) + real( ab( 1, i ),KIND=dp)*bb( i-j+&
+                                 1, j )*conjg( bb( i-k+1,k ) )
+                    end do
+                    do j = max( 1, i-ka ), i - kbt - 1
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -conjg( bb( i-k+1, k ) )*ab( i-j+1, j )
+                                 
+                    end do
+                 end do
+                 do j = i, i1
+                    do k = max( j-ka, i-kbt ), i - 1
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( i-k+1, k )*ab( j-i+1, i )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_zdscal( n-m, one / bii, x( m+1, i ), 1 )
+                    if( kbt>0 )call stdlib_zgeru( n-m, kbt, -cone, x( m+1, i ), 1,bb( kbt+1, i-&
+                              kbt ), ldbb-1,x( m+1, i-kbt ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions down toward the bottom of the
+              ! band
+              loop_360: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i-k+ka<n .and. i-k>1 ) then
+                       ! generate rotation to annihilate a(i-k+ka+1,i)
+                       call stdlib_zlartg( ab( ka1-k, i ), ra1, rwork( i-k+ka-m ),work( i-k+ka-m )&
+                                 , ra )
+                       ! create nonzero element a(i-k+ka+1,i-k) outside the
+                       ! band and store it in work(i-k)
+                       t = -bb( k+1, i-k )*ra1
+                       work( i-k ) = rwork( i-k+ka-m )*t -conjg( work( i-k+ka-m ) )*ab( ka1, i-k )
+                                 
+                       ab( ka1, i-k ) = work( i-k+ka-m )*t +rwork( i-k+ka-m )*ab( ka1, i-k )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = max( j2, i+2*ka-k+1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( n-j2t+ka ) / ka1
+                 do j = j2t, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j-m)
+                    work( j-m ) = work( j-m )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j-m )*ab( ka1, j-ka+1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_zlargv( nrt, ab( ka1, j2t-ka ), inca, work( j2t-m ),ka1, &
+                           rwork( j2t-m ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2-m ),work( j2-m ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2-m ), work( j2-m ), ka1 )
+                    call stdlib_zlacgv( nr, work( j2-m ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j2, j1, ka1
+                       call stdlib_zrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j-m ), work(&
+                                  j-m ) )
+                    end do
+                 end if
+              end do loop_360
+              if( update ) then
+                 if( i2<=n .and. kbt>0 ) then
+                    ! create nonzero element a(i-kbt+ka+1,i-kbt) outside the
+                    ! band and store it in work(i-kbt)
+                    work( i-kbt ) = -bb( kbt+1, i-kbt )*ra1
+                 end if
+              end if
+              loop_400: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i - k - 1 + max( 2, k-i0+1 )*ka1
+                 else
+                    j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+ka+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j2-ka ), inca,ab( ka1-l, j2-&
+                              ka+1 ), inca,rwork( j2-ka ), work( j2-ka ), ka1 )
+                 end do
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 do j = j1, j2, -ka1
+                    work( j ) = work( j-ka )
+                    rwork( j ) = rwork( j-ka )
+                 end do
+                 do j = j2, j1, ka1
+                    ! create nonzero element a(j+1,j-ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-ka+1 )
+                    ab( ka1, j-ka+1 ) = rwork( j )*ab( ka1, j-ka+1 )
+                 end do
+                 if( update ) then
+                    if( i-k<n-ka .and. k<=kbt )work( i-k+ka ) = work( i-k )
+                 end if
+              end do loop_400
+              loop_440: do k = kb, 1, -1
+                 j2 = i - k - 1 + max( 1, k-i0+1 )*ka1
+                 nr = ( n-j2+ka ) / ka1
+                 j1 = j2 + ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_zlargv( nr, ab( ka1, j2-ka ), inca, work( j2 ), ka1,rwork( j2 ), &
+                              ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( l+1, j2-l ), inca,ab( l+2, j2-l ), inca, rwork(&
+                                  j2 ),work( j2 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( 1, j2 ), ab( 1, j2+1 ), ab( 2, j2 ),inca, rwork( &
+                              j2 ), work( j2 ), ka1 )
+                    call stdlib_zlacgv( nr, work( j2 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2 ),work( j2 ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j2, j1, ka1
+                       call stdlib_zrot( n-m, x( m+1, j ), 1, x( m+1, j+1 ), 1,rwork( j ), work( &
+                                 j ) )
+                    end do
+                 end if
+              end do loop_440
+              do k = 1, kb - 1
+                 j2 = i - k - 1 + max( 1, k-i0+2 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( n-j2+l ) / ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j2 ), inca,ab( ka1-l, j2+1 ),&
+                               inca, rwork( j2-m ),work( j2-m ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = n - 1, j2 + ka, -1
+                    rwork( j-m ) = rwork( j-ka-m )
+                    work( j-m ) = work( j-ka-m )
+                 end do
+              end if
+           end if
+           go to 10
+           480 continue
+           ! **************************** phase 2 *****************************
+           ! the logical structure of this phase is:
+           ! update = .true.
+           ! do i = 1, m
+              ! use s(i) to update a and create a new bulge
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! update = .false.
+           ! do i = m - ka - 1, 2, -1
+              ! apply rotations to push all bulges ka positions upward
+           ! end do
+           ! to avoid duplicating code, the two loops are merged.
+           update = .true.
+           i = 0
+           490 continue
+           if( update ) then
+              i = i + 1
+              kbt = min( kb, m-i )
+              i0 = i + 1
+              i1 = max( 1, i-ka )
+              i2 = i + kbt - ka1
+              if( i>m ) then
+                 update = .false.
+                 i = i - 1
+                 i0 = m + 1
+                 if( ka==0 )return
+                 go to 490
+              end if
+           else
+              i = i - ka
+              if( i<2 )return
+           end if
+           if( i<m-kbt ) then
+              nx = m
+           else
+              nx = n
+           end if
+           if( upper ) then
+              ! transform a, working with the upper triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( kb1, i ),KIND=dp)
+                 ab( ka1, i ) = ( real( ab( ka1, i ),KIND=dp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( j-i+ka1, i ) = ab( j-i+ka1, i ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( i-j+ka1, j ) = ab( i-j+ka1, j ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -bb( i-j+kb1, j )*conjg( ab( i-k+ka1, &
+                       k ) ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1, j ) +real( ab( ka1, i ),&
+                                 KIND=dp)*bb( i-j+kb1, j )*conjg( bb( i-k+kb1, k ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( k-j+ka1, j ) = ab( k-j+ka1, j ) -conjg( bb( i-k+kb1, k ) )*ab( i-j+ka1,&
+                                  j )
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( j-k+ka1, k ) = ab( j-k+ka1, k ) -bb( i-k+kb1, k )*ab( j-i+ka1, i )
+                                 
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_zdscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_zgeru( nx, kbt, -cone, x( 1, i ), 1,bb( kb, i+1 ), &
+                              ldbb-1, x( 1, i+1 ), ldx )
+                 end if
+                 ! store a(i1,i) in ra1 for use in next loop over k
+                 ra1 = ab( i1-i+ka1, i )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_610: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i+k-ka-1,i)
+                       call stdlib_zlartg( ab( k+1, i ), ra1, rwork( i+k-ka ),work( i+k-ka ), ra )
+                                 
+                       ! create nonzero element a(i+k-ka-1,i+k) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( kb1-k, i+k )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( 1, i+k )
+                                 
+                       ab( 1, i+k ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( 1, i+k )
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( j )*ab( 1, j+ka-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_zlargv( nrt, ab( 1, j1+ka ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the left
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( j1 ),work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( j1 ), work( j1 ),ka1 )
+                    call stdlib_zlacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_zrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), work( j ) )
+                                 
+                    end do
+                 end if
+              end do loop_610
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt-ka-1,i+kbt) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kb1-kbt, i+kbt )*ra1
+                 end if
+              end if
+              loop_650: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j1t+ka ), inca,ab( l+1, j1t+ka-1 ),&
+                               inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j-1,j+ka) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( 1, j+ka-1 )
+                    ab( 1, j+ka-1 ) = rwork( m-kb+j )*ab( 1, j+ka-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_650
+              loop_690: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_zlargv( nr, ab( 1, j1+ka ), inca, work( m-kb+j1 ),ka1, rwork( m-&
+                              kb+j1 ), ka1 )
+                    ! apply rotations in 2nd set from the left
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( ka1-l, j1+l ), inca,ab( ka-l, j1+l ), inca, &
+                                 rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( ka1, j1 ), ab( ka1, j1-1 ),ab( ka, j1 ), inca, &
+                              rwork( m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_zlacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the right
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                              rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_zrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), work( &
+                                 m-kb+j ) )
+                    end do
+                 end if
+              end do loop_690
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the right
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( l, j1t ), inca,ab( l+1, j1t-1 ), inca,&
+                               rwork( j1t ),work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           else
+              ! transform a, working with the lower triangle
+              if( update ) then
+                 ! form  inv(s(i))**h * a * inv(s(i))
+                 bii = real( bb( 1, i ),KIND=dp)
+                 ab( 1, i ) = ( real( ab( 1, i ),KIND=dp) / bii ) / bii
+                 do j = i1, i - 1
+                    ab( i-j+1, j ) = ab( i-j+1, j ) / bii
+                 end do
+                 do j = i + 1, min( n, i+ka )
+                    ab( j-i+1, i ) = ab( j-i+1, i ) / bii
+                 end do
+                 do k = i + 1, i + kbt
+                    do j = k, i + kbt
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -bb( j-i+1, i )*conjg( ab( k-i+1,i ) ) - &
+                       conjg( bb( k-i+1, i ) )*ab( j-i+1, i ) + real( ab( 1, i ),KIND=dp)*bb( j-i+&
+                                 1, i )*conjg( bb( k-i+1,i ) )
+                    end do
+                    do j = i + kbt + 1, min( n, i+ka )
+                       ab( j-k+1, k ) = ab( j-k+1, k ) -conjg( bb( k-i+1, i ) )*ab( j-i+1, i )
+                                 
+                    end do
+                 end do
+                 do j = i1, i
+                    do k = i + 1, min( j+ka, i+kbt )
+                       ab( k-j+1, j ) = ab( k-j+1, j ) -bb( k-i+1, i )*ab( i-j+1, j )
+                    end do
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by inv(s(i))
+                    call stdlib_zdscal( nx, one / bii, x( 1, i ), 1 )
+                    if( kbt>0 )call stdlib_zgerc( nx, kbt, -cone, x( 1, i ), 1, bb( 2, i ),1, x( &
+                              1, i+1 ), ldx )
+                 end if
+                 ! store a(i,i1) in ra1 for use in next loop over k
+                 ra1 = ab( i-i1+1, i1 )
+              end if
+              ! generate and apply vectors of rotations to chase all the
+              ! existing bulges ka positions up toward the top of the band
+              loop_840: do k = 1, kb - 1
+                 if( update ) then
+                    ! determine the rotations which would annihilate the bulge
+                    ! which has in theory just been created
+                    if( i+k-ka1>0 .and. i+k<m ) then
+                       ! generate rotation to annihilate a(i,i+k-ka-1)
+                       call stdlib_zlartg( ab( ka1-k, i+k-ka ), ra1,rwork( i+k-ka ), work( i+k-ka &
+                                 ), ra )
+                       ! create nonzero element a(i+k,i+k-ka-1) outside the
+                       ! band and store it in work(m-kb+i+k)
+                       t = -bb( k+1, i )*ra1
+                       work( m-kb+i+k ) = rwork( i+k-ka )*t -conjg( work( i+k-ka ) )*ab( ka1, i+k-&
+                                 ka )
+                       ab( ka1, i+k-ka ) = work( i+k-ka )*t +rwork( i+k-ka )*ab( ka1, i+k-ka )
+                                 
+                       ra1 = ra
+                    end if
+                 end if
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( update ) then
+                    j2t = min( j2, i-2*ka+k-1 )
+                 else
+                    j2t = j2
+                 end if
+                 nrt = ( j2t+ka-1 ) / ka1
+                 do j = j1, j2t, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(j)
+                    work( j ) = work( j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( j )*ab( ka1, j-1 )
+                 end do
+                 ! generate rotations in 1st set to annihilate elements which
+                 ! have been created outside the band
+                 if( nrt>0 )call stdlib_zlargv( nrt, ab( ka1, j1 ), inca, work( j1 ), ka1,rwork( &
+                           j1 ), ka1 )
+                 if( nr>0 ) then
+                    ! apply rotations in 1st set from the right
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 j1 ), work( j1 ), ka1 )
+                    end do
+                    ! apply rotations in 1st set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               j1 ),work( j1 ), ka1 )
+                    call stdlib_zlacgv( nr, work( j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 1st set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 1st set
+                    do j = j1, j2, ka1
+                       call stdlib_zrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( j ), conjg( work(&
+                                  j ) ) )
+                    end do
+                 end if
+              end do loop_840
+              if( update ) then
+                 if( i2>0 .and. kbt>0 ) then
+                    ! create nonzero element a(i+kbt,i+kbt-ka-1) outside the
+                    ! band and store it in work(m-kb+i+kbt)
+                    work( m-kb+i+kbt ) = -bb( kbt+1, i )*ra1
+                 end if
+              end if
+              loop_880: do k = kb, 1, -1
+                 if( update ) then
+                    j2 = i + k + 1 - max( 2, k+i0-m )*ka1
+                 else
+                    j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 end if
+                 ! finish applying rotations in 2nd set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+ka+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j1t+l-1 ), inca,ab( ka1-l, &
+                              j1t+l-1 ), inca,rwork( m-kb+j1t+ka ),work( m-kb+j1t+ka ), ka1 )
+                 end do
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 do j = j1, j2, ka1
+                    work( m-kb+j ) = work( m-kb+j+ka )
+                    rwork( m-kb+j ) = rwork( m-kb+j+ka )
+                 end do
+                 do j = j1, j2, ka1
+                    ! create nonzero element a(j+ka,j-1) outside the band
+                    ! and store it in work(m-kb+j)
+                    work( m-kb+j ) = work( m-kb+j )*ab( ka1, j-1 )
+                    ab( ka1, j-1 ) = rwork( m-kb+j )*ab( ka1, j-1 )
+                 end do
+                 if( update ) then
+                    if( i+k>ka1 .and. k<=kbt )work( m-kb+i+k-ka ) = work( m-kb+i+k )
+                 end if
+              end do loop_880
+              loop_920: do k = kb, 1, -1
+                 j2 = i + k + 1 - max( 1, k+i0-m )*ka1
+                 nr = ( j2+ka-1 ) / ka1
+                 j1 = j2 - ( nr-1 )*ka1
+                 if( nr>0 ) then
+                    ! generate rotations in 2nd set to annihilate elements
+                    ! which have been created outside the band
+                    call stdlib_zlargv( nr, ab( ka1, j1 ), inca, work( m-kb+j1 ),ka1, rwork( m-kb+&
+                              j1 ), ka1 )
+                    ! apply rotations in 2nd set from the right
+                    do l = 1, ka - 1
+                       call stdlib_zlartv( nr, ab( l+1, j1 ), inca, ab( l+2, j1-1 ),inca, rwork( &
+                                 m-kb+j1 ), work( m-kb+j1 ),ka1 )
+                    end do
+                    ! apply rotations in 2nd set from both sides to diagonal
+                    ! blocks
+                    call stdlib_zlar2v( nr, ab( 1, j1 ), ab( 1, j1-1 ),ab( 2, j1-1 ), inca, rwork(&
+                               m-kb+j1 ),work( m-kb+j1 ), ka1 )
+                    call stdlib_zlacgv( nr, work( m-kb+j1 ), ka1 )
+                 end if
+                 ! start applying rotations in 2nd set from the left
+                 do l = ka - 1, kb - k + 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( m-kb+j1t ), work( m-kb+j1t ),ka1 )
+                 end do
+                 if( wantx ) then
+                    ! post-multiply x by product of rotations in 2nd set
+                    do j = j1, j2, ka1
+                       call stdlib_zrot( nx, x( 1, j ), 1, x( 1, j-1 ), 1,rwork( m-kb+j ), conjg( &
+                                 work( m-kb+j ) ) )
+                    end do
+                 end if
+              end do loop_920
+              do k = 1, kb - 1
+                 j2 = i + k + 1 - max( 1, k+i0-m+1 )*ka1
+                 ! finish applying rotations in 1st set from the left
+                 do l = kb - k, 1, -1
+                    nrt = ( j2+l-1 ) / ka1
+                    j1t = j2 - ( nrt-1 )*ka1
+                    if( nrt>0 )call stdlib_zlartv( nrt, ab( ka1-l+1, j1t-ka1+l ), inca,ab( ka1-l, &
+                              j1t-ka1+l ), inca,rwork( j1t ), work( j1t ), ka1 )
+                 end do
+              end do
+              if( kb>1 ) then
+                 do j = 2, i2 - ka
+                    rwork( j ) = rwork( j+ka )
+                    work( j ) = work( j+ka )
+                 end do
+              end if
+           end if
+           go to 490
+     end subroutine stdlib_zhbgst
+
+     !> ZHBTRD: reduces a complex Hermitian band matrix A to real symmetric
+     !> tridiagonal form T by a unitary similarity transformation:
+     !> Q**H * A * Q = T.
+
+     pure subroutine stdlib_zhbtrd( vect, uplo, n, kd, ab, ldab, d, e, q, ldq,work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldq, n
+           ! Array Arguments 
+           real(dp), intent(out) :: d(*), e(*)
+           complex(dp), intent(inout) :: ab(ldab,*), q(ldq,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: initq, upper, wantq
+           integer(ilp) :: i, i2, ibl, inca, incx, iqaend, iqb, iqend, j, j1, j1end, j1inc, j2, &
+                     jend, jin, jinc, k, kd1, kdm1, kdn, l, last, lend, nq, nr, nrt
+           real(dp) :: abst
+           complex(dp) :: t, temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,conjg,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           initq = stdlib_lsame( vect, 'V' )
+           wantq = initq .or. stdlib_lsame( vect, 'U' )
+           upper = stdlib_lsame( uplo, 'U' )
+           kd1 = kd + 1
+           kdm1 = kd - 1
+           incx = ldab - 1
+           iqend = 1
+           info = 0
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'N' ) ) then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd1 ) then
+              info = -6
+           else if( ldq<max( 1, n ) .and. wantq ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBTRD', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! initialize q to the unit matrix, if needed
+           if( initq )call stdlib_zlaset( 'FULL', n, n, czero, cone, q, ldq )
+           ! wherever possible, plane rotations are generated and applied in
+           ! vector operations of length nr over the index set j1:j2:kd1.
+           ! the real cosines and complex sines of the plane rotations are
+           ! stored in the arrays d and work.
+           inca = kd1*ldab
+           kdn = min( n-1, kd )
+           if( upper ) then
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the upper triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( kd1, 1 ) = real( ab( kd1, 1 ),KIND=dp)
+                 loop_90: do i = 1, n - 2
+                    ! reduce i-th row of matrix to tridiagonal form
+                    loop_80: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_zlargv( nr, ab( 1, j1-1 ), inca, work( j1 ),kd1, d( j1 ), &
+                                    kd1 )
+                          ! apply rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_zlartv or stdlib_zrot is used
+                          if( nr>=2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_zlartv( nr, ab( l+1, j1-1 ), inca,ab( l, j1 ), inca, &
+                                          d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + ( nr-1 )*kd1
+                             do jinc = j1, jend, kd1
+                                call stdlib_zrot( kdm1, ab( 2, jinc-1 ), 1,ab( 1, jinc ), 1, d( &
+                                          jinc ),work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i,i+k-1)
+                             ! within the band
+                             call stdlib_zlartg( ab( kd-k+3, i+k-2 ),ab( kd-k+2, i+k-1 ), d( i+k-&
+                                       1 ),work( i+k-1 ), temp )
+                             ab( kd-k+3, i+k-2 ) = temp
+                             ! apply rotation from the right
+                             call stdlib_zrot( k-3, ab( kd-k+4, i+k-2 ), 1,ab( kd-k+3, i+k-1 ), 1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_zlar2v( nr, ab( kd1, j1-1 ), ab( kd1, j1 ),ab( kd, &
+                                 j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the left
+                       if( nr>0 ) then
+                          call stdlib_zlacgv( nr, work( j1 ), kd1 )
+                          if( 2*kd-1<nr ) then
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_zlartv or stdlib_zrot is used
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_zlartv( nrt, ab( kd-l, j1+l ), inca,ab( kd-&
+                                          l+1, j1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do jin = j1, j1end, kd1
+                                   call stdlib_zrot( kd-1, ab( kd-1, jin+1 ), incx,ab( kd, jin+1 )&
+                                             , incx,d( jin ), work( jin ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_zrot( lend, ab( kd-1, last+1 ), incx,ab( kd, &
+                                       last+1 ), incx, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_zrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          conjg( work( j ) ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_zrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), conjg( &
+                                          work( j ) ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j-1,j+kd) outside the band
+                          ! and store it in work
+                          work( j+kd ) = work( j )*ab( 1, j+kd )
+                          ab( 1, j+kd ) = d( j )*ab( 1, j+kd )
+                       end do
+                    end do loop_80
+                 end do loop_90
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( kd, i+1 )
+                    abst = abs( t )
+                    ab( kd, i+1 ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( kd, i+2 ) = ab( kd, i+2 )*t
+                    if( wantq ) then
+                       call stdlib_zscal( n, conjg( t ), q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( kd1, i ),KIND=dp)
+              end do
+           else
+              if( kd>1 ) then
+                 ! reduce to complex hermitian tridiagonal form, working with
+                 ! the lower triangle
+                 nr = 0
+                 j1 = kdn + 2
+                 j2 = 1
+                 ab( 1, 1 ) = real( ab( 1, 1 ),KIND=dp)
+                 loop_210: do i = 1, n - 2
+                    ! reduce i-th column of matrix to tridiagonal form
+                    loop_200: do k = kdn + 1, 2, -1
+                       j1 = j1 + kdn
+                       j2 = j2 + kdn
+                       if( nr>0 ) then
+                          ! generate plane rotations to annihilate nonzero
+                          ! elements which have been created outside the band
+                          call stdlib_zlargv( nr, ab( kd1, j1-kd1 ), inca,work( j1 ), kd1, d( j1 )&
+                                    , kd1 )
+                          ! apply plane rotations from one side
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_zlartv or stdlib_zrot is used
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                call stdlib_zlartv( nr, ab( kd1-l, j1-kd1+l ), inca,ab( kd1-l+1, &
+                                          j1-kd1+l ), inca,d( j1 ), work( j1 ), kd1 )
+                             end do
+                          else
+                             jend = j1 + kd1*( nr-1 )
+                             do jinc = j1, jend, kd1
+                                call stdlib_zrot( kdm1, ab( kd, jinc-kd ), incx,ab( kd1, jinc-kd )&
+                                          , incx,d( jinc ), work( jinc ) )
+                             end do
+                          end if
+                       end if
+                       if( k>2 ) then
+                          if( k<=n-i+1 ) then
+                             ! generate plane rotation to annihilate a(i+k-1,i)
+                             ! within the band
+                             call stdlib_zlartg( ab( k-1, i ), ab( k, i ),d( i+k-1 ), work( i+k-1 &
+                                       ), temp )
+                             ab( k-1, i ) = temp
+                             ! apply rotation from the left
+                             call stdlib_zrot( k-3, ab( k-2, i+1 ), ldab-1,ab( k-1, i+1 ), ldab-1,&
+                                        d( i+k-1 ),work( i+k-1 ) )
+                          end if
+                          nr = nr + 1
+                          j1 = j1 - kdn - 1
+                       end if
+                       ! apply plane rotations from both sides to diagonal
+                       ! blocks
+                       if( nr>0 )call stdlib_zlar2v( nr, ab( 1, j1-1 ), ab( 1, j1 ),ab( 2, j1-1 ),&
+                                  inca, d( j1 ),work( j1 ), kd1 )
+                       ! apply plane rotations from the right
+                          ! dependent on the the number of diagonals either
+                          ! stdlib_zlartv or stdlib_zrot is used
+                       if( nr>0 ) then
+                          call stdlib_zlacgv( nr, work( j1 ), kd1 )
+                          if( nr>2*kd-1 ) then
+                             do l = 1, kd - 1
+                                if( j2+l>n ) then
+                                   nrt = nr - 1
+                                else
+                                   nrt = nr
+                                end if
+                                if( nrt>0 )call stdlib_zlartv( nrt, ab( l+2, j1-1 ), inca,ab( l+1,&
+                                           j1 ), inca, d( j1 ),work( j1 ), kd1 )
+                             end do
+                          else
+                             j1end = j1 + kd1*( nr-2 )
+                             if( j1end>=j1 ) then
+                                do j1inc = j1, j1end, kd1
+                                   call stdlib_zrot( kdm1, ab( 3, j1inc-1 ), 1,ab( 2, j1inc ), 1, &
+                                             d( j1inc ),work( j1inc ) )
+                                end do
+                             end if
+                             lend = min( kdm1, n-j2 )
+                             last = j1end + kd1
+                             if( lend>0 )call stdlib_zrot( lend, ab( 3, last-1 ), 1,ab( 2, last ),&
+                                        1, d( last ),work( last ) )
+                          end if
+                       end if
+                       if( wantq ) then
+                          ! accumulate product of plane rotations in q
+                          if( initq ) then
+                       ! take advantage of the fact that q was
+                       ! initially the identity matrix
+                             iqend = max( iqend, j2 )
+                             i2 = max( 0, k-3 )
+                             iqaend = 1 + i*kd
+                             if( k==2 )iqaend = iqaend + kd
+                             iqaend = min( iqaend, iqend )
+                             do j = j1, j2, kd1
+                                ibl = i - i2 / kdm1
+                                i2 = i2 + 1
+                                iqb = max( 1, j-ibl )
+                                nq = 1 + iqaend - iqb
+                                iqaend = min( iqaend+kd, iqend )
+                                call stdlib_zrot( nq, q( iqb, j-1 ), 1, q( iqb, j ),1, d( j ), &
+                                          work( j ) )
+                             end do
+                          else
+                             do j = j1, j2, kd1
+                                call stdlib_zrot( n, q( 1, j-1 ), 1, q( 1, j ), 1,d( j ), work( j &
+                                          ) )
+                             end do
+                          end if
+                       end if
+                       if( j2+kdn>n ) then
+                          ! adjust j2 to keep within the bounds of the matrix
+                          nr = nr - 1
+                          j2 = j2 - kdn - 1
+                       end if
+                       do j = j1, j2, kd1
+                          ! create nonzero element a(j+kd,j-1) outside the
+                          ! band and store it in work
+                          work( j+kd ) = work( j )*ab( kd1, j )
+                          ab( kd1, j ) = d( j )*ab( kd1, j )
+                       end do
+                    end do loop_200
+                 end do loop_210
+              end if
+              if( kd>0 ) then
+                 ! make off-diagonal elements real and copy them to e
+                 do i = 1, n - 1
+                    t = ab( 2, i )
+                    abst = abs( t )
+                    ab( 2, i ) = abst
+                    e( i ) = abst
+                    if( abst/=zero ) then
+                       t = t / abst
+                    else
+                       t = cone
+                    end if
+                    if( i<n-1 )ab( 2, i+1 ) = ab( 2, i+1 )*t
+                    if( wantq ) then
+                       call stdlib_zscal( n, t, q( 1, i+1 ), 1 )
+                    end if
+                 end do
+              else
+                 ! set e to zero if original matrix was diagonal
+                 do i = 1, n - 1
+                    e( i ) = zero
+                 end do
+              end if
+              ! copy diagonal elements to d
+              do i = 1, n
+                 d( i ) = real( ab( 1, i ),KIND=dp)
+              end do
+           end if
+           return
+     end subroutine stdlib_zhbtrd
+
+     !> ZHECON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHETRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zhecon( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHECON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_zhetrs( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zhecon
+
+     !> ZHECON_ROOK: estimates the reciprocal of the condition number of a complex
+     !> Hermitian matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by CHETRF_ROOK.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zhecon_rook( uplo, n, a, lda, ipiv, anorm, rcond, work,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( anorm<zero ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHECON_ROOK', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              do i = 1, n
+                 if( ipiv( i )>0 .and. a( i, i )==zero )return
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_zhetrs_rook( uplo, n, 1, a, lda, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zhecon_rook
+
+     !> ZHEEV: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.
+
+     subroutine stdlib_zheev( jobz, uplo, n, a, lda, w, work, lwork, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indtau, indwrk, iscale, llwork, lwkopt, nb
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb+1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n-1 ) .and. .not.lquery )info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=dp)
+              work( 1 ) = 1
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_zlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_zhetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_zhetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_zungtr to generate the unitary matrix, then call stdlib_zsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zungtr( uplo, n, a, lda, work( indtau ), work( indwrk ),llwork, iinfo )
+                        
+              indwrk = inde + n
+              call stdlib_zsteqr( jobz, n, w, rwork( inde ), a, lda,rwork( indwrk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zheev
+
+     !> ZHEEVR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+     !> ZHEEVR first reduces the matrix A to tridiagonal form T with a call
+     !> to ZHETRD.  Then, whenever possible, ZHEEVR calls ZSTEMR to compute
+     !> eigenspectrum using Relatively Robust Representations.  ZSTEMR
+     !> computes eigenvalues by the dqds algorithm, while orthogonal
+     !> eigenvectors are computed from various "good" L D L^T representations
+     !> (also known as Relatively Robust Representations). Gram-Schmidt
+     !> orthogonalization is avoided as far as possible. More specifically,
+     !> the various steps of the algorithm are as follows.
+     !> For each unreduced block (submatrix) of T,
+     !> (a) Compute T - sigma I  = L D L^T, so that L and D
+     !> define all the wanted eigenvalues to high relative accuracy.
+     !> This means that small relative changes in the entries of D and L
+     !> cause only small relative changes in the eigenvalues and
+     !> eigenvectors. The standard (unfactored) representation of the
+     !> tridiagonal matrix T does not have this property in general.
+     !> (b) Compute the eigenvalues to suitable accuracy.
+     !> If the eigenvectors are desired, the algorithm attains full
+     !> accuracy of the computed eigenvalues only right before
+     !> the corresponding vectors have to be computed, see steps c) and d).
+     !> (c) For each cluster of close eigenvalues, select a new
+     !> shift close to the cluster, find a new factorization, and refine
+     !> the shifted eigenvalues to suitable accuracy.
+     !> (d) For each eigenvalue with a large enough relative separation compute
+     !> the corresponding eigenvector by forming a rank revealing twisted
+     !> factorization. Go back to (c) for any clusters that remain.
+     !> The desired accuracy of the output can be specified by the input
+     !> parameter ABSTOL.
+     !> For more details, see ZSTEMR's documentation and:
+     !> - Inderjit S. Dhillon and Beresford N. Parlett: "Multiple representations
+     !> to compute orthogonal eigenvectors of symmetric tridiagonal matrices,"
+     !> Linear Algebra and its Applications, 387(1), pp. 1-28, August 2004.
+     !> - Inderjit Dhillon and Beresford Parlett: "Orthogonal Eigenvectors and
+     !> Relative Gaps," SIAM Journal on Matrix Analysis and Applications, Vol. 25,
+     !> 2004.  Also LAPACK Working Note 154.
+     !> - Inderjit Dhillon: "A new O(n^2) algorithm for the symmetric
+     !> tridiagonal eigenvalue/eigenvector problem",
+     !> Computer Science Division Technical Report No. UCB/CSD-97-971,
+     !> UC Berkeley, May 1997.
+     !> Note 1 : ZHEEVR calls ZSTEMR when the full spectrum is requested
+     !> on machines which conform to the ieee-754 floating point standard.
+     !> ZHEEVR calls DSTEBZ and ZSTEIN on non-ieee machines and
+     !> when partial spectrum requests are made.
+     !> Normal execution of ZSTEMR may create NaNs and infinities and
+     !> hence may abort due to a floating point exception in environments
+     !> which do not handle NaNs and infinities in the ieee standard default
+     !> manner.
+
+     subroutine stdlib_zheevr( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork,rwork, lrwork, iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, liwork, lrwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz, tryrac
+           character :: order
+           integer(ilp) :: i, ieeeok, iinfo, imax, indibl, indifl, indisp, indiwo, indrd, indrdd, &
+           indre, indree, indrwk, indtau, indwk, indwkn, iscale, itmp1, j, jj, liwmin, llwork, &
+                     llrwork, llwrkn, lrwmin, lwkopt, lwmin, nb, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           ieeeok = stdlib_ilaenv( 10, 'ZHEEVR', 'N', 1, 2, 3, 4 )
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( ( lwork==-1 ) .or. ( lrwork==-1 ) .or.( liwork==-1 ) )
+           lrwmin = max( 1, 24*n )
+           liwmin = max( 1, 10*n )
+           lwmin = max( 1, 2*n )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              nb = max( nb, stdlib_ilaenv( 1, 'ZUNMTR', uplo, n, -1, -1, -1 ) )
+              lwkopt = max( ( nb+1 )*n, lwmin )
+              work( 1 ) = lwkopt
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -20
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -22
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( n==1 ) then
+              work( 1 ) = 2
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=dp)
+              else
+                 if( vl<real( a( 1, 1 ),KIND=dp) .and. vu>=real( a( 1, 1 ),KIND=dp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=dp)
+                 end if
+              end if
+              if( wantz ) then
+                 z( 1, 1 ) = one
+                 isuppz( 1 ) = 1
+                 isuppz( 2 ) = 1
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if (valeig) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_zlansy( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_zdscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_zdscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! initialize indices into workspaces.  note: the iwork indices are
+           ! used only if stdlib_dsterf or stdlib_zstemr fail.
+           ! work(indtau:indtau+n-1) stores the complex scalar factors of the
+           ! elementary reflectors used in stdlib_zhetrd.
+           indtau = 1
+           ! indwk is the starting offset of the remaining complex workspace,
+           ! and llwork is the remaining complex workspace size.
+           indwk = indtau + n
+           llwork = lwork - indwk + 1
+           ! rwork(indrd:indrd+n-1) stores the real tridiagonal's diagonal
+           ! entries.
+           indrd = 1
+           ! rwork(indre:indre+n-1) stores the off-diagonal entries of the
+           ! tridiagonal matrix from stdlib_zhetrd.
+           indre = indrd + n
+           ! rwork(indrdd:indrdd+n-1) is a copy of the diagonal entries over
+           ! -written by stdlib_zstemr (the stdlib_dsterf path copies the diagonal to w).
+           indrdd = indre + n
+           ! rwork(indree:indree+n-1) is a copy of the off-diagonal entries over
+           ! -written while computing the eigenvalues in stdlib_dsterf and stdlib_zstemr.
+           indree = indrdd + n
+           ! indrwk is the starting offset of the left-over real workspace, and
+           ! llrwork is the remaining workspace size.
+           indrwk = indree + n
+           llrwork = lrwork - indrwk + 1
+           ! iwork(indibl:indibl+m-1) corresponds to iblock in stdlib_dstebz and
+           ! stores the block indices of each of the m<=n eigenvalues.
+           indibl = 1
+           ! iwork(indisp:indisp+nsplit-1) corresponds to isplit in stdlib_dstebz and
+           ! stores the starting and finishing indices of each block.
+           indisp = indibl + n
+           ! iwork(indifl:indifl+n-1) stores the indices of eigenvectors
+           ! that corresponding to eigenvectors that fail to converge in
+           ! stdlib_dstein.  this information is discarded; if any fail, the driver
+           ! returns info > 0.
+           indifl = indisp + n
+           ! indiwo is the offset of the remaining integer workspace.
+           indiwo = indifl + n
+           ! call stdlib_zhetrd to reduce hermitian matrix to tridiagonal form.
+           call stdlib_zhetrd( uplo, n, a, lda, rwork( indrd ), rwork( indre ),work( indtau ), &
+                     work( indwk ), llwork, iinfo )
+           ! if all eigenvalues are desired
+           ! then call stdlib_dsterf or stdlib_zstemr and stdlib_zunmtr.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig.or.test ) .and. ( ieeeok==1 ) ) then
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n, rwork( indrd ), 1, w, 1 )
+                 call stdlib_dcopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_dsterf( n, w, rwork( indree ), info )
+              else
+                 call stdlib_dcopy( n-1, rwork( indre ), 1, rwork( indree ), 1 )
+                 call stdlib_dcopy( n, rwork( indrd ), 1, rwork( indrdd ), 1 )
+                 if (abstol <= two*n*eps) then
+                    tryrac = .true.
+                 else
+                    tryrac = .false.
+                 end if
+                 call stdlib_zstemr( jobz, 'A', n, rwork( indrdd ),rwork( indree ), vl, vu, il, &
+                 iu, m, w,z, ldz, n, isuppz, tryrac,rwork( indrwk ), llrwork,iwork, liwork, info )
+                           
+                 ! apply unitary matrix used in reduction to tridiagonal
+                 ! form to eigenvectors returned by stdlib_zstemr.
+                 if( wantz .and. info==0 ) then
+                    indwkn = indwk
+                    llwrkn = lwork - indwkn + 1
+                    call stdlib_zunmtr( 'L', uplo, 'N', n, m, a, lda,work( indtau ), z, ldz, work(&
+                               indwkn ),llwrkn, iinfo )
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_zstein.
+           ! also call stdlib_dstebz and stdlib_zstein if stdlib_zstemr fails.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indrd ), rwork( &
+           indre ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwo )&
+                     , info )
+           if( wantz ) then
+              call stdlib_zstein( n, rwork( indrd ), rwork( indre ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwo ), iwork( indifl ),info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_zstein.
+              indwkn = indwk
+              llwrkn = lwork - indwkn + 1
+              call stdlib_zunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwkn ), llwrkn, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal workspace size.
+           work( 1 ) = lwkopt
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zheevr
+
+     !> ZHEEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A.  Eigenvalues and eigenvectors can
+     !> be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_zheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, lwork, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, lda, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, lquery, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, llwork, lwkmin, lwkopt, nb, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           lower = stdlib_lsame( uplo, 'L' )
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -8
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -9
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -15
+              end if
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwkmin = 1
+                 work( 1 ) = lwkmin
+              else
+                 lwkmin = 2*n
+                 nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+                 nb = max( nb, stdlib_ilaenv( 1, 'ZUNMTR', uplo, n, -1, -1, -1 ) )
+                 lwkopt = max( 1, ( nb + 1 )*n )
+                 work( 1 ) = lwkopt
+              end if
+              if( lwork<lwkmin .and. .not.lquery )info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( a( 1, 1 ),KIND=dp)
+              else if( valeig ) then
+                 if( vl<real( a( 1, 1 ),KIND=dp) .and. vu>=real( a( 1, 1 ),KIND=dp) )then
+                    m = 1
+                    w( 1 ) = real( a( 1, 1 ),KIND=dp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           end if
+           anrm = stdlib_zlanhe( 'M', uplo, n, a, lda, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 do j = 1, n
+                    call stdlib_zdscal( n-j+1, sigma, a( j, j ), 1 )
+                 end do
+              else
+                 do j = 1, n
+                    call stdlib_zdscal( j, sigma, a( 1, j ), 1 )
+                 end do
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_zhetrd to reduce hermitian matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           llwork = lwork - indwrk + 1
+           call stdlib_zhetrd( uplo, n, a, lda, rwork( indd ), rwork( inde ),work( indtau ), work(&
+                      indwrk ), llwork, iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal to
+           ! zero, then call stdlib_dsterf or stdlib_zungtr and stdlib_zsteqr.  if this fails for
+           ! some eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_dcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_dsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_zlacpy( 'A', n, n, a, lda, z, ldz )
+                 call stdlib_zungtr( uplo, n, z, ldz, work( indtau ),work( indwrk ), llwork, &
+                           iinfo )
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_zsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 40
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_zstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_zstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_zstein.
+              call stdlib_zunmtr( 'L', uplo, 'N', n, m, a, lda, work( indtau ), z,ldz, work( &
+                        indwrk ), llwork, iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           40 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zheevx
+
+     !> ZHEGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian and B is also
+     !> positive definite.
+
+     subroutine stdlib_zhegv( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb, neig
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n - 1 ) .and. .not.lquery ) then
+                 info = -11
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_zpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_zheev( jobz, uplo, n, a, lda, w, work, lwork, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ztrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ztrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, neig, cone,b, ldb, a, lda &
+                           )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhegv
+
+     !> ZHEGVX: computes selected eigenvalues, and optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> Eigenvalues and eigenvectors can be selected by specifying either a
+     !> range of values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_zhegvx( itype, jobz, range, uplo, n, a, lda, b, ldb,vl, vu, il, iu, abstol,&
+                m, w, z, ldz, work,lwork, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, lda, ldb, ldz, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lquery, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lquery = ( lwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if (info==0) then
+              if (ldz<1 .or. (wantz .and. ldz<n)) then
+                 info = -18
+              end if
+           end if
+           if( info==0 ) then
+              nb = stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 )
+              lwkopt = max( 1, ( nb + 1 )*n )
+              work( 1 ) = lwkopt
+              if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 ) then
+              return
+           end if
+           ! form a cholesky factorization of b.
+           call stdlib_zpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_zheevx( jobz, range, uplo, n, a, lda, vl, vu, il, iu, abstol,m, w, z, ldz, &
+                     work, lwork, rwork, iwork, ifail,info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ztrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ztrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, m, cone, b,ldb, z, ldz )
+                           
+              end if
+           end if
+           ! set work(1) to optimal complex workspace size.
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhegvx
+
+     !> ZHERFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite, and
+     !> provides error bounds and backward error estimates for the solution.
+
+     pure subroutine stdlib_zherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb,x, ldx, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, j, k, kase, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldaf<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldx<max( 1, n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHERFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zhemv( uplo, n, -cone, a, lda, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=dp) )*xk + s
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( a( k, k ),KIND=dp) )*xk
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( a( i, k ) )*xk
+                       s = s + cabs1( a( i, k ) )*cabs1( x( i, j ) )
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zhetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_zhetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zhetrs( uplo, n, 1, af, ldaf, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zherfs
+
+     !> ZHESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.  The factored form of A is then
+     !> used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zhesv( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_zhetrf( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              if ( lwork<n ) then
+              ! solve with trs ( use level blas 2)
+                 call stdlib_zhetrs( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+              else
+              ! solve with trs2 ( use level blas 3)
+                 call stdlib_zhetrs2( uplo,n,nrhs,a,lda,ipiv,b,ldb,work,info )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhesv
+
+     !> ZHESV_RK: computes the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix
+     !> and X and B are N-by-NRHS matrices.
+     !> The bounded Bunch-Kaufman (rook) diagonal pivoting method is used
+     !> to factor A as
+     !> A = P*U*D*(U**H)*(P**T),  if UPLO = 'U', or
+     !> A = P*L*D*(L**H)*(P**T),  if UPLO = 'L',
+     !> where U (or L) is unit upper (or lower) triangular matrix,
+     !> U**H (or L**H) is the conjugate of U (or L), P is a permutation
+     !> matrix, P**T is the transpose of P, and D is Hermitian and block
+     !> diagonal with 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZHETRF_RK is called to compute the factorization of a complex
+     !> Hermitian matrix.  The factored form of A is then used to solve
+     !> the system of equations A * X = B by calling BLAS3 routine ZHETRS_3.
+
+     pure subroutine stdlib_zhesv_rk( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: e(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -11
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 call stdlib_zhetrf_rk( uplo, n, a, lda, e, ipiv, work, -1, info )
+                 lwkopt = real( work(1),KIND=dp)
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_RK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = p*u*d*(u**h)*(p**t) or
+           ! a = p*u*d*(u**h)*(p**t).
+           call stdlib_zhetrf_rk( uplo, n, a, lda, e, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b with blas3 solver, overwriting b with x.
+              call stdlib_zhetrs_3( uplo, n, nrhs, a, lda, e, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhesv_rk
+
+     !> ZHESV_ROOK: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> The bounded Bunch-Kaufman ("rook") diagonal pivoting method is used
+     !> to factor A as
+     !> A = U * D * U**T,  if UPLO = 'U', or
+     !> A = L * D * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and D is Hermitian and block diagonal with
+     !> 1-by-1 and 2-by-2 diagonal blocks.
+     !> ZHETRF_ROOK is called to compute the factorization of a complex
+     !> Hermition matrix A using the bounded Bunch-Kaufman ("rook") diagonal
+     !> pivoting method.
+     !> The factored form of A is then used to solve the system
+     !> of equations A * X = B by calling ZHETRS_ROOK (uses BLAS 2).
+
+     pure subroutine stdlib_zhesv_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, nb
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<1 .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              if( n==0 ) then
+                 lwkopt = 1
+              else
+                 nb = stdlib_ilaenv( 1, 'ZHETRF_ROOK', uplo, n, -1, -1, -1 )
+                 lwkopt = n*nb
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_ROOK ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_zhetrf_rook( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              ! solve with trs ( use level blas 2)
+              call stdlib_zhetrs_rook( uplo, n, nrhs, a, lda, ipiv, b, ldb, info )
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhesv_rook
+
+     !> ZHESVX: uses the diagonal pivoting factorization to compute the
+     !> solution to a complex system of linear equations A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zhesvx( fact, uplo, n, nrhs, a, lda, af, ldaf, ipiv, b,ldb, x, ldx, rcond, &
+               ferr, berr, work, lwork,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, lwork, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(inout) :: af(ldaf,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, nofact
+           integer(ilp) :: lwkopt, nb
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -11
+           else if( ldx<max( 1, n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info==0 ) then
+              lwkopt = max( 1, 2*n )
+              if( nofact ) then
+                 nb = stdlib_ilaenv( 1, 'ZHETRF', uplo, n, -1, -1, -1 )
+                 lwkopt = max( lwkopt, n*nb )
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_zlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_zhetrf( uplo, n, af, ldaf, ipiv, work, lwork, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanhe( 'I', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zhecon( uplo, n, af, ldaf, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zhetrs( uplo, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zherfs( uplo, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhesvx
+
+     !> ZHGEQZ: computes the eigenvalues of a complex matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the single-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a complex matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by ZGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices and S and P are upper triangular.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from ZGGHRD that reduced
+     !> the matrix pair (A,B) to generalized Hessenberg form, then the output
+     !> matrices Q1*Q and Z1*Z are the unitary factors from the generalized
+     !> Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T)
+     !> (equivalently, of (A,B)) are computed as a pair of complex values
+     !> (alpha,beta).  If beta is nonzero, lambda = alpha / beta is an
+     !> eigenvalue of the generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> The values of alpha and beta for the i-th eigenvalue can be read
+     !> directly from the generalized Schur form:  alpha = S(i,i),
+     !> beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+
+     subroutine stdlib_zhgeqz( job, compq, compz, n, ilo, ihi, h, ldh, t, ldt,alpha, beta, q, ldq,&
+                z, ldz, work, lwork,rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, compz, job
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldq, ldt, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: alpha(*), beta(*), work(*)
+           complex(dp), intent(inout) :: h(ldh,*), q(ldq,*), t(ldt,*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilazr2, ilazro, ilq, ilschr, ilz, lquery
+           integer(ilp) :: icompq, icompz, ifirst, ifrstm, iiter, ilast, ilastm, in, ischur, &
+                     istart, j, jc, jch, jiter, jr, maxit
+           real(dp) :: absb, anorm, ascale, atol, bnorm, bscale, btol, c, safmin, temp, temp2, &
+                     tempr, ulp
+           complex(dp) :: abi22, ad11, ad12, ad21, ad22, ctemp, ctemp2, ctemp3, eshift, s, shift, &
+                     signbc, u12, x, abi12, y
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max,min,sqrt
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=dp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode job, compq, compz
+           if( stdlib_lsame( job, 'E' ) ) then
+              ilschr = .false.
+              ischur = 1
+           else if( stdlib_lsame( job, 'S' ) ) then
+              ilschr = .true.
+              ischur = 2
+           else
+              ilschr = .true.
+              ischur = 0
+           end if
+           if( stdlib_lsame( compq, 'N' ) ) then
+              ilq = .false.
+              icompq = 1
+           else if( stdlib_lsame( compq, 'V' ) ) then
+              ilq = .true.
+              icompq = 2
+           else if( stdlib_lsame( compq, 'I' ) ) then
+              ilq = .true.
+              icompq = 3
+           else
+              ilq = .true.
+              icompq = 0
+           end if
+           if( stdlib_lsame( compz, 'N' ) ) then
+              ilz = .false.
+              icompz = 1
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              ilz = .true.
+              icompz = 2
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              ilz = .true.
+              icompz = 3
+           else
+              ilz = .true.
+              icompz = 0
+           end if
+           ! check argument values
+           info = 0
+           work( 1 ) = max( 1, n )
+           lquery = ( lwork==-1 )
+           if( ischur==0 ) then
+              info = -1
+           else if( icompq==0 ) then
+              info = -2
+           else if( icompz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( ldh<n ) then
+              info = -8
+           else if( ldt<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -14
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -16
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHGEQZ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           ! work( 1 ) = cmplx( 1,KIND=dp)
+           if( n<=0 ) then
+              work( 1 ) = cmplx( 1,KIND=dp)
+              return
+           end if
+           ! initialize q and z
+           if( icompq==3 )call stdlib_zlaset( 'FULL', n, n, czero, cone, q, ldq )
+           if( icompz==3 )call stdlib_zlaset( 'FULL', n, n, czero, cone, z, ldz )
+           ! machine constants
+           in = ihi + 1 - ilo
+           safmin = stdlib_dlamch( 'S' )
+           ulp = stdlib_dlamch( 'E' )*stdlib_dlamch( 'B' )
+           anorm = stdlib_zlanhs( 'F', in, h( ilo, ilo ), ldh, rwork )
+           bnorm = stdlib_zlanhs( 'F', in, t( ilo, ilo ), ldt, rwork )
+           atol = max( safmin, ulp*anorm )
+           btol = max( safmin, ulp*bnorm )
+           ascale = one / max( safmin, anorm )
+           bscale = one / max( safmin, bnorm )
+           ! set eigenvalues ihi+1:n
+           do j = ihi + 1, n
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_zscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_zscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_zscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_zscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! if ihi < ilo, skip qz steps
+           if( ihi<ilo )go to 190
+           ! main qz iteration loop
+           ! initialize dynamic indices
+           ! eigenvalues ilast+1:n have been found.
+              ! column operations modify rows ifrstm:whatever
+              ! row operations modify columns whatever:ilastm
+           ! if only eigenvalues are being computed, then
+              ! ifrstm is the row of the last splitting row above row ilast;
+              ! this is always at least ilo.
+           ! iiter counts iterations since the last eigenvalue was found,
+              ! to tell when to use an extraordinary shift.
+           ! maxit is the maximum number of qz sweeps allowed.
+           ilast = ihi
+           if( ilschr ) then
+              ifrstm = 1
+              ilastm = n
+           else
+              ifrstm = ilo
+              ilastm = ihi
+           end if
+           iiter = 0
+           eshift = czero
+           maxit = 30*( ihi-ilo+1 )
+           loop_170: do jiter = 1, maxit
+              ! check for too many iterations.
+              if( jiter>maxit )go to 180
+              ! split the matrix if possible.
+              ! two tests:
+                 ! 1: h(j,j-1)=0  or  j=ilo
+                 ! 2: t(j,j)=0
+              ! special case: j=ilast
+              if( ilast==ilo ) then
+                 go to 60
+              else
+                 if( abs1( h( ilast, ilast-1 ) )<=max( safmin, ulp*(abs1( h( ilast, ilast ) ) + &
+                           abs1( h( ilast-1, ilast-1 )) ) ) ) then
+                    h( ilast, ilast-1 ) = czero
+                    go to 60
+                 end if
+              end if
+              if( abs( t( ilast, ilast ) )<=max( safmin, ulp*(abs( t( ilast - 1, ilast ) ) + abs( &
+                        t( ilast-1, ilast-1 )) ) ) ) then
+                 t( ilast, ilast ) = czero
+                 go to 50
+              end if
+              ! general case: j<ilast
+              loop_40: do j = ilast - 1, ilo, -1
+                 ! test 1: for h(j,j-1)=0 or j=ilo
+                 if( j==ilo ) then
+                    ilazro = .true.
+                 else
+                    if( abs1( h( j, j-1 ) )<=max( safmin, ulp*(abs1( h( j, j ) ) + abs1( h( j-1, &
+                              j-1 ) )) ) ) then
+                       h( j, j-1 ) = czero
+                       ilazro = .true.
+                    else
+                       ilazro = .false.
+                    end if
+                 end if
+                 ! test 2: for t(j,j)=0
+                 temp = abs ( t( j, j + 1 ) )
+                 if ( j > ilo )temp = temp + abs ( t( j - 1, j ) )
+                 if( abs( t( j, j ) )<max( safmin,ulp*temp ) ) then
+                    t( j, j ) = czero
+                    ! test 1a: check for 2 consecutive small subdiagonals in a
+                    ilazr2 = .false.
+                    if( .not.ilazro ) then
+                       if( abs1( h( j, j-1 ) )*( ascale*abs1( h( j+1,j ) ) )<=abs1( h( j, j ) )*( &
+                                 ascale*atol ) )ilazr2 = .true.
+                    end if
+                    ! if both tests pass (1
+                    ! element of b in the block is zero, split a 1x1 block off
+                    ! at the top. (i.e., at the j-th row/column) the leading
+                    ! diagonal element of the remainder can also be zero, so
+                    ! this may have to be done repeatedly.
+                    if( ilazro .or. ilazr2 ) then
+                       do jch = j, ilast - 1
+                          ctemp = h( jch, jch )
+                          call stdlib_zlartg( ctemp, h( jch+1, jch ), c, s,h( jch, jch ) )
+                          h( jch+1, jch ) = czero
+                          call stdlib_zrot( ilastm-jch, h( jch, jch+1 ), ldh,h( jch+1, jch+1 ), &
+                                    ldh, c, s )
+                          call stdlib_zrot( ilastm-jch, t( jch, jch+1 ), ldt,t( jch+1, jch+1 ), &
+                                    ldt, c, s )
+                          if( ilq )call stdlib_zrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          if( ilazr2 )h( jch, jch-1 ) = h( jch, jch-1 )*c
+                          ilazr2 = .false.
+                          if( abs1( t( jch+1, jch+1 ) )>=btol ) then
+                             if( jch+1>=ilast ) then
+                                go to 60
+                             else
+                                ifirst = jch + 1
+                                go to 70
+                             end if
+                          end if
+                          t( jch+1, jch+1 ) = czero
+                       end do
+                       go to 50
+                    else
+                       ! only test 2 passed -- chase the zero to t(ilast,ilast)
+                       ! then process as in the case t(ilast,ilast)=0
+                       do jch = j, ilast - 1
+                          ctemp = t( jch, jch+1 )
+                          call stdlib_zlartg( ctemp, t( jch+1, jch+1 ), c, s,t( jch, jch+1 ) )
+                                    
+                          t( jch+1, jch+1 ) = czero
+                          if( jch<ilastm-1 )call stdlib_zrot( ilastm-jch-1, t( jch, jch+2 ), ldt,&
+                                    t( jch+1, jch+2 ), ldt, c, s )
+                          call stdlib_zrot( ilastm-jch+2, h( jch, jch-1 ), ldh,h( jch+1, jch-1 ), &
+                                    ldh, c, s )
+                          if( ilq )call stdlib_zrot( n, q( 1, jch ), 1, q( 1, jch+1 ), 1,c, conjg(&
+                                     s ) )
+                          ctemp = h( jch+1, jch )
+                          call stdlib_zlartg( ctemp, h( jch+1, jch-1 ), c, s,h( jch+1, jch ) )
+                                    
+                          h( jch+1, jch-1 ) = czero
+                          call stdlib_zrot( jch+1-ifrstm, h( ifrstm, jch ), 1,h( ifrstm, jch-1 ), &
+                                    1, c, s )
+                          call stdlib_zrot( jch-ifrstm, t( ifrstm, jch ), 1,t( ifrstm, jch-1 ), 1,&
+                                     c, s )
+                          if( ilz )call stdlib_zrot( n, z( 1, jch ), 1, z( 1, jch-1 ), 1,c, s )
+                                    
+                       end do
+                       go to 50
+                    end if
+                 else if( ilazro ) then
+                    ! only test 1 passed -- work on j:ilast
+                    ifirst = j
+                    go to 70
+                 end if
+                 ! neither test passed -- try next j
+              end do loop_40
+              ! (drop-through is "impossible")
+              info = 2*n + 1
+              go to 210
+              ! t(ilast,ilast)=0 -- clear h(ilast,ilast-1) to split off a
+              ! 1x1 block.
+              50 continue
+              ctemp = h( ilast, ilast )
+              call stdlib_zlartg( ctemp, h( ilast, ilast-1 ), c, s,h( ilast, ilast ) )
+              h( ilast, ilast-1 ) = czero
+              call stdlib_zrot( ilast-ifrstm, h( ifrstm, ilast ), 1,h( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              call stdlib_zrot( ilast-ifrstm, t( ifrstm, ilast ), 1,t( ifrstm, ilast-1 ), 1, c, s &
+                        )
+              if( ilz )call stdlib_zrot( n, z( 1, ilast ), 1, z( 1, ilast-1 ), 1, c, s )
+              ! h(ilast,ilast-1)=0 -- standardize b, set alpha and beta
+              60 continue
+              absb = abs( t( ilast, ilast ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( ilast, ilast ) / absb )
+                 t( ilast, ilast ) = absb
+                 if( ilschr ) then
+                    call stdlib_zscal( ilast-ifrstm, signbc, t( ifrstm, ilast ), 1 )
+                    call stdlib_zscal( ilast+1-ifrstm, signbc, h( ifrstm, ilast ),1 )
+                 else
+                    call stdlib_zscal( 1, signbc, h( ilast, ilast ), 1 )
+                 end if
+                 if( ilz )call stdlib_zscal( n, signbc, z( 1, ilast ), 1 )
+              else
+                 t( ilast, ilast ) = czero
+              end if
+              alpha( ilast ) = h( ilast, ilast )
+              beta( ilast ) = t( ilast, ilast )
+              ! go to next block -- exit if finished.
+              ilast = ilast - 1
+              if( ilast<ilo )go to 190
+              ! reset counters
+              iiter = 0
+              eshift = czero
+              if( .not.ilschr ) then
+                 ilastm = ilast
+                 if( ifrstm>ilast )ifrstm = ilo
+              end if
+              go to 160
+              ! qz step
+              ! this iteration only involves rows/columns ifirst:ilast.  we
+              ! assume ifirst < ilast, and that the diagonal of b is non-zero.
+              70 continue
+              iiter = iiter + 1
+              if( .not.ilschr ) then
+                 ifrstm = ifirst
+              end if
+              ! compute the shift.
+              ! at this point, ifirst < ilast, and the diagonal elements of
+              ! t(ifirst:ilast,ifirst,ilast) are larger than btol (in
+              ! magnitude)
+              if( ( iiter / 10 )*10/=iiter ) then
+                 ! the wilkinson shift (aep p.512_dp), i.e., the eigenvalue of
+                 ! the bottom-right 2x2 block of a inv(b) which is nearest to
+                 ! the bottom-right element.
+                 ! we factor b as u*d, where u has unit diagonals, and
+                 ! compute (a*inv(d))*inv(u).
+                 u12 = ( bscale*t( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad11 = ( ascale*h( ilast-1, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad21 = ( ascale*h( ilast, ilast-1 ) ) /( bscale*t( ilast-1, ilast-1 ) )
+                 ad12 = ( ascale*h( ilast-1, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 ad22 = ( ascale*h( ilast, ilast ) ) /( bscale*t( ilast, ilast ) )
+                 abi22 = ad22 - u12*ad21
+                 abi12 = ad12 - u12*ad11
+                 shift = abi22
+                 ctemp = sqrt( abi12 )*sqrt( ad21 )
+                 temp = abs1( ctemp )
+                 if( ctemp/=zero ) then
+                    x = half*( ad11-shift )
+                    temp2 = abs1( x )
+                    temp = max( temp, abs1( x ) )
+                    y = temp*sqrt( ( x / temp )**2+( ctemp / temp )**2 )
+                    if( temp2>zero ) then
+                       if( real( x / temp2,KIND=dp)*real( y,KIND=dp)+aimag( x / temp2 )*aimag( y )&
+                                 <zero )y = -y
+                    end if
+                    shift = shift - ctemp*stdlib_zladiv( ctemp, ( x+y ) )
+                 end if
+              else
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( iiter / 20 )*20==iiter .and.bscale*abs1(t( ilast, ilast ))>safmin ) &
+                           then
+                    eshift = eshift + ( ascale*h( ilast,ilast ) )/( bscale*t( ilast, ilast ) )
+                              
+                 else
+                    eshift = eshift + ( ascale*h( ilast,ilast-1 ) )/( bscale*t( ilast-1, ilast-1 )&
+                               )
+                 end if
+                 shift = eshift
+              end if
+              ! now check for two consecutive small subdiagonals.
+              do j = ilast - 1, ifirst + 1, -1
+                 istart = j
+                 ctemp = ascale*h( j, j ) - shift*( bscale*t( j, j ) )
+                 temp = abs1( ctemp )
+                 temp2 = ascale*abs1( h( j+1, j ) )
+                 tempr = max( temp, temp2 )
+                 if( tempr<one .and. tempr/=zero ) then
+                    temp = temp / tempr
+                    temp2 = temp2 / tempr
+                 end if
+                 if( abs1( h( j, j-1 ) )*temp2<=temp*atol )go to 90
+              end do
+              istart = ifirst
+              ctemp = ascale*h( ifirst, ifirst ) -shift*( bscale*t( ifirst, ifirst ) )
+              90 continue
+              ! do an implicit-shift qz sweep.
+              ! initial q
+              ctemp2 = ascale*h( istart+1, istart )
+              call stdlib_zlartg( ctemp, ctemp2, c, s, ctemp3 )
+              ! sweep
+              loop_150: do j = istart, ilast - 1
+                 if( j>istart ) then
+                    ctemp = h( j, j-1 )
+                    call stdlib_zlartg( ctemp, h( j+1, j-1 ), c, s, h( j, j-1 ) )
+                    h( j+1, j-1 ) = czero
+                 end if
+                 do jc = j, ilastm
+                    ctemp = c*h( j, jc ) + s*h( j+1, jc )
+                    h( j+1, jc ) = -conjg( s )*h( j, jc ) + c*h( j+1, jc )
+                    h( j, jc ) = ctemp
+                    ctemp2 = c*t( j, jc ) + s*t( j+1, jc )
+                    t( j+1, jc ) = -conjg( s )*t( j, jc ) + c*t( j+1, jc )
+                    t( j, jc ) = ctemp2
+                 end do
+                 if( ilq ) then
+                    do jr = 1, n
+                       ctemp = c*q( jr, j ) + conjg( s )*q( jr, j+1 )
+                       q( jr, j+1 ) = -s*q( jr, j ) + c*q( jr, j+1 )
+                       q( jr, j ) = ctemp
+                    end do
+                 end if
+                 ctemp = t( j+1, j+1 )
+                 call stdlib_zlartg( ctemp, t( j+1, j ), c, s, t( j+1, j+1 ) )
+                 t( j+1, j ) = czero
+                 do jr = ifrstm, min( j+2, ilast )
+                    ctemp = c*h( jr, j+1 ) + s*h( jr, j )
+                    h( jr, j ) = -conjg( s )*h( jr, j+1 ) + c*h( jr, j )
+                    h( jr, j+1 ) = ctemp
+                 end do
+                 do jr = ifrstm, j
+                    ctemp = c*t( jr, j+1 ) + s*t( jr, j )
+                    t( jr, j ) = -conjg( s )*t( jr, j+1 ) + c*t( jr, j )
+                    t( jr, j+1 ) = ctemp
+                 end do
+                 if( ilz ) then
+                    do jr = 1, n
+                       ctemp = c*z( jr, j+1 ) + s*z( jr, j )
+                       z( jr, j ) = -conjg( s )*z( jr, j+1 ) + c*z( jr, j )
+                       z( jr, j+1 ) = ctemp
+                    end do
+                 end if
+              end do loop_150
+              160 continue
+           end do loop_170
+           ! drop-through = non-convergence
+           180 continue
+           info = ilast
+           go to 210
+           ! successful completion of all qz steps
+           190 continue
+           ! set eigenvalues 1:ilo-1
+           do j = 1, ilo - 1
+              absb = abs( t( j, j ) )
+              if( absb>safmin ) then
+                 signbc = conjg( t( j, j ) / absb )
+                 t( j, j ) = absb
+                 if( ilschr ) then
+                    call stdlib_zscal( j-1, signbc, t( 1, j ), 1 )
+                    call stdlib_zscal( j, signbc, h( 1, j ), 1 )
+                 else
+                    call stdlib_zscal( 1, signbc, h( j, j ), 1 )
+                 end if
+                 if( ilz )call stdlib_zscal( n, signbc, z( 1, j ), 1 )
+              else
+                 t( j, j ) = czero
+              end if
+              alpha( j ) = h( j, j )
+              beta( j ) = t( j, j )
+           end do
+           ! normal termination
+           info = 0
+           ! exit (other than argument error) -- return optimal workspace size
+           210 continue
+           work( 1 ) = cmplx( n,KIND=dp)
+           return
+     end subroutine stdlib_zhgeqz
+
+     !> ZHPCON: estimates the reciprocal of the condition number of a complex
+     !> Hermitian packed matrix A using the factorization A = U*D*U**H or
+     !> A = L*D*L**H computed by ZHPTRF.
+     !> An estimate is obtained for norm(inv(A)), and the reciprocal of the
+     !> condition number is computed as RCOND = 1 / (ANORM * norm(inv(A))).
+
+     pure subroutine stdlib_zhpcon( uplo, n, ap, ipiv, anorm, rcond, work, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           real(dp), intent(in) :: anorm
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           complex(dp), intent(in) :: ap(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: i, ip, kase
+           real(dp) :: ainvnm
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( anorm<zero ) then
+              info = -5
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPCON', -info )
+              return
+           end if
+           ! quick return if possible
+           rcond = zero
+           if( n==0 ) then
+              rcond = one
+              return
+           else if( anorm<=zero ) then
+              return
+           end if
+           ! check that the diagonal matrix d is nonsingular.
+           if( upper ) then
+              ! upper triangular storage: examine d from bottom to top
+              ip = n*( n+1 ) / 2
+              do i = n, 1, -1
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip - i
+              end do
+           else
+              ! lower triangular storage: examine d from top to bottom.
+              ip = 1
+              do i = 1, n
+                 if( ipiv( i )>0 .and. ap( ip )==zero )return
+                 ip = ip + n - i + 1
+              end do
+           end if
+           ! estimate the 1-norm of the inverse.
+           kase = 0
+           30 continue
+           call stdlib_zlacn2( n, work( n+1 ), work, ainvnm, kase, isave )
+           if( kase/=0 ) then
+              ! multiply by inv(l*d*l**h) or inv(u*d*u**h).
+              call stdlib_zhptrs( uplo, n, 1, ap, ipiv, work, n, info )
+              go to 30
+           end if
+           ! compute the estimate of the reciprocal condition number.
+           if( ainvnm/=zero )rcond = ( one / ainvnm ) / anorm
+           return
+     end subroutine stdlib_zhpcon
+
+     !> ZHPEV: computes all the eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix in packed storage.
+
+     subroutine stdlib_zhpev( jobz, uplo, n, ap, w, z, ldz, work, rwork,info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=dp)
+              rwork( 1 ) = 1
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_zdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_zhptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           call stdlib_zhptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_zupgtr to generate the orthogonal matrix, then call stdlib_zsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              indwrk = indtau + n
+              call stdlib_zupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                        
+              indrwk = inde + n
+              call stdlib_zsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_zhpev
+
+     !> ZHPEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian matrix A in packed storage.
+     !> Eigenvalues/vectors can be selected by specifying either a range of
+     !> values or a range of indices for the desired eigenvalues.
+
+     subroutine stdlib_zhpevx( jobz, range, uplo, n, ap, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               work, rwork, iwork,ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indtau, indwrk, iscale, itmp1, j, jj, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -7
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -8
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -9
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -14
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              if( alleig .or. indeig ) then
+                 m = 1
+                 w( 1 ) = real( ap( 1 ),KIND=dp)
+              else
+                 if( vl<real( ap( 1 ),KIND=dp) .and. vu>=real( ap( 1 ),KIND=dp) ) then
+                    m = 1
+                    w( 1 ) = real( ap( 1 ),KIND=dp)
+                 end if
+              end if
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_zlanhp( 'M', uplo, n, ap, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_zdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_zhptrd to reduce hermitian packed matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indtau = 1
+           indwrk = indtau + n
+           call stdlib_zhptrd( uplo, n, ap, rwork( indd ), rwork( inde ),work( indtau ), iinfo )
+                     
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_zupgtr and stdlib_zsteqr.  if this fails
+           ! for some eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_dcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_dsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_zupgtr( uplo, n, ap, work( indtau ), z, ldz,work( indwrk ), iinfo )
+                           
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_zsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 20
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_zstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_zstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_zstein.
+              indwrk = indtau + n
+              call stdlib_zupmtr( 'L', uplo, 'N', n, m, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           20 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zhpevx
+
+     !> ZHPGV: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.
+     !> Here A and B are assumed to be Hermitian, stored in packed format,
+     !> and B is also positive definite.
+
+     subroutine stdlib_zhpgv( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*), bp(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: trans
+           integer(ilp) :: j, neig
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_zpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_zhpev( jobz, uplo, n, ap, w, z, ldz, work, rwork, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ztpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ztpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zhpgv
+
+     !> ZHPGVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.  Eigenvalues and eigenvectors can be selected by
+     !> specifying either a range of values or a range of indices for the
+     !> desired eigenvalues.
+
+     subroutine stdlib_zhpgvx( itype, jobz, range, uplo, n, ap, bp, vl, vu,il, iu, abstol, m, w, &
+               z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, itype, iu, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*), bp(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, upper, valeig, wantz
+           character :: trans
+           integer(ilp) :: j
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -3
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl ) then
+                    info = -9
+                 end if
+              else if( indeig ) then
+                 if( il<1 ) then
+                    info = -10
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -16
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_zpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_zhpevx( jobz, range, uplo, n, ap, vl, vu, il, iu, abstol, m,w, z, ldz, &
+                     work, rwork, iwork, ifail, info )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              if( info>0 )m = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, m
+                    call stdlib_ztpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, m
+                    call stdlib_ztpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           return
+     end subroutine stdlib_zhpgvx
+
+     !> ZHPRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian indefinite
+     !> and packed, and provides error bounds and backward error estimates
+     !> for the solution.
+
+     pure subroutine stdlib_zhprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx,ferr, berr, work,&
+                rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(in) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(in) :: afp(*), ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ik, j, k, kase, kk, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin, xk
+           complex(dp) :: zdum
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( ldx<max( 1, n ) ) then
+              info = -10
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = n + 1
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_140: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x
+              call stdlib_zcopy( n, b( 1, j ), 1, work, 1 )
+              call stdlib_zhpmv( uplo, n, -cone, ap, x( 1, j ), 1, cone, work, 1 )
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              do i = 1, n
+                 rwork( i ) = cabs1( b( i, j ) )
+              end do
+              ! compute abs(a)*abs(x) + abs(b).
+              kk = 1
+              if( upper ) then
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    ik = kk
+                    do i = 1, k - 1
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk+k-1 ),KIND=dp) )*xk + s
+                    kk = kk + k
+                 end do
+              else
+                 do k = 1, n
+                    s = zero
+                    xk = cabs1( x( k, j ) )
+                    rwork( k ) = rwork( k ) + abs( real( ap( kk ),KIND=dp) )*xk
+                    ik = kk + 1
+                    do i = k + 1, n
+                       rwork( i ) = rwork( i ) + cabs1( ap( ik ) )*xk
+                       s = s + cabs1( ap( ik ) )*cabs1( x( i, j ) )
+                       ik = ik + 1
+                    end do
+                    rwork( k ) = rwork( k ) + s
+                    kk = kk + ( n-k+1 )
+                 end do
+              end if
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zhptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 call stdlib_zaxpy( n, cone, work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              ! use stdlib_zlacn2 to estimate the infinity-norm of the matrix
+                 ! inv(a) * diag(w),
+              ! where w = abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) )))
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              kase = 0
+              100 continue
+              call stdlib_zlacn2( n, work( n+1 ), work, ferr( j ), kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! multiply by diag(w)*inv(a**h).
+                    call stdlib_zhptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                 else if( kase==2 ) then
+                    ! multiply by inv(a)*diag(w).
+                    do i = 1, n
+                       work( i ) = rwork( i )*work( i )
+                    end do
+                    call stdlib_zhptrs( uplo, n, 1, afp, ipiv, work, n, info )
+                 end if
+                 go to 100
+              end if
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, cabs1( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_140
+           return
+     end subroutine stdlib_zhprfs
+
+     !> ZHPSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix stored in packed format and X
+     !> and B are N-by-NRHS matrices.
+     !> The diagonal pivoting method is used to factor A as
+     !> A = U * D * U**H,  if UPLO = 'U', or
+     !> A = L * D * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, D is Hermitian and block diagonal with 1-by-1
+     !> and 2-by-2 diagonal blocks.  The factored form of A is then used to
+     !> solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zhpsv( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: ap(*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPSV ', -info )
+              return
+           end if
+           ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+           call stdlib_zhptrf( uplo, n, ap, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zhptrs( uplo, n, nrhs, ap, ipiv, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zhpsv
+
+     !> ZHPSVX: uses the diagonal pivoting factorization A = U*D*U**H or
+     !> A = L*D*L**H to compute the solution to a complex system of linear
+     !> equations A * X = B, where A is an N-by-N Hermitian matrix stored
+     !> in packed format and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zhpsvx( fact, uplo, n, nrhs, ap, afp, ipiv, b, ldb, x,ldx, rcond, ferr, &
+               berr, work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           complex(dp), intent(inout) :: afp(*)
+           complex(dp), intent(in) :: ap(*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the factorization a = u*d*u**h or a = l*d*l**h.
+              call stdlib_zcopy( n*( n+1 ) / 2, ap, 1, afp, 1 )
+              call stdlib_zhptrf( uplo, n, afp, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanhp( 'I', uplo, n, ap, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zhpcon( uplo, n, afp, ipiv, anorm, rcond, work, info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zhptrs( uplo, n, nrhs, afp, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zhprfs( uplo, n, nrhs, ap, afp, ipiv, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zhpsvx
+
+     !> ZHSEIN: uses inverse iteration to find specified right and/or left
+     !> eigenvectors of a complex upper Hessenberg matrix H.
+     !> The right eigenvector x and the left eigenvector y of the matrix H
+     !> corresponding to an eigenvalue w are defined by:
+     !> H * x = w * x,     y**h * H = w * y**h
+     !> where y**h denotes the conjugate transpose of the vector y.
+
+     subroutine stdlib_zhsein( side, eigsrc, initv, select, n, h, ldh, w, vl,ldvl, vr, ldvr, mm, &
+               m, work, rwork, ifaill,ifailr, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: eigsrc, initv, side
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldh, ldvl, ldvr, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: ifaill(*), ifailr(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(in) :: h(ldh,*)
+           complex(dp), intent(inout) :: vl(ldvl,*), vr(ldvr,*), w(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0e+0_dp
+           
+           
+           ! Local Scalars 
+           logical(lk) :: bothv, fromqr, leftv, noinit, rightv
+           integer(ilp) :: i, iinfo, k, kl, kln, kr, ks, ldwork
+           real(dp) :: eps3, hnorm, smlnum, ulp, unfl
+           complex(dp) :: cdum, wk
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           bothv = stdlib_lsame( side, 'B' )
+           rightv = stdlib_lsame( side, 'R' ) .or. bothv
+           leftv = stdlib_lsame( side, 'L' ) .or. bothv
+           fromqr = stdlib_lsame( eigsrc, 'Q' )
+           noinit = stdlib_lsame( initv, 'N' )
+           ! set m to the number of columns required to store the selected
+           ! eigenvectors.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           info = 0
+           if( .not.rightv .and. .not.leftv ) then
+              info = -1
+           else if( .not.fromqr .and. .not.stdlib_lsame( eigsrc, 'N' ) ) then
+              info = -2
+           else if( .not.noinit .and. .not.stdlib_lsame( initv, 'U' ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( leftv .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( rightv .and. ldvr<n ) ) then
+              info = -12
+           else if( mm<m ) then
+              info = -13
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHSEIN', -info )
+              return
+           end if
+           ! quick return if possible.
+           if( n==0 )return
+           ! set machine-dependent constants.
+           unfl = stdlib_dlamch( 'SAFE MINIMUM' )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = unfl*( n / ulp )
+           ldwork = n
+           kl = 1
+           kln = 0
+           if( fromqr ) then
+              kr = 0
+           else
+              kr = n
+           end if
+           ks = 1
+           loop_100: do k = 1, n
+              if( select( k ) ) then
+                 ! compute eigenvector(s) corresponding to w(k).
+                 if( fromqr ) then
+                    ! if affiliation of eigenvalues is known, check whether
+                    ! the matrix splits.
+                    ! determine kl and kr such that 1 <= kl <= k <= kr <= n
+                    ! and h(kl,kl-1) and h(kr+1,kr) are czero (or kl = 1 or
+                    ! kr = n).
+                    ! then inverse iteration can be performed with the
+                    ! submatrix h(kl:n,kl:n) for a left eigenvector, and with
+                    ! the submatrix h(1:kr,1:kr) for a right eigenvector.
+                    do i = k, kl + 1, -1
+                       if( h( i, i-1 )==czero )go to 30
+                    end do
+                    30 continue
+                    kl = i
+                    if( k>kr ) then
+                       do i = k, n - 1
+                          if( h( i+1, i )==czero )go to 50
+                       end do
+                       50 continue
+                       kr = i
+                    end if
+                 end if
+                 if( kl/=kln ) then
+                    kln = kl
+                    ! compute infinity-norm of submatrix h(kl:kr,kl:kr) if it
+                    ! has not ben computed before.
+                    hnorm = stdlib_zlanhs( 'I', kr-kl+1, h( kl, kl ), ldh, rwork )
+                    if( stdlib_disnan( hnorm ) ) then
+                       info = -6
+                       return
+                    else if( hnorm>rzero ) then
+                       eps3 = hnorm*ulp
+                    else
+                       eps3 = smlnum
+                    end if
+                 end if
+                 ! perturb eigenvalue if it is close to any previous
+                 ! selected eigenvalues affiliated to the submatrix
+                 ! h(kl:kr,kl:kr). close roots are modified by eps3.
+                 wk = w( k )
+                 60 continue
+                 do i = k - 1, kl, -1
+                    if( select( i ) .and. cabs1( w( i )-wk )<eps3 ) then
+                       wk = wk + eps3
+                       go to 60
+                    end if
+                 end do
+                 w( k ) = wk
+                 if( leftv ) then
+                    ! compute left eigenvector.
+                    call stdlib_zlaein( .false., noinit, n-kl+1, h( kl, kl ), ldh,wk, vl( kl, ks )&
+                              , work, ldwork, rwork, eps3,smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifaill( ks ) = k
+                    else
+                       ifaill( ks ) = 0
+                    end if
+                    do i = 1, kl - 1
+                       vl( i, ks ) = czero
+                    end do
+                 end if
+                 if( rightv ) then
+                    ! compute right eigenvector.
+                    call stdlib_zlaein( .true., noinit, kr, h, ldh, wk, vr( 1, ks ),work, ldwork, &
+                              rwork, eps3, smlnum, iinfo )
+                    if( iinfo>0 ) then
+                       info = info + 1
+                       ifailr( ks ) = k
+                    else
+                       ifailr( ks ) = 0
+                    end if
+                    do i = kr + 1, n
+                       vr( i, ks ) = czero
+                    end do
+                 end if
+                 ks = ks + 1
+              end if
+           end do loop_100
+           return
+     end subroutine stdlib_zhsein
+
+     !> Using the divide and conquer method, ZLAED0: computes all eigenvalues
+     !> of a symmetric tridiagonal matrix which is one diagonal block of
+     !> those from reducing a dense or band Hermitian matrix and
+     !> corresponding eigenvectors of the dense or band matrix.
+
+     pure subroutine stdlib_zlaed0( qsiz, n, d, e, q, ldq, qstore, ldqs, rwork,iwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldq, ldqs, n, qsiz
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: q(ldq,*)
+           complex(dp), intent(out) :: qstore(ldqs,*)
+        ! =====================================================================
+        ! warning:      n could be as big as qsiz!
+           
+           ! Local Scalars 
+           integer(ilp) :: curlvl, curprb, curr, i, igivcl, igivnm, igivpt, indxq, iperm, iprmpt, &
+           iq, iqptr, iwrem, j, k, lgn, ll, matsiz, msd2, smlsiz, smm1, spm1, spm2, submat, &
+                     subpbs, tlvls
+           real(dp) :: temp
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           ! if( icompq < 0 .or. icompq > 2 ) then
+              ! info = -1
+           ! else if( ( icompq == 1 ) .and. ( qsiz < max( 0, n ) ) )
+          ! $        then
+           if( qsiz<max( 0, n ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ldq<max( 1, n ) ) then
+              info = -6
+           else if( ldqs<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAED0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           smlsiz = stdlib_ilaenv( 9, 'ZLAED0', ' ', 0, 0, 0, 0 )
+           ! determine the size and placement of the submatrices, and save in
+           ! the leading elements of iwork.
+           iwork( 1 ) = n
+           subpbs = 1
+           tlvls = 0
+           10 continue
+           if( iwork( subpbs )>smlsiz ) then
+              do j = subpbs, 1, -1
+                 iwork( 2*j ) = ( iwork( j )+1 ) / 2
+                 iwork( 2*j-1 ) = iwork( j ) / 2
+              end do
+              tlvls = tlvls + 1
+              subpbs = 2*subpbs
+              go to 10
+           end if
+           do j = 2, subpbs
+              iwork( j ) = iwork( j ) + iwork( j-1 )
+           end do
+           ! divide the matrix into subpbs submatrices of size at most smlsiz+1
+           ! using rank-1 modifications (cuts).
+           spm1 = subpbs - 1
+           do i = 1, spm1
+              submat = iwork( i ) + 1
+              smm1 = submat - 1
+              d( smm1 ) = d( smm1 ) - abs( e( smm1 ) )
+              d( submat ) = d( submat ) - abs( e( smm1 ) )
+           end do
+           indxq = 4*n + 3
+           ! set up workspaces for eigenvalues only/accumulate new vectors
+           ! routine
+           temp = log( real( n,KIND=dp) ) / log( two )
+           lgn = int( temp,KIND=ilp)
+           if( 2**lgn<n )lgn = lgn + 1
+           if( 2**lgn<n )lgn = lgn + 1
+           iprmpt = indxq + n + 1
+           iperm = iprmpt + n*lgn
+           iqptr = iperm + n*lgn
+           igivpt = iqptr + n + 2
+           igivcl = igivpt + n*lgn
+           igivnm = 1
+           iq = igivnm + 2*n*lgn
+           iwrem = iq + n**2 + 1
+           ! initialize pointers
+           do i = 0, subpbs
+              iwork( iprmpt+i ) = 1
+              iwork( igivpt+i ) = 1
+           end do
+           iwork( iqptr ) = 1
+           ! solve each submatrix eigenproblem at the bottom of the divide and
+           ! conquer tree.
+           curr = 0
+           do i = 0, spm1
+              if( i==0 ) then
+                 submat = 1
+                 matsiz = iwork( 1 )
+              else
+                 submat = iwork( i ) + 1
+                 matsiz = iwork( i+1 ) - iwork( i )
+              end if
+              ll = iq - 1 + iwork( iqptr+curr )
+              call stdlib_dsteqr( 'I', matsiz, d( submat ), e( submat ),rwork( ll ), matsiz, &
+                        rwork, info )
+              call stdlib_zlacrm( qsiz, matsiz, q( 1, submat ), ldq, rwork( ll ),matsiz, qstore( &
+                        1, submat ), ldqs,rwork( iwrem ) )
+              iwork( iqptr+curr+1 ) = iwork( iqptr+curr ) + matsiz**2
+              curr = curr + 1
+              if( info>0 ) then
+                 info = submat*( n+1 ) + submat + matsiz - 1
+                 return
+              end if
+              k = 1
+              do j = submat, iwork( i+1 )
+                 iwork( indxq+j ) = k
+                 k = k + 1
+              end do
+           end do
+           ! successively merge eigensystems of adjacent submatrices
+           ! into eigensystem for the corresponding larger matrix.
+           ! while ( subpbs > 1 )
+           curlvl = 1
+           80 continue
+           if( subpbs>1 ) then
+              spm2 = subpbs - 2
+              do i = 0, spm2, 2
+                 if( i==0 ) then
+                    submat = 1
+                    matsiz = iwork( 2 )
+                    msd2 = iwork( 1 )
+                    curprb = 0
+                 else
+                    submat = iwork( i ) + 1
+                    matsiz = iwork( i+2 ) - iwork( i )
+                    msd2 = matsiz / 2
+                    curprb = curprb + 1
+                 end if
+           ! merge lower order eigensystems (of size msd2 and matsiz - msd2)
+           ! into an eigensystem of size matsiz.  stdlib_zlaed7 handles the case
+           ! when the eigenvectors of a full or band hermitian matrix (which
+           ! was reduced to tridiagonal form) are desired.
+           ! i am free to use q as a valuable working space until loop 150.
+                 call stdlib_zlaed7( matsiz, msd2, qsiz, tlvls, curlvl, curprb,d( submat ), &
+                 qstore( 1, submat ), ldqs,e( submat+msd2-1 ), iwork( indxq+submat ),rwork( iq ), &
+                 iwork( iqptr ), iwork( iprmpt ),iwork( iperm ), iwork( igivpt ),iwork( igivcl ), &
+                           rwork( igivnm ),q( 1, submat ), rwork( iwrem ),iwork( subpbs+1 ), info )
+                 if( info>0 ) then
+                    info = submat*( n+1 ) + submat + matsiz - 1
+                    return
+                 end if
+                 iwork( i / 2+1 ) = iwork( i+2 )
+              end do
+              subpbs = subpbs / 2
+              curlvl = curlvl + 1
+              go to 80
+           end if
+           ! end while
+           ! re-merge the eigenvalues/vectors which were deflated at the final
+           ! merge step.
+           do i = 1, n
+              j = iwork( indxq+i )
+              rwork( i ) = d( j )
+              call stdlib_zcopy( qsiz, qstore( 1, j ), 1, q( 1, i ), 1 )
+           end do
+           call stdlib_dcopy( n, rwork, 1, d, 1 )
+           return
+     end subroutine stdlib_zlaed0
+
+     !> ZLAMSWLQ: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product of blocked
+     !> elementary reflectors computed by short wide LQ
+     !> factorization (ZLASWLQ)
+
+     pure subroutine stdlib_zlamswlq( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * mb
+           else
+             lw = m * mb
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( k<0 ) then
+             info = -5
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<mb .or. mb<1) then
+             info = -6
+           else if( lda<max( 1, k ) ) then
+             info = -9
+           else if( ldt<max( 1, mb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLAMSWLQ', -info )
+             work(1) = lw
+             return
+           else if (lquery) then
+             work(1) = lw
+             return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((nb<=k).or.(nb>=max(m,n,k))) then
+             call stdlib_zgemlqt( side, trans, m, n, k, mb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+           end if
+           if(left.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((m-k),(nb-k))
+               ctr = (m-k)/(nb-k)
+               if (kk>0) then
+                 ii=m-kk+1
+                 call stdlib_ztpmlqt('L','C',kk , n, k, 0, mb, a(1,ii), lda,t(1,ctr*k+1), ldt, c(&
+                           1,1), ldc,c(ii,1), ldc, work, info )
+               else
+                 ii=m+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+nb)
+                 ctr = ctr - 1
+                 call stdlib_ztpmlqt('L','C',nb-k , n, k, 0,mb, a(1,i), lda,t(1,ctr*k+1),ldt, c(1,&
+                           1), ldc,c(i,1), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:nb)
+               call stdlib_zgemlqt('L','C',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (left.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(nb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_zgemlqt('L','N',nb , n, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (i:i+nb,1:n)
+               call stdlib_ztpmlqt('L','N',nb-k , n, k, 0,mb, a(1,i), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_ztpmlqt('L','N',kk , n, k, 0, mb, a(1,ii), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.notran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(nb-k))
+               ctr = (n-k)/(nb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_ztpmlqt('R','N',m , kk, k, 0, mb, a(1, ii), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(nb-k),nb+1,-(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               ctr = ctr - 1
+               call stdlib_ztpmlqt('R','N', m, nb-k, k, 0, mb, a(1, i), lda,t(1, ctr * k + 1), &
+                         ldt, c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_zgemlqt('R','N',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.tran) then
+             ! multiply q to the first block of c
+              kk = mod((n-k),(nb-k))
+              ii=n-kk+1
+              call stdlib_zgemlqt('R','C',m , nb, k, mb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              ctr = 1
+              do i=nb+1,ii-nb+k,(nb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_ztpmlqt('R','C',m , nb-k, k, 0,mb, a(1,i), lda,t(1,ctr *k+1), ldt, c(1,&
+                         1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+             ! multiply q to the last block of c
+               call stdlib_ztpmlqt('R','C',m , kk, k, 0,mb, a(1,ii), lda,t(1, ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_zlamswlq
+
+     !> ZLAMTSQR: overwrites the general complex M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (ZLATSQR)
+
+     pure subroutine stdlib_zlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t,ldt, c, ldc, work, &
+               lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, mb, nb, ldt, lwork, ldc
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), t(ldt,*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: c(ldc,*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: i, ii, kk, lw, ctr, q
+           ! External Subroutines 
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork<0
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           if (left) then
+             lw = n * nb
+             q = m
+           else
+             lw = m * nb
+             q = n
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+              info = -1
+           else if( .not.tran .and. .not.notran ) then
+              info = -2
+           else if( m<k ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 ) then
+             info = -5
+           else if( k<nb .or. nb<1 ) then
+             info = -7
+           else if( lda<max( 1, q ) ) then
+             info = -9
+           else if( ldt<max( 1, nb) ) then
+             info = -11
+           else if( ldc<max( 1, m ) ) then
+              info = -13
+           else if(( lwork<max(1,lw)).and.(.not.lquery)) then
+             info = -15
+           end if
+           ! determine the block size if it is tall skinny or short and wide
+           if( info==0)  then
+               work(1) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLAMTSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n,k)==0 ) then
+             return
+           end if
+           if((mb<=k).or.(mb>=max(m,n,k))) then
+             call stdlib_zgemqrt( side, trans, m, n, k, nb, a, lda,t, ldt, c, ldc, work, info)
+                       
+             return
+            end if
+           if(left.and.notran) then
+               ! multiply q to the last block of c
+              kk = mod((m-k),(mb-k))
+              ctr = (m-k)/(mb-k)
+              if (kk>0) then
+                ii=m-kk+1
+                call stdlib_ztpmqrt('L','N',kk , n, k, 0, nb, a(ii,1), lda,t(1, ctr * k + 1),ldt ,&
+                           c(1,1), ldc,c(ii,1), ldc, work, info )
+              else
+                ii=m+1
+              end if
+              do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+                ctr = ctr - 1
+                call stdlib_ztpmqrt('L','N',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, &
+                          c(1,1), ldc,c(i,1), ldc, work, info )
+              end do
+               ! multiply q to the first block of c (1:mb,1:n)
+              call stdlib_zgemqrt('L','N',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+           else if (left.and.tran) then
+               ! multiply q to the first block of c
+              kk = mod((m-k),(mb-k))
+              ii=m-kk+1
+              ctr = 1
+              call stdlib_zgemqrt('L','C',mb , n, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (i:i+mb,1:n)
+               call stdlib_ztpmqrt('L','C',mb-k , n, k, 0,nb, a(i,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(i,1), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=m) then
+               ! multiply q to the last block of c
+               call stdlib_ztpmqrt('L','C',kk , n, k, 0,nb, a(ii,1), lda,t(1, ctr * k + 1), ldt, &
+                         c(1,1), ldc,c(ii,1), ldc, work, info )
+              end if
+           else if(right.and.tran) then
+               ! multiply q to the last block of c
+               kk = mod((n-k),(mb-k))
+               ctr = (n-k)/(mb-k)
+               if (kk>0) then
+                 ii=n-kk+1
+                 call stdlib_ztpmqrt('R','C',m , kk, k, 0, nb, a(ii,1), lda,t(1,ctr * k + 1), ldt,&
+                            c(1,1), ldc,c(1,ii), ldc, work, info )
+               else
+                 ii=n+1
+               end if
+               do i=ii-(mb-k),mb+1,-(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+                 ctr = ctr - 1
+                 call stdlib_ztpmqrt('R','C',m , mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1), &
+                           ldt, c(1,1), ldc,c(1,i), ldc, work, info )
+               end do
+               ! multiply q to the first block of c (1:m,1:mb)
+               call stdlib_zgemqrt('R','C',m , mb, k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                         info )
+           else if (right.and.notran) then
+               ! multiply q to the first block of c
+              kk = mod((n-k),(mb-k))
+              ii=n-kk+1
+              ctr = 1
+              call stdlib_zgemqrt('R','N', m, mb , k, nb, a(1,1), lda, t,ldt ,c(1,1), ldc, work, &
+                        info )
+              do i=mb+1,ii-mb+k,(mb-k)
+               ! multiply q to the current block of c (1:m,i:i+mb)
+               call stdlib_ztpmqrt('R','N', m, mb-k, k, 0,nb, a(i,1), lda,t(1, ctr * k + 1),ldt, &
+                         c(1,1), ldc,c(1,i), ldc, work, info )
+               ctr = ctr + 1
+              end do
+              if(ii<=n) then
+               ! multiply q to the last block of c
+               call stdlib_ztpmqrt('R','N', m, kk , k, 0,nb, a(ii,1), lda,t(1,ctr * k + 1),ldt, c(&
+                         1,1), ldc,c(1,ii), ldc, work, info )
+              end if
+           end if
+           work(1) = lw
+           return
+     end subroutine stdlib_zlamtsqr
+
+     !> ZLAQR2: is identical to ZLAQR3 except that it avoids
+     !> recursion by calling ZLAHQR instead of ZLAQR4.
+     !> Aggressive early deflation:
+     !> ZLAQR2 accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_zlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0_dp
+           real(dp), parameter :: rone = 1.0_dp
+           
+           
+           ! Local Scalars 
+           complex(dp) :: beta, cdum, s, tau
+           real(dp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwkopt
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_zgehrd ====
+              call stdlib_zgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_zunmhr ====
+              call stdlib_zunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = jw + max( lwk1, lwk2 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_zlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_zcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_zlaset( 'A', jw, jw, czero, cone, v, ldv )
+           call stdlib_zlahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                     infqr )
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_ztrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_ztrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_ztrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_zcopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_zlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_zlaset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_zlarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_zlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_zlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_zgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_zlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_zcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_zunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_zgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_zlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_zgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_zlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_zgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_zlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+     end subroutine stdlib_zlaqr2
+
+     !> ZLASWLQ: computes a blocked Tall-Skinny LQ factorization of
+     !> a complexx M-by-N matrix A for M <= N:
+     !> A = ( L 0 ) *  Q,
+     !> where:
+     !> Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+     !> form in the elements above the diagonal of the array A and in
+     !> the elements of the array T;
+     !> L is a lower-triangular M-by-M matrix stored on exit in
+     !> the elements on and below the diagonal of the array A.
+     !> 0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+
+     pure subroutine stdlib_zlaswlq( m, n, mb, nb, a, lda, t, ldt, work, lwork,info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, lwork, ldt
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. n<m ) then
+             info = -2
+           else if( mb<1 .or. ( mb>m .and. m>0 )) then
+             info = -3
+           else if( nb<=0 ) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<mb ) then
+             info = -8
+           else if( ( lwork<m*mb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+           work(1) = mb*m
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLASWLQ', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the lq decomposition
+            if((m>=n).or.(nb<=m).or.(nb>=n)) then
+             call stdlib_zgelqt( m, n, mb, a, lda, t, ldt, work, info)
+             return
+            end if
+            kk = mod((n-m),(nb-m))
+            ii=n-kk+1
+            ! compute the lq factorization of the first block a(1:m,1:nb)
+            call stdlib_zgelqt( m, nb, mb, a(1,1), lda, t, ldt, work, info)
+            ctr = 1
+            do i = nb+1, ii-nb+m , (nb-m)
+            ! compute the qr factorization of the current block a(1:m,i:i+nb-m)
+              call stdlib_ztplqt( m, nb-m, 0, mb, a(1,1), lda, a( 1, i ),lda, t(1, ctr * m + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+           ! compute the qr factorization of the last block a(1:m,ii:n)
+            if (ii<=n) then
+             call stdlib_ztplqt( m, kk, 0, mb, a(1,1), lda, a( 1, ii ),lda, t(1, ctr * m + 1), &
+                       ldt,work, info )
+            end if
+           work( 1 ) = m * mb
+           return
+     end subroutine stdlib_zlaswlq
+
+     !> ZLATSQR: computes a blocked Tall-Skinny QR factorization of
+     !> a complex M-by-N matrix A for M >= N:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+     !> form in the elements below the diagonal of the array A and in
+     !> the elements of the array T;
+     !> R is an upper-triangular N-by-N matrix, stored on exit in
+     !> the elements on and above the diagonal of the array A.
+     !> 0 is a (M-N)-by-N zero matrix, and is not stored.
+
+     pure subroutine stdlib_zlatsqr( m, n, mb, nb, a, lda, t, ldt, work,lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, mb, nb, ldt, lwork
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*), t(ldt,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, ii, kk, ctr
+           ! External Subroutines 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+             info = -1
+           else if( n<0 .or. m<n ) then
+             info = -2
+           else if( mb<1 ) then
+             info = -3
+           else if( nb<1 .or. ( nb>n .and. n>0 )) then
+             info = -4
+           else if( lda<max( 1, m ) ) then
+             info = -6
+           else if( ldt<nb ) then
+             info = -8
+           else if( lwork<(n*nb) .and. (.not.lquery) ) then
+             info = -10
+           end if
+           if( info==0)  then
+             work(1) = nb*n
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZLATSQR', -info )
+             return
+           else if (lquery) then
+            return
+           end if
+           ! quick return if possible
+           if( min(m,n)==0 ) then
+               return
+           end if
+           ! the qr decomposition
+            if ((mb<=n).or.(mb>=m)) then
+              call stdlib_zgeqrt( m, n, nb, a, lda, t, ldt, work, info)
+              return
+            end if
+            kk = mod((m-n),(mb-n))
+            ii=m-kk+1
+            ! compute the qr factorization of the first block a(1:mb,1:n)
+            call stdlib_zgeqrt( mb, n, nb, a(1,1), lda, t, ldt, work, info )
+            ctr = 1
+            do i = mb+1, ii-mb+n ,  (mb-n)
+            ! compute the qr factorization of the current block a(i:i+mb-n,1:n)
+              call stdlib_ztpqrt( mb-n, n, 0, nb, a(1,1), lda, a( i, 1 ), lda,t(1, ctr * n + 1),&
+                        ldt, work, info )
+              ctr = ctr + 1
+            end do
+            ! compute the qr factorization of the last block a(ii:m,1:n)
+            if (ii<=m) then
+              call stdlib_ztpqrt( kk, n, 0, nb, a(1,1), lda, a( ii, 1 ), lda,t(1,ctr * n + 1), &
+                        ldt,work, info )
+            end if
+           work( 1 ) = n*nb
+           return
+     end subroutine stdlib_zlatsqr
+
+     !> ZPBSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular band matrix, and L is a lower
+     !> triangular band matrix, with the same number of superdiagonals or
+     !> subdiagonals as A.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_zpbsv( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: ab(ldab,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( kd<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_zpbtrf( uplo, n, kd, ab, ldab, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zpbtrs( uplo, n, kd, nrhs, ab, ldab, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zpbsv
+
+     !> ZPBSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite band matrix and X
+     !> and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zpbsvx( fact, uplo, n, kd, nrhs, ab, ldab, afb, ldafb,equed, s, b, ldb, x, &
+               ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldafb, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(inout) :: s(*)
+           complex(dp), intent(inout) :: ab(ldab,*), afb(ldafb,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ, upper
+           integer(ilp) :: i, infequ, j, j1, j2
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           upper = stdlib_lsame( uplo, 'U' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( nrhs<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( ldafb<kd+1 ) then
+              info = -9
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -10
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -13
+                 else if( ldx<max( 1, n ) ) then
+                    info = -15
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPBSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_zpbequ( uplo, n, kd, ab, ldab, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_zlaqhb( uplo, n, kd, ab, ldab, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right-hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              if( upper ) then
+                 do j = 1, n
+                    j1 = max( j-kd, 1 )
+                    call stdlib_zcopy( j-j1+1, ab( kd+1-j+j1, j ), 1,afb( kd+1-j+j1, j ), 1 )
+                              
+                 end do
+              else
+                 do j = 1, n
+                    j2 = min( j+kd, n )
+                    call stdlib_zcopy( j2-j+1, ab( 1, j ), 1, afb( 1, j ), 1 )
+                 end do
+              end if
+              call stdlib_zpbtrf( uplo, n, kd, afb, ldafb, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanhb( '1', uplo, n, kd, ab, ldab, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zpbcon( uplo, n, kd, afb, ldafb, anorm, rcond, work, rwork,info )
+           ! compute the solution matrix x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zpbtrs( uplo, n, kd, nrhs, afb, ldafb, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_zpbrfs( uplo, n, kd, nrhs, ab, ldab, afb, ldafb, b, ldb, x,ldx, ferr, berr,&
+                      work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zpbsvx
+
+     !> ZPFTRF: computes the Cholesky factorization of a complex Hermitian
+     !> positive definite matrix A.
+     !> The factorization has the form
+     !> A = U**H * U,  if UPLO = 'U', or
+     !> A = L  * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and L is lower triangular.
+     !> This is the block version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zpftrf( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(in) :: n
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRF', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution: there are eight cases
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                   ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                   ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                   ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_zpotrf( 'L', n1, a( 0 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'R', 'L', 'C', 'N', n2, n1, cone, a( 0 ), n,a( n1 ), n )
+                              
+                    call stdlib_zherk( 'U', 'N', n2, n1, -one, a( n1 ), n, one,a( n ), n )
+                    call stdlib_zpotrf( 'U', n2, a( n ), n, info )
+                    if( info>0 )info = info + n1
+                 else
+                   ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                   ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                   ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_zpotrf( 'L', n1, a( n2 ), n, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'L', 'L', 'N', 'N', n1, n2, cone, a( n2 ), n,a( 0 ), n )
+                              
+                    call stdlib_zherk( 'U', 'C', n2, n1, -one, a( 0 ), n, one,a( n1 ), n )
+                    call stdlib_zpotrf( 'U', n2, a( n1 ), n, info )
+                    if( info>0 )info = info + n1
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is odd
+                    ! t1 -> a(0,0) , t2 -> a(1,0) , s -> a(0,n1)
+                    ! t1 -> a(0+0) , t2 -> a(1+0) , s -> a(0+n1*n1); lda=n1
+                    call stdlib_zpotrf( 'U', n1, a( 0 ), n1, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'L', 'U', 'C', 'N', n1, n2, cone, a( 0 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_zherk( 'L', 'C', n2, n1, -one, a( n1*n1 ), n1, one,a( 1 ), n1 )
+                              
+                    call stdlib_zpotrf( 'L', n2, a( 1 ), n1, info )
+                    if( info>0 )info = info + n1
+                 else
+                    ! srpa for upper, transpose and n is odd
+                    ! t1 -> a(0,n1+1), t2 -> a(0,n1), s -> a(0,0)
+                    ! t1 -> a(n2*n2), t2 -> a(n1*n2), s -> a(0); lda = n2
+                    call stdlib_zpotrf( 'U', n1, a( n2*n2 ), n2, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'R', 'U', 'N', 'N', n2, n1, cone, a( n2*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_zherk( 'L', 'N', n2, n1, -one, a( 0 ), n2, one,a( n1*n2 ), n2 )
+                              
+                    call stdlib_zpotrf( 'L', n2, a( n1*n2 ), n2, info )
+                    if( info>0 )info = info + n1
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_zpotrf( 'L', k, a( 1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'R', 'L', 'C', 'N', k, k, cone, a( 1 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_zherk( 'U', 'N', k, k, -one, a( k+1 ), n+1, one,a( 0 ), n+1 )
+                              
+                    call stdlib_zpotrf( 'U', k, a( 0 ), n+1, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_zpotrf( 'L', k, a( k+1 ), n+1, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'L', 'L', 'N', 'N', k, k, cone, a( k+1 ),n+1, a( 0 ), n+1 )
+                              
+                    call stdlib_zherk( 'U', 'C', k, k, -one, a( 0 ), n+1, one,a( k ), n+1 )
+                              
+                    call stdlib_zpotrf( 'U', k, a( k ), n+1, info )
+                    if( info>0 )info = info + k
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1)
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_zpotrf( 'U', k, a( 0+k ), k, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'L', 'U', 'C', 'N', k, k, cone, a( k ), n1,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_zherk( 'L', 'C', k, k, -one, a( k*( k+1 ) ), k, one,a( 0 ), k )
+                              
+                    call stdlib_zpotrf( 'L', k, a( 0 ), k, info )
+                    if( info>0 )info = info + k
+                 else
+                    ! srpa for upper, transpose and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0)
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_zpotrf( 'U', k, a( k*( k+1 ) ), k, info )
+                    if( info>0 )return
+                    call stdlib_ztrsm( 'R', 'U', 'N', 'N', k, k, cone,a( k*( k+1 ) ), k, a( 0 ), &
+                              k )
+                    call stdlib_zherk( 'L', 'N', k, k, -one, a( 0 ), k, one,a( k*k ), k )
+                    call stdlib_zpotrf( 'L', k, a( k*k ), k, info )
+                    if( info>0 )info = info + k
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zpftrf
+
+     !> ZPFTRI: computes the inverse of a complex Hermitian positive definite
+     !> matrix A using the Cholesky factorization A = U**H*U or A = L*L**H
+     !> computed by ZPFTRF.
+
+     pure subroutine stdlib_zpftri( transr, uplo, n, a, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: transr, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(0:*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, nisodd, normaltransr
+           integer(ilp) :: n1, n2, k
+           ! Intrinsic Functions 
+           intrinsic :: mod
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           normaltransr = stdlib_lsame( transr, 'N' )
+           lower = stdlib_lsame( uplo, 'L' )
+           if( .not.normaltransr .and. .not.stdlib_lsame( transr, 'C' ) ) then
+              info = -1
+           else if( .not.lower .and. .not.stdlib_lsame( uplo, 'U' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPFTRI', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! invert the triangular cholesky factor u or l.
+           call stdlib_ztftri( transr, uplo, 'N', n, a, info )
+           if( info>0 )return
+           ! if n is odd, set nisodd = .true.
+           ! if n is even, set k = n/2 and nisodd = .false.
+           if( mod( n, 2 )==0 ) then
+              k = n / 2
+              nisodd = .false.
+           else
+              nisodd = .true.
+           end if
+           ! set n1 and n2 depending on lower
+           if( lower ) then
+              n2 = n / 2
+              n1 = n - n2
+           else
+              n1 = n / 2
+              n2 = n - n1
+           end if
+           ! start execution of triangular matrix multiply: inv(u)*inv(u)^c or
+           ! inv(l)^c*inv(l). there are eight cases.
+           if( nisodd ) then
+              ! n is odd
+              if( normaltransr ) then
+                 ! n is odd and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal and n is odd ( a(0:n-1,0:n1-1) )
+                    ! t1 -> a(0,0), t2 -> a(0,1), s -> a(n1,0)
+                    ! t1 -> a(0), t2 -> a(n), s -> a(n1)
+                    call stdlib_zlauum( 'L', n1, a( 0 ), n, info )
+                    call stdlib_zherk( 'L', 'C', n1, n2, one, a( n1 ), n, one,a( 0 ), n )
+                    call stdlib_ztrmm( 'L', 'U', 'N', 'N', n2, n1, cone, a( n ), n,a( n1 ), n )
+                              
+                    call stdlib_zlauum( 'U', n2, a( n ), n, info )
+                 else
+                    ! srpa for upper, normal and n is odd ( a(0:n-1,0:n2-1)
+                    ! t1 -> a(n1+1,0), t2 -> a(n1,0), s -> a(0,0)
+                    ! t1 -> a(n2), t2 -> a(n1), s -> a(0)
+                    call stdlib_zlauum( 'L', n1, a( n2 ), n, info )
+                    call stdlib_zherk( 'L', 'N', n1, n2, one, a( 0 ), n, one,a( n2 ), n )
+                    call stdlib_ztrmm( 'R', 'U', 'C', 'N', n1, n2, cone, a( n1 ), n,a( 0 ), n )
+                              
+                    call stdlib_zlauum( 'U', n2, a( n1 ), n, info )
+                 end if
+              else
+                 ! n is odd and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is odd
+                    ! t1 -> a(0), t2 -> a(1), s -> a(0+n1*n1)
+                    call stdlib_zlauum( 'U', n1, a( 0 ), n1, info )
+                    call stdlib_zherk( 'U', 'N', n1, n2, one, a( n1*n1 ), n1, one,a( 0 ), n1 )
+                              
+                    call stdlib_ztrmm( 'R', 'L', 'N', 'N', n1, n2, cone, a( 1 ), n1,a( n1*n1 ), &
+                              n1 )
+                    call stdlib_zlauum( 'L', n2, a( 1 ), n1, info )
+                 else
+                    ! srpa for upper, transpose, and n is odd
+                    ! t1 -> a(0+n2*n2), t2 -> a(0+n1*n2), s -> a(0)
+                    call stdlib_zlauum( 'U', n1, a( n2*n2 ), n2, info )
+                    call stdlib_zherk( 'U', 'C', n1, n2, one, a( 0 ), n2, one,a( n2*n2 ), n2 )
+                              
+                    call stdlib_ztrmm( 'L', 'L', 'C', 'N', n2, n1, cone, a( n1*n2 ),n2, a( 0 ), &
+                              n2 )
+                    call stdlib_zlauum( 'L', n2, a( n1*n2 ), n2, info )
+                 end if
+              end if
+           else
+              ! n is even
+              if( normaltransr ) then
+                 ! n is even and transr = 'n'
+                 if( lower ) then
+                    ! srpa for lower, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(1,0), t2 -> a(0,0), s -> a(k+1,0)
+                    ! t1 -> a(1), t2 -> a(0), s -> a(k+1)
+                    call stdlib_zlauum( 'L', k, a( 1 ), n+1, info )
+                    call stdlib_zherk( 'L', 'C', k, k, one, a( k+1 ), n+1, one,a( 1 ), n+1 )
+                              
+                    call stdlib_ztrmm( 'L', 'U', 'N', 'N', k, k, cone, a( 0 ), n+1,a( k+1 ), n+1 )
+                              
+                    call stdlib_zlauum( 'U', k, a( 0 ), n+1, info )
+                 else
+                    ! srpa for upper, normal, and n is even ( a(0:n,0:k-1) )
+                    ! t1 -> a(k+1,0) ,  t2 -> a(k,0),   s -> a(0,0)
+                    ! t1 -> a(k+1), t2 -> a(k), s -> a(0)
+                    call stdlib_zlauum( 'L', k, a( k+1 ), n+1, info )
+                    call stdlib_zherk( 'L', 'N', k, k, one, a( 0 ), n+1, one,a( k+1 ), n+1 )
+                              
+                    call stdlib_ztrmm( 'R', 'U', 'C', 'N', k, k, cone, a( k ), n+1,a( 0 ), n+1 )
+                              
+                    call stdlib_zlauum( 'U', k, a( k ), n+1, info )
+                 end if
+              else
+                 ! n is even and transr = 'c'
+                 if( lower ) then
+                    ! srpa for lower, transpose, and n is even (see paper)
+                    ! t1 -> b(0,1), t2 -> b(0,0), s -> b(0,k+1),
+                    ! t1 -> a(0+k), t2 -> a(0+0), s -> a(0+k*(k+1)); lda=k
+                    call stdlib_zlauum( 'U', k, a( k ), k, info )
+                    call stdlib_zherk( 'U', 'N', k, k, one, a( k*( k+1 ) ), k, one,a( k ), k )
+                              
+                    call stdlib_ztrmm( 'R', 'L', 'N', 'N', k, k, cone, a( 0 ), k,a( k*( k+1 ) ), &
+                              k )
+                    call stdlib_zlauum( 'L', k, a( 0 ), k, info )
+                 else
+                    ! srpa for upper, transpose, and n is even (see paper)
+                    ! t1 -> b(0,k+1),     t2 -> b(0,k),   s -> b(0,0),
+                    ! t1 -> a(0+k*(k+1)), t2 -> a(0+k*k), s -> a(0+0)); lda=k
+                    call stdlib_zlauum( 'U', k, a( k*( k+1 ) ), k, info )
+                    call stdlib_zherk( 'U', 'C', k, k, one, a( 0 ), k, one,a( k*( k+1 ) ), k )
+                              
+                    call stdlib_ztrmm( 'L', 'L', 'C', 'N', k, k, cone, a( k*k ), k,a( 0 ), k )
+                              
+                    call stdlib_zlauum( 'L', k, a( k*k ), k, info )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zpftri
+
+     !> ZPOSV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> The Cholesky decomposition is used to factor A as
+     !> A = U**H* U,  if UPLO = 'U', or
+     !> A = L * L**H,  if UPLO = 'L',
+     !> where U is an upper triangular matrix and  L is a lower triangular
+     !> matrix.  The factored form of A is then used to solve the system of
+     !> equations A * X = B.
+
+     pure subroutine stdlib_zposv( uplo, n, nrhs, a, lda, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOSV ', -info )
+              return
+           end if
+           ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+           call stdlib_zpotrf( uplo, n, a, lda, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zpotrs( uplo, n, nrhs, a, lda, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zposv
+
+     !> ZPOSVX: uses the Cholesky factorization A = U**H*U or A = L*L**H to
+     !> compute the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian positive definite matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zposvx( fact, uplo, n, nrhs, a, lda, af, ldaf, equed,s, b, ldb, x, ldx, &
+               rcond, ferr, berr, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(inout) :: s(*)
+           complex(dp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: equil, nofact, rcequ
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, scond, smax, smin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rcequ = .false.
+           else
+              rcequ = stdlib_lsame( equed, 'Y' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rcequ .or. stdlib_lsame( equed, 'N' ) )&
+                      ) then
+              info = -9
+           else
+              if( rcequ ) then
+                 smin = bignum
+                 smax = zero
+                 do j = 1, n
+                    smin = min( smin, s( j ) )
+                    smax = max( smax, s( j ) )
+                 end do
+                 if( smin<=zero ) then
+                    info = -10
+                 else if( n>0 ) then
+                    scond = max( smin, smlnum ) / min( smax, bignum )
+                 else
+                    scond = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -12
+                 else if( ldx<max( 1, n ) ) then
+                    info = -14
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPOSVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_zpoequ( n, a, lda, s, scond, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_zlaqhe( uplo, n, a, lda, s, scond, amax, equed )
+                 rcequ = stdlib_lsame( equed, 'Y' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = s( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the cholesky factorization a = u**h *u or a = l*l**h.
+              call stdlib_zlacpy( uplo, n, n, a, lda, af, ldaf )
+              call stdlib_zpotrf( uplo, n, af, ldaf, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanhe( '1', uplo, n, a, lda, rwork )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zpocon( uplo, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zpotrs( uplo, n, nrhs, af, ldaf, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_zporfs( uplo, n, nrhs, a, lda, af, ldaf, b, ldb, x, ldx,ferr, berr, work, &
+                     rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( rcequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = s( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / scond
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zposvx
+
+     !> ZPTRFS: improves the computed solution to a system of linear
+     !> equations when the coefficient matrix is Hermitian positive definite
+     !> and tridiagonal, and provides error bounds and backward error
+     !> estimates for the solution.
+
+     pure subroutine stdlib_zptrfs( uplo, n, nrhs, d, e, df, ef, b, ldb, x, ldx,ferr, berr, work, &
+               rwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(in) :: d(*), df(*)
+           complex(dp), intent(in) :: b(ldb,*), e(*), ef(*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: itmax = 5
+           
+           
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: upper
+           integer(ilp) :: count, i, ix, j, nz
+           real(dp) :: eps, lstres, s, safe1, safe2, safmin
+           complex(dp) :: bi, cx, dx, ex, zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,max
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTRFS', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 .or. nrhs==0 ) then
+              do j = 1, nrhs
+                 ferr( j ) = zero
+                 berr( j ) = zero
+              end do
+              return
+           end if
+           ! nz = maximum number of nonzero elements in each row of a, plus 1
+           nz = 4
+           eps = stdlib_dlamch( 'EPSILON' )
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safe1 = nz*safmin
+           safe2 = safe1 / eps
+           ! do for each right hand side
+           loop_100: do j = 1, nrhs
+              count = 1
+              lstres = three
+              20 continue
+              ! loop until stopping criterion is satisfied.
+              ! compute residual r = b - a * x.  also compute
+              ! abs(a)*abs(x) + abs(b) for use in the backward error bound.
+              if( upper ) then
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = e( 1 )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = conjg( e( i-1 ) )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = e( i )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = conjg( e( n-1 ) )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              else
+                 if( n==1 ) then
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    work( 1 ) = bi - dx
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx )
+                 else
+                    bi = b( 1, j )
+                    dx = d( 1 )*x( 1, j )
+                    ex = conjg( e( 1 ) )*x( 2, j )
+                    work( 1 ) = bi - dx - ex
+                    rwork( 1 ) = cabs1( bi ) + cabs1( dx ) +cabs1( e( 1 ) )*cabs1( x( 2, j ) )
+                              
+                    do i = 2, n - 1
+                       bi = b( i, j )
+                       cx = e( i-1 )*x( i-1, j )
+                       dx = d( i )*x( i, j )
+                       ex = conjg( e( i ) )*x( i+1, j )
+                       work( i ) = bi - cx - dx - ex
+                       rwork( i ) = cabs1( bi ) +cabs1( e( i-1 ) )*cabs1( x( i-1, j ) ) +cabs1( &
+                                 dx ) + cabs1( e( i ) )*cabs1( x( i+1, j ) )
+                    end do
+                    bi = b( n, j )
+                    cx = e( n-1 )*x( n-1, j )
+                    dx = d( n )*x( n, j )
+                    work( n ) = bi - cx - dx
+                    rwork( n ) = cabs1( bi ) + cabs1( e( n-1 ) )*cabs1( x( n-1, j ) ) + cabs1( dx &
+                              )
+                 end if
+              end if
+              ! compute componentwise relative backward error from formula
+              ! max(i) ( abs(r(i)) / ( abs(a)*abs(x) + abs(b) )(i) )
+              ! where abs(z) is the componentwise absolute value of the matrix
+              ! or vector z.  if the i-th component of the denominator is less
+              ! than safe2, then safe1 is added to the i-th components of the
+              ! numerator and denominator before dividing.
+              s = zero
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    s = max( s, cabs1( work( i ) ) / rwork( i ) )
+                 else
+                    s = max( s, ( cabs1( work( i ) )+safe1 ) /( rwork( i )+safe1 ) )
+                 end if
+              end do
+              berr( j ) = s
+              ! test stopping criterion. continue iterating if
+                 ! 1) the residual berr(j) is larger than machine epsilon, and
+                 ! 2) berr(j) decreased by at least a factor of 2 during the
+                    ! last iteration, and
+                 ! 3) at most itmax iterations tried.
+              if( berr( j )>eps .and. two*berr( j )<=lstres .and.count<=itmax ) then
+                 ! update solution and try again.
+                 call stdlib_zpttrs( uplo, n, 1, df, ef, work, n, info )
+                 call stdlib_zaxpy( n, cmplx( one,KIND=dp), work, 1, x( 1, j ), 1 )
+                 lstres = berr( j )
+                 count = count + 1
+                 go to 20
+              end if
+              ! bound error from formula
+              ! norm(x - xtrue) / norm(x) .le. ferr =
+              ! norm( abs(inv(a))*
+                 ! ( abs(r) + nz*eps*( abs(a)*abs(x)+abs(b) ))) / norm(x)
+              ! where
+                ! norm(z) is the magnitude of the largest component of z
+                ! inv(a) is the inverse of a
+                ! abs(z) is the componentwise absolute value of the matrix or
+                   ! vector z
+                ! nz is the maximum number of nonzeros in any row of a, plus 1
+                ! eps is machine epsilon
+              ! the i-th component of abs(r)+nz*eps*(abs(a)*abs(x)+abs(b))
+              ! is incremented by safe1 if the i-th component of
+              ! abs(a)*abs(x) + abs(b) is less than safe2.
+              do i = 1, n
+                 if( rwork( i )>safe2 ) then
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i )
+                 else
+                    rwork( i ) = cabs1( work( i ) ) + nz*eps*rwork( i ) +safe1
+                 end if
+              end do
+              ix = stdlib_idamax( n, rwork, 1 )
+              ferr( j ) = rwork( ix )
+              ! estimate the norm of inv(a).
+              ! solve m(a) * x = e, where m(a) = (m(i,j)) is given by
+                 ! m(i,j) =  abs(a(i,j)), i = j,
+                 ! m(i,j) = -abs(a(i,j)), i .ne. j,
+              ! and e = [ 1, 1, ..., 1 ]**t.  note m(a) = m(l)*d*m(l)**h.
+              ! solve m(l) * x = e.
+              rwork( 1 ) = one
+              do i = 2, n
+                 rwork( i ) = one + rwork( i-1 )*abs( ef( i-1 ) )
+              end do
+              ! solve d * m(l)**h * x = b.
+              rwork( n ) = rwork( n ) / df( n )
+              do i = n - 1, 1, -1
+                 rwork( i ) = rwork( i ) / df( i ) +rwork( i+1 )*abs( ef( i ) )
+              end do
+              ! compute norm(inv(a)) = max(x(i)), 1<=i<=n.
+              ix = stdlib_idamax( n, rwork, 1 )
+              ferr( j ) = ferr( j )*abs( rwork( ix ) )
+              ! normalize error.
+              lstres = zero
+              do i = 1, n
+                 lstres = max( lstres, abs( x( i, j ) ) )
+              end do
+              if( lstres/=zero )ferr( j ) = ferr( j ) / lstres
+           end do loop_100
+           return
+     end subroutine stdlib_zptrfs
+
+     !> ZPTSV: computes the solution to a complex system of linear equations
+     !> A*X = B, where A is an N-by-N Hermitian positive definite tridiagonal
+     !> matrix, and X and B are N-by-NRHS matrices.
+     !> A is factored as A = L*D*L**H, and the factored form of A is then
+     !> used to solve the system of equations.
+
+     pure subroutine stdlib_zptsv( n, nrhs, d, e, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, n, nrhs
+           ! Array Arguments 
+           real(dp), intent(inout) :: d(*)
+           complex(dp), intent(inout) :: b(ldb,*), e(*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( ldb<max( 1, n ) ) then
+              info = -6
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTSV ', -info )
+              return
+           end if
+           ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+           call stdlib_zpttrf( n, d, e, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zpttrs( 'LOWER', n, nrhs, d, e, b, ldb, info )
+           end if
+           return
+     end subroutine stdlib_zptsv
+
+     !> ZPTSVX: uses the factorization A = L*D*L**H to compute the solution
+     !> to a complex system of linear equations A*X = B, where A is an
+     !> N-by-N Hermitian positive definite tridiagonal matrix and X and B
+     !> are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     pure subroutine stdlib_zptsvx( fact, n, nrhs, d, e, df, ef, b, ldb, x, ldx,rcond, ferr, berr,&
+                work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: fact
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(in) :: d(*)
+           real(dp), intent(inout) :: df(*)
+           complex(dp), intent(in) :: b(ldb,*), e(*)
+           complex(dp), intent(inout) :: ef(*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: nofact
+           real(dp) :: anorm
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           if( .not.nofact .and. .not.stdlib_lsame( fact, 'F' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldx<max( 1, n ) ) then
+              info = -11
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZPTSVX', -info )
+              return
+           end if
+           if( nofact ) then
+              ! compute the l*d*l**h (or u**h*d*u) factorization of a.
+              call stdlib_dcopy( n, d, 1, df, 1 )
+              if( n>1 )call stdlib_zcopy( n-1, e, 1, ef, 1 )
+              call stdlib_zpttrf( n, df, ef, info )
+              ! return if info is non-zero.
+              if( info>0 )then
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a.
+           anorm = stdlib_zlanht( '1', n, d, e )
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zptcon( n, df, ef, anorm, rcond, rwork, info )
+           ! compute the solution vectors x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zpttrs( 'LOWER', n, nrhs, df, ef, x, ldx, info )
+           ! use iterative refinement to improve the computed solutions and
+           ! compute error bounds and backward error estimates for them.
+           call stdlib_zptrfs( 'LOWER', n, nrhs, d, e, df, ef, b, ldb, x, ldx, ferr,berr, work, &
+                     rwork, info )
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           return
+     end subroutine stdlib_zptsvx
+
+     !> ZSTEDC: computes all eigenvalues and, optionally, eigenvectors of a
+     !> symmetric tridiagonal matrix using the divide and conquer method.
+     !> The eigenvectors of a full or band complex Hermitian matrix can also
+     !> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
+     !> matrix to tridiagonal form.
+     !> This code makes very mild assumptions about floating point
+     !> arithmetic. It will work on machines with a guard digit in
+     !> add/subtract, or on those binary machines without guard digits
+     !> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
+     !> It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.  See DLAED3 for details.
+
+     pure subroutine stdlib_zstedc( compz, n, d, e, z, ldz, work, lwork, rwork,lrwork, iwork, &
+               liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(out) :: work(*)
+           complex(dp), intent(inout) :: z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: finish, i, icompz, ii, j, k, lgn, liwmin, ll, lrwmin, lwmin, m, smlsiz,&
+                      start
+           real(dp) :: eps, orgnrm, p, tiny
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,int,log,max,mod,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           if( stdlib_lsame( compz, 'N' ) ) then
+              icompz = 0
+           else if( stdlib_lsame( compz, 'V' ) ) then
+              icompz = 1
+           else if( stdlib_lsame( compz, 'I' ) ) then
+              icompz = 2
+           else
+              icompz = -1
+           end if
+           if( icompz<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( ( ldz<1 ) .or.( icompz>0 .and. ldz<max( 1, n ) ) ) then
+              info = -6
+           end if
+           if( info==0 ) then
+              ! compute the workspace requirements
+              smlsiz = stdlib_ilaenv( 9, 'ZSTEDC', ' ', 0, 0, 0, 0 )
+              if( n<=1 .or. icompz==0 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else if( n<=smlsiz ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 2*( n - 1 )
+              else if( icompz==1 ) then
+                 lgn = int( log( real( n,KIND=dp) ) / log( two ),KIND=ilp)
+                 if( 2**lgn<n )lgn = lgn + 1
+                 if( 2**lgn<n )lgn = lgn + 1
+                 lwmin = n*n
+                 lrwmin = 1 + 3*n + 2*n*lgn + 4*n**2
+                 liwmin = 6 + 6*n + 5*n*lgn
+              else if( icompz==2 ) then
+                 lwmin = 1
+                 lrwmin = 1 + 4*n + 2*n**2
+                 liwmin = 3 + 5*n
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSTEDC', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( icompz/=0 )z( 1, 1 ) = one
+              return
+           end if
+           ! if the following conditional clause is removed, then the routine
+           ! will use the divide and conquer routine to compute only the
+           ! eigenvalues, which requires (3n + 3n**2) real workspace and
+           ! (2 + 5n + 2n lg(n)) integer workspace.
+           ! since on many architectures stdlib_dsterf is much faster than any other
+           ! algorithm for finding eigenvalues only, it is used here
+           ! as the default. if the conditional clause is removed, then
+           ! information on the size of workspace needs to be changed.
+           ! if compz = 'n', use stdlib_dsterf to compute the eigenvalues.
+           if( icompz==0 ) then
+              call stdlib_dsterf( n, d, e, info )
+              go to 70
+           end if
+           ! if n is smaller than the minimum divide size (smlsiz+1), then
+           ! solve the problem with another solver.
+           if( n<=smlsiz ) then
+              call stdlib_zsteqr( compz, n, d, e, z, ldz, rwork, info )
+           else
+              ! if compz = 'i', we simply call stdlib_dstedc instead.
+              if( icompz==2 ) then
+                 call stdlib_dlaset( 'FULL', n, n, zero, one, rwork, n )
+                 ll = n*n + 1
+                 call stdlib_dstedc( 'I', n, d, e, rwork, n,rwork( ll ), lrwork-ll+1, iwork, &
+                           liwork, info )
+                 do j = 1, n
+                    do i = 1, n
+                       z( i, j ) = rwork( ( j-1 )*n+i )
+                    end do
+                 end do
+                 go to 70
+              end if
+              ! from now on, only option left to be handled is compz = 'v',
+              ! i.e. icompz = 1.
+              ! scale.
+              orgnrm = stdlib_dlanst( 'M', n, d, e )
+              if( orgnrm==zero )go to 70
+              eps = stdlib_dlamch( 'EPSILON' )
+              start = 1
+              ! while ( start <= n )
+              30 continue
+              if( start<=n ) then
+                 ! let finish be the position of the next subdiagonal entry
+                 ! such that e( finish ) <= tiny or finish = n if no such
+                 ! subdiagonal exists.  the matrix identified by the elements
+                 ! between start and finish constitutes an independent
+                 ! sub-problem.
+                 finish = start
+                 40 continue
+                 if( finish<n ) then
+                    tiny = eps*sqrt( abs( d( finish ) ) )*sqrt( abs( d( finish+1 ) ) )
+                    if( abs( e( finish ) )>tiny ) then
+                       finish = finish + 1
+                       go to 40
+                    end if
+                 end if
+                 ! (sub) problem determined.  compute its size and solve it.
+                 m = finish - start + 1
+                 if( m>smlsiz ) then
+                    ! scale.
+                    orgnrm = stdlib_dlanst( 'M', m, d( start ), e( start ) )
+                    call stdlib_dlascl( 'G', 0, 0, orgnrm, one, m, 1, d( start ), m,info )
+                    call stdlib_dlascl( 'G', 0, 0, orgnrm, one, m-1, 1, e( start ),m-1, info )
+                              
+                    call stdlib_zlaed0( n, m, d( start ), e( start ), z( 1, start ),ldz, work, n, &
+                              rwork, iwork, info )
+                    if( info>0 ) then
+                       info = ( info / ( m+1 )+start-1 )*( n+1 ) +mod( info, ( m+1 ) ) + start - &
+                                 1
+                       go to 70
+                    end if
+                    ! scale back.
+                    call stdlib_dlascl( 'G', 0, 0, one, orgnrm, m, 1, d( start ), m,info )
+                 else
+                    call stdlib_dsteqr( 'I', m, d( start ), e( start ), rwork, m,rwork( m*m+1 ), &
+                              info )
+                    call stdlib_zlacrm( n, m, z( 1, start ), ldz, rwork, m, work, n,rwork( m*m+1 )&
+                               )
+                    call stdlib_zlacpy( 'A', n, m, work, n, z( 1, start ), ldz )
+                    if( info>0 ) then
+                       info = start*( n+1 ) + finish
+                       go to 70
+                    end if
+                 end if
+                 start = finish + 1
+                 go to 30
+              end if
+              ! endwhile
+              ! use selection sort to minimize swaps of eigenvectors
+              do ii = 2, n
+                i = ii - 1
+                k = i
+                p = d( i )
+                do j = ii, n
+                   if( d( j )<p ) then
+                      k = j
+                      p = d( j )
+                   end if
+                end do
+                if( k/=i ) then
+                   d( k ) = d( i )
+                   d( i ) = p
+                   call stdlib_zswap( n, z( 1, i ), 1, z( 1, k ), 1 )
+                end if
+              end do
+           end if
+           70 continue
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zstedc
+
+     !> ZSTEGR: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a real symmetric tridiagonal matrix T. Any such unreduced matrix has
+     !> a well defined set of pairwise different real eigenvalues, the corresponding
+     !> real eigenvectors are pairwise orthogonal.
+     !> The spectrum may be computed either completely or partially by specifying
+     !> either an interval (VL,VU] or a range of indices IL:IU for the desired
+     !> eigenvalues.
+     !> ZSTEGR is a compatibility wrapper around the improved ZSTEMR routine.
+     !> See ZSTEMR for further details.
+     !> One important change is that the ABSTOL parameter no longer provides any
+     !> benefit and hence is no longer used.
+     !> Note : ZSTEGR and ZSTEMR work only on machines which follow
+     !> IEEE-754 floating-point standard in their handling of infinities and
+     !> NaNs.  Normal execution may create these exceptiona values and hence
+     !> may abort due to a floating point exception in environments which
+     !> do not conform to the IEEE-754 standard.
+
+     pure subroutine stdlib_zstegr( jobz, range, n, d, e, vl, vu, il, iu,abstol, m, w, z, ldz, &
+               isuppz, work, lwork, iwork,liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range
+           integer(ilp), intent(in) :: il, iu, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: isuppz(*), iwork(*)
+           real(dp), intent(inout) :: d(*), e(*)
+           real(dp), intent(out) :: w(*), work(*)
+           complex(dp), intent(out) :: z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: tryrac
+           ! Executable Statements 
+           info = 0
+           tryrac = .false.
+           call stdlib_zstemr( jobz, range, n, d, e, vl, vu, il, iu,m, w, z, ldz, n, isuppz, &
+                     tryrac, work, lwork,iwork, liwork, info )
+     end subroutine stdlib_zstegr
+
+     !> ZTGSEN: reorders the generalized Schur decomposition of a complex
+     !> matrix pair (A, B) (in terms of an unitary equivalence trans-
+     !> formation Q**H * (A, B) * Z), so that a selected cluster of eigenvalues
+     !> appears in the leading diagonal blocks of the pair (A,B). The leading
+     !> columns of Q and Z form unitary bases of the corresponding left and
+     !> right eigenspaces (deflating subspaces). (A, B) must be in
+     !> generalized Schur canonical form, that is, A and B are both upper
+     !> triangular.
+     !> ZTGSEN also computes the generalized eigenvalues
+     !> w(j)= ALPHA(j) / BETA(j)
+     !> of the reordered matrix pair (A, B).
+     !> Optionally, the routine computes estimates of reciprocal condition
+     !> numbers for eigenvalues and eigenspaces. These are Difu[(A11,B11),
+     !> (A22,B22)] and Difl[(A11,B11), (A22,B22)], i.e. the separation(s)
+     !> between the matrix pairs (A11, B11) and (A22,B22) that correspond to
+     !> the selected cluster and the eigenvalues outside the cluster, resp.,
+     !> and norms of "projections" onto left and right eigenspaces w.r.t.
+     !> the selected cluster in the (1,1)-block.
+
+     pure subroutine stdlib_ztgsen( ijob, wantq, wantz, select, n, a, lda, b, ldb,alpha, beta, q, &
+               ldq, z, ldz, m, pl, pr, dif,work, lwork, iwork, liwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           logical(lk), intent(in) :: wantq, wantz
+           integer(ilp), intent(in) :: ijob, lda, ldb, ldq, ldz, liwork, lwork, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(out) :: pl, pr
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: dif(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*), q(ldq,*), z(ldz,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, swap, wantd, wantd1, wantd2, wantp
+           integer(ilp) :: i, ierr, ijb, k, kase, ks, liwmin, lwmin, mn2, n1, n2
+           real(dp) :: dscale, dsum, rdscal, safmin
+           complex(dp) :: temp1, temp2
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters
+           info = 0
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( ijob<0 .or. ijob>5 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -13
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSEN', -info )
+              return
+           end if
+           ierr = 0
+           wantp = ijob==1 .or. ijob>=4
+           wantd1 = ijob==2 .or. ijob==4
+           wantd2 = ijob==3 .or. ijob==5
+           wantd = wantd1 .or. wantd2
+           ! set m to the dimension of the specified pair of deflating
+           ! subspaces.
+           m = 0
+           if( .not.lquery .or. ijob/=0 ) then
+           do k = 1, n
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              if( k<n ) then
+                 if( select( k ) )m = m + 1
+              else
+                 if( select( n ) )m = m + 1
+              end if
+           end do
+           end if
+           if( ijob==1 .or. ijob==2 .or. ijob==4 ) then
+              lwmin = max( 1, 2*m*( n-m ) )
+              liwmin = max( 1, n+2 )
+           else if( ijob==3 .or. ijob==5 ) then
+              lwmin = max( 1, 4*m*( n-m ) )
+              liwmin = max( 1, 2*m*( n-m ), n+2 )
+           else
+              lwmin = 1
+              liwmin = 1
+           end if
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           if( lwork<lwmin .and. .not.lquery ) then
+              info = -21
+           else if( liwork<liwmin .and. .not.lquery ) then
+              info = -23
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==n .or. m==0 ) then
+              if( wantp ) then
+                 pl = one
+                 pr = one
+              end if
+              if( wantd ) then
+                 dscale = zero
+                 dsum = one
+                 do i = 1, n
+                    call stdlib_zlassq( n, a( 1, i ), 1, dscale, dsum )
+                    call stdlib_zlassq( n, b( 1, i ), 1, dscale, dsum )
+                 end do
+                 dif( 1 ) = dscale*sqrt( dsum )
+                 dif( 2 ) = dif( 1 )
+              end if
+              go to 70
+           end if
+           ! get machine constant
+           safmin = stdlib_dlamch( 'S' )
+           ! collect the selected blocks at the top-left corner of (a, b).
+           ks = 0
+           do k = 1, n
+              swap = select( k )
+              if( swap ) then
+                 ks = ks + 1
+                 ! swap the k-th block to position ks. compute unitary q
+                 ! and z that will swap adjacent diagonal blocks in (a, b).
+                 if( k/=ks )call stdlib_ztgexc( wantq, wantz, n, a, lda, b, ldb, q, ldq, z,ldz, k,&
+                            ks, ierr )
+                 if( ierr>0 ) then
+                    ! swap is rejected: exit.
+                    info = 1
+                    if( wantp ) then
+                       pl = zero
+                       pr = zero
+                    end if
+                    if( wantd ) then
+                       dif( 1 ) = zero
+                       dif( 2 ) = zero
+                    end if
+                    go to 70
+                 end if
+              end if
+           end do
+           if( wantp ) then
+              ! solve generalized sylvester equation for r and l:
+                         ! a11 * r - l * a22 = a12
+                         ! b11 * r - l * b22 = b12
+              n1 = m
+              n2 = n - m
+              i = n1 + 1
+              call stdlib_zlacpy( 'FULL', n1, n2, a( 1, i ), lda, work, n1 )
+              call stdlib_zlacpy( 'FULL', n1, n2, b( 1, i ), ldb, work( n1*n2+1 ),n1 )
+              ijb = 0
+              call stdlib_ztgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b( i,&
+               i ), ldb, work( n1*n2+1 ), n1,dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-2*n1*n2, &
+                         iwork, ierr )
+              ! estimate the reciprocal of norms of "projections" onto
+              ! left and right eigenspaces
+              rdscal = zero
+              dsum = one
+              call stdlib_zlassq( n1*n2, work, 1, rdscal, dsum )
+              pl = rdscal*sqrt( dsum )
+              if( pl==zero ) then
+                 pl = one
+              else
+                 pl = dscale / ( sqrt( dscale*dscale / pl+pl )*sqrt( pl ) )
+              end if
+              rdscal = zero
+              dsum = one
+              call stdlib_zlassq( n1*n2, work( n1*n2+1 ), 1, rdscal, dsum )
+              pr = rdscal*sqrt( dsum )
+              if( pr==zero ) then
+                 pr = one
+              else
+                 pr = dscale / ( sqrt( dscale*dscale / pr+pr )*sqrt( pr ) )
+              end if
+           end if
+           if( wantd ) then
+              ! compute estimates difu and difl.
+              if( wantd1 ) then
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = idifjb
+                 ! frobenius norm-based difu estimate.
+                 call stdlib_ztgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda, work,n1, b, ldb, b(&
+                  i, i ), ldb, work( n1*n2+1 ),n1, dscale, dif( 1 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+                 ! frobenius norm-based difl estimate.
+                 call stdlib_ztgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda, work,n2, b( i, i ),&
+                  ldb, b, ldb, work( n1*n2+1 ),n2, dscale, dif( 2 ), work( n1*n2*2+1 ),lwork-&
+                            2*n1*n2, iwork, ierr )
+              else
+                 ! compute 1-norm-based estimates of difu and difl using
+                 ! reversed communication with stdlib_zlacn2. in each step a
+                 ! generalized sylvester equation or a transposed variant
+                 ! is solved.
+                 kase = 0
+                 n1 = m
+                 n2 = n - m
+                 i = n1 + 1
+                 ijb = 0
+                 mn2 = 2*n1*n2
+                 ! 1-norm-based estimate of difu.
+                 40 continue
+                 call stdlib_zlacn2( mn2, work( mn2+1 ), work, dif( 1 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_ztgsyl( 'N', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_ztgsyl( 'C', ijb, n1, n2, a, lda, a( i, i ), lda,work, n1, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n1, dscale, dif( 1 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 40
+                 end if
+                 dif( 1 ) = dscale / dif( 1 )
+                 ! 1-norm-based estimate of difl.
+                 50 continue
+                 call stdlib_zlacn2( mn2, work( mn2+1 ), work, dif( 2 ), kase,isave )
+                 if( kase/=0 ) then
+                    if( kase==1 ) then
+                       ! solve generalized sylvester equation
+                       call stdlib_ztgsyl( 'N', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b( &
+                       i, i ), ldb, b, ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    else
+                       ! solve the transposed variant.
+                       call stdlib_ztgsyl( 'C', ijb, n2, n1, a( i, i ), lda, a, lda,work, n2, b, &
+                       ldb, b( i, i ), ldb,work( n1*n2+1 ), n2, dscale, dif( 2 ),work( n1*n2*2+1 )&
+                                 , lwork-2*n1*n2, iwork,ierr )
+                    end if
+                    go to 50
+                 end if
+                 dif( 2 ) = dscale / dif( 2 )
+              end if
+           end if
+           ! if b(k,k) is complex, make it real and positive (normalization
+           ! of the generalized schur form) and store the generalized
+           ! eigenvalues of reordered pair (a, b)
+           do k = 1, n
+              dscale = abs( b( k, k ) )
+              if( dscale>safmin ) then
+                 temp1 = conjg( b( k, k ) / dscale )
+                 temp2 = b( k, k ) / dscale
+                 b( k, k ) = dscale
+                 call stdlib_zscal( n-k, temp1, b( k, k+1 ), ldb )
+                 call stdlib_zscal( n-k+1, temp1, a( k, k ), lda )
+                 if( wantq )call stdlib_zscal( n, temp2, q( 1, k ), 1 )
+              else
+                 b( k, k ) = cmplx( zero, zero,KIND=dp)
+              end if
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+           end do
+           70 continue
+           work( 1 ) = lwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_ztgsen
+
+     !> ZTGSNA: estimates reciprocal condition numbers for specified
+     !> eigenvalues and/or eigenvectors of a matrix pair (A, B).
+     !> (A, B) must be in generalized Schur canonical form, that is, A and
+     !> B are both upper triangular.
+
+     pure subroutine stdlib_ztgsna( job, howmny, select, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, s, &
+               dif, mm, m, work, lwork,iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: howmny, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, mm, n
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: dif(*), s(*)
+           complex(dp), intent(in) :: a(lda,*), b(ldb,*), vl(ldvl,*), vr(ldvr,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: idifjb = 3
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, somcon, wantbh, wantdf, wants
+           integer(ilp) :: i, ierr, ifst, ilst, k, ks, lwmin, n1, n2
+           real(dp) :: bignum, cond, eps, lnrm, rnrm, scale, smlnum
+           complex(dp) :: yhax, yhbx
+           ! Local Arrays 
+           complex(dp) :: dummy(1), dummy1(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,max
+           ! Executable Statements 
+           ! decode and test the input parameters
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantdf = stdlib_lsame( job, 'V' ) .or. wantbh
+           somcon = stdlib_lsame( howmny, 'S' )
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.wants .and. .not.wantdf ) then
+              info = -1
+           else if( .not.stdlib_lsame( howmny, 'A' ) .and. .not.somcon ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( wants .and. ldvl<n ) then
+              info = -10
+           else if( wants .and. ldvr<n ) then
+              info = -12
+           else
+              ! set m to the number of eigenpairs for which condition numbers
+              ! are required, and test mm.
+              if( somcon ) then
+                 m = 0
+                 do k = 1, n
+                    if( select( k ) )m = m + 1
+                 end do
+              else
+                 m = n
+              end if
+              if( n==0 ) then
+                 lwmin = 1
+              else if( stdlib_lsame( job, 'V' ) .or. stdlib_lsame( job, 'B' ) ) then
+                 lwmin = 2*n*n
+              else
+                 lwmin = n
+              end if
+              work( 1 ) = lwmin
+              if( mm<m ) then
+                 info = -15
+              else if( lwork<lwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTGSNA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' ) / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ks = 0
+           loop_20: do k = 1, n
+              ! determine whether condition numbers are required for the k-th
+              ! eigenpair.
+              if( somcon ) then
+                 if( .not.select( k ) )cycle loop_20
+              end if
+              ks = ks + 1
+              if( wants ) then
+                 ! compute the reciprocal condition number of the k-th
+                 ! eigenvalue.
+                 rnrm = stdlib_dznrm2( n, vr( 1, ks ), 1 )
+                 lnrm = stdlib_dznrm2( n, vl( 1, ks ), 1 )
+                 call stdlib_zgemv( 'N', n, n, cmplx( one, zero,KIND=dp), a, lda,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=dp), work, 1 )
+                 yhax = stdlib_zdotc( n, work, 1, vl( 1, ks ), 1 )
+                 call stdlib_zgemv( 'N', n, n, cmplx( one, zero,KIND=dp), b, ldb,vr( 1, ks ), 1, &
+                           cmplx( zero, zero,KIND=dp), work, 1 )
+                 yhbx = stdlib_zdotc( n, work, 1, vl( 1, ks ), 1 )
+                 cond = stdlib_dlapy2( abs( yhax ), abs( yhbx ) )
+                 if( cond==zero ) then
+                    s( ks ) = -one
+                 else
+                    s( ks ) = cond / ( rnrm*lnrm )
+                 end if
+              end if
+              if( wantdf ) then
+                 if( n==1 ) then
+                    dif( ks ) = stdlib_dlapy2( abs( a( 1, 1 ) ), abs( b( 1, 1 ) ) )
+                 else
+                    ! estimate the reciprocal condition number of the k-th
+                    ! eigenvectors.
+                    ! copy the matrix (a, b) to the array work and move the
+                    ! (k,k)th pair to the (1,1) position.
+                    call stdlib_zlacpy( 'FULL', n, n, a, lda, work, n )
+                    call stdlib_zlacpy( 'FULL', n, n, b, ldb, work( n*n+1 ), n )
+                    ifst = k
+                    ilst = 1
+                    call stdlib_ztgexc( .false., .false., n, work, n, work( n*n+1 ),n, dummy, 1, &
+                              dummy1, 1, ifst, ilst, ierr )
+                    if( ierr>0 ) then
+                       ! ill-conditioned problem - swap rejected.
+                       dif( ks ) = zero
+                    else
+                       ! reordering successful, solve generalized sylvester
+                       ! equation for r and l,
+                                  ! a22 * r - l * a11 = a12
+                                  ! b22 * r - l * b11 = b12,
+                       ! and compute estimate of difl[(a11,b11), (a22, b22)].
+                       n1 = 1
+                       n2 = n - n1
+                       i = n*n + 1
+                       call stdlib_ztgsyl( 'N', idifjb, n2, n1, work( n*n1+n1+1 ),n, work, n, &
+                       work( n1+1 ), n,work( n*n1+n1+i ), n, work( i ), n,work( n1+i ), n, scale, &
+                                 dif( ks ), dummy,1, iwork, ierr )
+                    end if
+                 end if
+              end if
+           end do loop_20
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ztgsna
+
+     !> ZTRSEN: reorders the Schur factorization of a complex matrix
+     !> A = Q*T*Q**H, so that a selected cluster of eigenvalues appears in
+     !> the leading positions on the diagonal of the upper triangular matrix
+     !> T, and the leading columns of Q form an orthonormal basis of the
+     !> corresponding right invariant subspace.
+     !> Optionally the routine computes the reciprocal condition numbers of
+     !> the cluster of eigenvalues and/or the invariant subspace.
+
+     subroutine stdlib_ztrsen( job, compq, select, n, t, ldt, q, ldq, w, m, s,sep, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: compq, job
+           integer(ilp), intent(out) :: info, m
+           integer(ilp), intent(in) :: ldq, ldt, lwork, n
+           real(dp), intent(out) :: s, sep
+           ! Array Arguments 
+           logical(lk), intent(in) :: select(*)
+           complex(dp), intent(inout) :: q(ldq,*), t(ldt,*)
+           complex(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantbh, wantq, wants, wantsp
+           integer(ilp) :: ierr, k, kase, ks, lwmin, n1, n2, nn
+           real(dp) :: est, rnorm, scale
+           ! Local Arrays 
+           integer(ilp) :: isave(3)
+           real(dp) :: rwork(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode and test the input parameters.
+           wantbh = stdlib_lsame( job, 'B' )
+           wants = stdlib_lsame( job, 'E' ) .or. wantbh
+           wantsp = stdlib_lsame( job, 'V' ) .or. wantbh
+           wantq = stdlib_lsame( compq, 'V' )
+           ! set m to the number of selected eigenvalues.
+           m = 0
+           do k = 1, n
+              if( select( k ) )m = m + 1
+           end do
+           n1 = m
+           n2 = n - m
+           nn = n1*n2
+           info = 0
+           lquery = ( lwork==-1 )
+           if( wantsp ) then
+              lwmin = max( 1, 2*nn )
+           else if( stdlib_lsame( job, 'N' ) ) then
+              lwmin = 1
+           else if( stdlib_lsame( job, 'E' ) ) then
+              lwmin = max( 1, nn )
+           end if
+           if( .not.stdlib_lsame( job, 'N' ) .and. .not.wants .and. .not.wantsp )then
+              info = -1
+           else if( .not.stdlib_lsame( compq, 'N' ) .and. .not.wantq ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( ldt<max( 1, n ) ) then
+              info = -6
+           else if( ldq<1 .or. ( wantq .and. ldq<n ) ) then
+              info = -8
+           else if( lwork<lwmin .and. .not.lquery ) then
+              info = -14
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZTRSEN', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==n .or. m==0 ) then
+              if( wants )s = one
+              if( wantsp )sep = stdlib_zlange( '1', n, n, t, ldt, rwork )
+              go to 40
+           end if
+           ! collect the selected eigenvalues at the top left corner of t.
+           ks = 0
+           do k = 1, n
+              if( select( k ) ) then
+                 ks = ks + 1
+                 ! swap the k-th eigenvalue to position ks.
+                 if( k/=ks )call stdlib_ztrexc( compq, n, t, ldt, q, ldq, k, ks, ierr )
+              end if
+           end do
+           if( wants ) then
+              ! solve the sylvester equation for r:
+                 ! t11*r - r*t22 = scale*t12
+              call stdlib_zlacpy( 'F', n1, n2, t( 1, n1+1 ), ldt, work, n1 )
+              call stdlib_ztrsyl( 'N', 'N', -1, n1, n2, t, ldt, t( n1+1, n1+1 ),ldt, work, n1, &
+                        scale, ierr )
+              ! estimate the reciprocal of the condition number of the cluster
+              ! of eigenvalues.
+              rnorm = stdlib_zlange( 'F', n1, n2, work, n1, rwork )
+              if( rnorm==zero ) then
+                 s = one
+              else
+                 s = scale / ( sqrt( scale*scale / rnorm+rnorm )*sqrt( rnorm ) )
+              end if
+           end if
+           if( wantsp ) then
+              ! estimate sep(t11,t22).
+              est = zero
+              kase = 0
+              30 continue
+              call stdlib_zlacn2( nn, work( nn+1 ), work, est, kase, isave )
+              if( kase/=0 ) then
+                 if( kase==1 ) then
+                    ! solve t11*r - r*t22 = scale*x.
+                    call stdlib_ztrsyl( 'N', 'N', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 else
+                    ! solve t11**h*r - r*t22**h = scale*x.
+                    call stdlib_ztrsyl( 'C', 'C', -1, n1, n2, t, ldt,t( n1+1, n1+1 ), ldt, work, &
+                              n1, scale,ierr )
+                 end if
+                 go to 30
+              end if
+              sep = scale / est
+           end if
+           40 continue
+           ! copy reordered eigenvalues to w.
+           do k = 1, n
+              w( k ) = t( k, k )
+           end do
+           work( 1 ) = lwmin
+           return
+     end subroutine stdlib_ztrsen
+
+     !> ZUNBDB1: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. Q must be no larger than P,
+     !> M-P, or M-Q. Routines ZUNBDB2, ZUNBDB3, and ZUNBDB4 handle cases in
+     !> which Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are Q-by-Q bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < q .or. m-p < q ) then
+              info = -2
+           else if( q < 0 .or. m-q < q ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-2
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., q of x11 and x21
+           do i = 1, q
+              call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              theta(i) = atan2( real( x21(i,i),KIND=dp), real( x11(i,i),KIND=dp) )
+              c = cos( theta(i) )
+              s = sin( theta(i) )
+              x11(i,i) = cone
+              x21(i,i) = cone
+              call stdlib_zlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+              call stdlib_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+              if( i < q ) then
+                 call stdlib_zdrot( q-i, x11(i,i+1), ldx11, x21(i,i+1), ldx21, c,s )
+                 call stdlib_zlacgv( q-i, x21(i,i+1), ldx21 )
+                 call stdlib_zlarfgp( q-i, x21(i,i+1), x21(i,i+2), ldx21, tauq1(i) )
+                 s = real( x21(i,i+1),KIND=dp)
+                 x21(i,i+1) = cone
+                 call stdlib_zlarf( 'R', p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+                 call stdlib_zlarf( 'R', m-p-i, q-i, x21(i,i+1), ldx21, tauq1(i),x21(i+1,i+1), &
+                           ldx21, work(ilarf) )
+                 call stdlib_zlacgv( q-i, x21(i,i+1), ldx21 )
+                 c = sqrt( stdlib_dznrm2( p-i, x11(i+1,i+1), 1 )**2+ stdlib_dznrm2( m-p-i, x21(i+&
+                           1,i+1), 1 )**2 )
+                 phi(i) = atan2( s, c )
+                 call stdlib_zunbdb5( p-i, m-p-i, q-i-1, x11(i+1,i+1), 1,x21(i+1,i+1), 1, x11(i+1,&
+                           i+2), ldx11,x21(i+1,i+2), ldx21, work(iorbdb5), lorbdb5,childinfo )
+              end if
+           end do
+           return
+     end subroutine stdlib_zunbdb1
+
+     !> ZUNBDB2: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. P must be no larger than M-P,
+     !> Q, or M-Q. Routines ZUNBDB1, ZUNBDB3, and ZUNBDB4 handle cases in
+     !> which P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are P-by-P bidiagonal matrices represented implicitly by
+     !> angles THETA, PHI.
+
+     subroutine stdlib_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < 0 .or. p > m-p ) then
+              info = -2
+           else if( q < 0 .or. q < p .or. m-q < p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p-1, m-p, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB2', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., p of x11 and x21
+           do i = 1, p
+              if( i > 1 ) then
+                 call stdlib_zdrot( q-i+1, x11(i,i), ldx11, x21(i-1,i), ldx21, c,s )
+              end if
+              call stdlib_zlacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_zlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              c = real( x11(i,i),KIND=dp)
+              x11(i,i) = cone
+              call stdlib_zlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_zlarf( 'R', m-p-i+1, q-i+1, x11(i,i), ldx11, tauq1(i),x21(i,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_zlacgv( q-i+1, x11(i,i), ldx11 )
+              s = sqrt( stdlib_dznrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_dznrm2( m-p-i+1, x21(i,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_zunbdb5( p-i, m-p-i+1, q-i, x11(i+1,i), 1, x21(i,i), 1,x11(i+1,i+1), &
+                        ldx11, x21(i,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_zscal( p-i, cnegone, x11(i+1,i), 1 )
+              call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              if( i < p ) then
+                 call stdlib_zlarfgp( p-i, x11(i+1,i), x11(i+2,i), 1, taup1(i) )
+                 phi(i) = atan2( real( x11(i+1,i),KIND=dp), real( x21(i,i),KIND=dp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x11(i+1,i) = cone
+                 call stdlib_zlarf( 'L', p-i, q-i, x11(i+1,i), 1, conjg(taup1(i)),x11(i+1,i+1), &
+                           ldx11, work(ilarf) )
+              end if
+              x21(i,i) = cone
+              call stdlib_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x21 to the identity matrix
+           do i = p + 1, q
+              call stdlib_zlarfgp( m-p-i+1, x21(i,i), x21(i+1,i), 1, taup2(i) )
+              x21(i,i) = cone
+              call stdlib_zlarf( 'L', m-p-i+1, q-i, x21(i,i), 1, conjg(taup2(i)),x21(i,i+1), &
+                        ldx21, work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_zunbdb2
+
+     !> ZUNBDB3: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-P must be no larger than P,
+     !> Q, or M-Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB4 handle cases in
+     !> which M-P is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-P)-by-(M-P) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           complex(dp), intent(out) :: taup1(*), taup2(*), tauq1(*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( 2*p < m .or. p > m ) then
+              info = -2
+           else if( q < m-p .or. m-q < m-p ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( p, m-p-1, q-1 )
+              iorbdb5 = 2
+              lorbdb5 = q-1
+              lworkopt = max( ilarf+llarf-1, iorbdb5+lorbdb5-1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB3', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce rows 1, ..., m-p of x11 and x21
+           do i = 1, m-p
+              if( i > 1 ) then
+                 call stdlib_zdrot( q-i+1, x11(i-1,i), ldx11, x21(i,i), ldx11, c,s )
+              end if
+              call stdlib_zlacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_zlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              s = real( x21(i,i),KIND=dp)
+              x21(i,i) = cone
+              call stdlib_zlarf( 'R', p-i+1, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_zlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_zlacgv( q-i+1, x21(i,i), ldx21 )
+              c = sqrt( stdlib_dznrm2( p-i+1, x11(i,i), 1 )**2+ stdlib_dznrm2( m-p-i, x21(i+1,i), &
+                        1 )**2 )
+              theta(i) = atan2( s, c )
+              call stdlib_zunbdb5( p-i+1, m-p-i, q-i, x11(i,i), 1, x21(i+1,i), 1,x11(i,i+1), &
+                        ldx11, x21(i+1,i+1), ldx21,work(iorbdb5), lorbdb5, childinfo )
+              call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              if( i < m-p ) then
+                 call stdlib_zlarfgp( m-p-i, x21(i+1,i), x21(i+2,i), 1, taup2(i) )
+                 phi(i) = atan2( real( x21(i+1,i),KIND=dp), real( x11(i,i),KIND=dp) )
+                 c = cos( phi(i) )
+                 s = sin( phi(i) )
+                 x21(i+1,i) = cone
+                 call stdlib_zlarf( 'L', m-p-i, q-i, x21(i+1,i), 1,conjg(taup2(i)), x21(i+1,i+1), &
+                           ldx21,work(ilarf) )
+              end if
+              x11(i,i) = cone
+              call stdlib_zlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           ! reduce the bottom-right portion of x11 to the identity matrix
+           do i = m-p + 1, q
+              call stdlib_zlarfgp( p-i+1, x11(i,i), x11(i+1,i), 1, taup1(i) )
+              x11(i,i) = cone
+              call stdlib_zlarf( 'L', p-i+1, q-i, x11(i,i), 1, conjg(taup1(i)),x11(i,i+1), ldx11, &
+                        work(ilarf) )
+           end do
+           return
+     end subroutine stdlib_zunbdb3
+
+     !> ZUNBDB4: simultaneously bidiagonalizes the blocks of a tall and skinny
+     !> matrix X with orthonomal columns:
+     !> [ B11 ]
+     !> [ X11 ]   [ P1 |    ] [  0  ]
+     !> [-----] = [---------] [-----] Q1**T .
+     !> [ X21 ]   [    | P2 ] [ B21 ]
+     !> [  0  ]
+     !> X11 is P-by-Q, and X21 is (M-P)-by-Q. M-Q must be no larger than P,
+     !> M-P, or Q. Routines ZUNBDB1, ZUNBDB2, and ZUNBDB3 handle cases in
+     !> which M-Q is not the minimum dimension.
+     !> The unitary matrices P1, P2, and Q1 are P-by-P, (M-P)-by-(M-P),
+     !> and (M-Q)-by-(M-Q), respectively. They are represented implicitly by
+     !> Householder vectors.
+     !> B11 and B12 are (M-Q)-by-(M-Q) bidiagonal matrices represented
+     !> implicitly by angles THETA, PHI.
+
+     subroutine stdlib_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, phi,taup1, taup2, tauq1, &
+               phantom, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lwork, m, p, q, ldx11, ldx21
+           ! Array Arguments 
+           real(dp), intent(out) :: phi(*), theta(*)
+           complex(dp), intent(out) :: phantom(*), taup1(*), taup2(*), tauq1(*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+        ! ====================================================================
+           
+           ! Local Scalars 
+           real(dp) :: c, s
+           integer(ilp) :: childinfo, i, ilarf, iorbdb5, j, llarf, lorbdb5, lworkmin, &
+                     lworkopt
+           logical(lk) :: lquery
+           ! Intrinsic Function 
+           intrinsic :: atan2,cos,max,sin,sqrt
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           lquery = lwork == -1
+           if( m < 0 ) then
+              info = -1
+           else if( p < m-q .or. m-p < m-q ) then
+              info = -2
+           else if( q < m-q .or. q > m ) then
+              info = -3
+           else if( ldx11 < max( 1, p ) ) then
+              info = -5
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -7
+           end if
+           ! compute workspace
+           if( info == 0 ) then
+              ilarf = 2
+              llarf = max( q-1, p-1, m-p-1 )
+              iorbdb5 = 2
+              lorbdb5 = q
+              lworkopt = ilarf + llarf - 1
+              lworkopt = max( lworkopt, iorbdb5 + lorbdb5 - 1 )
+              lworkmin = lworkopt
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                info = -14
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNBDB4', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! reduce columns 1, ..., m-q of x11 and x21
+           do i = 1, m-q
+              if( i == 1 ) then
+                 do j = 1, m
+                    phantom(j) = czero
+                 end do
+                 call stdlib_zunbdb5( p, m-p, q, phantom(1), 1, phantom(p+1), 1,x11, ldx11, x21, &
+                           ldx21, work(iorbdb5),lorbdb5, childinfo )
+                 call stdlib_zscal( p, cnegone, phantom(1), 1 )
+                 call stdlib_zlarfgp( p, phantom(1), phantom(2), 1, taup1(1) )
+                 call stdlib_zlarfgp( m-p, phantom(p+1), phantom(p+2), 1, taup2(1) )
+                 theta(i) = atan2( real( phantom(1),KIND=dp), real( phantom(p+1),KIND=dp) )
+                           
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 phantom(1) = cone
+                 phantom(p+1) = cone
+                 call stdlib_zlarf( 'L', p, q, phantom(1), 1, conjg(taup1(1)), x11,ldx11, work(&
+                           ilarf) )
+                 call stdlib_zlarf( 'L', m-p, q, phantom(p+1), 1, conjg(taup2(1)),x21, ldx21, &
+                           work(ilarf) )
+              else
+                 call stdlib_zunbdb5( p-i+1, m-p-i+1, q-i+1, x11(i,i-1), 1,x21(i,i-1), 1, x11(i,i)&
+                           , ldx11, x21(i,i),ldx21, work(iorbdb5), lorbdb5, childinfo )
+                 call stdlib_zscal( p-i+1, cnegone, x11(i,i-1), 1 )
+                 call stdlib_zlarfgp( p-i+1, x11(i,i-1), x11(i+1,i-1), 1, taup1(i) )
+                 call stdlib_zlarfgp( m-p-i+1, x21(i,i-1), x21(i+1,i-1), 1,taup2(i) )
+                 theta(i) = atan2( real( x11(i,i-1),KIND=dp), real( x21(i,i-1),KIND=dp) )
+                 c = cos( theta(i) )
+                 s = sin( theta(i) )
+                 x11(i,i-1) = cone
+                 x21(i,i-1) = cone
+                 call stdlib_zlarf( 'L', p-i+1, q-i+1, x11(i,i-1), 1,conjg(taup1(i)), x11(i,i), &
+                           ldx11, work(ilarf) )
+                 call stdlib_zlarf( 'L', m-p-i+1, q-i+1, x21(i,i-1), 1,conjg(taup2(i)), x21(i,i), &
+                           ldx21, work(ilarf) )
+              end if
+              call stdlib_zdrot( q-i+1, x11(i,i), ldx11, x21(i,i), ldx21, s, -c )
+              call stdlib_zlacgv( q-i+1, x21(i,i), ldx21 )
+              call stdlib_zlarfgp( q-i+1, x21(i,i), x21(i,i+1), ldx21, tauq1(i) )
+              c = real( x21(i,i),KIND=dp)
+              x21(i,i) = cone
+              call stdlib_zlarf( 'R', p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_zlarf( 'R', m-p-i, q-i+1, x21(i,i), ldx21, tauq1(i),x21(i+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_zlacgv( q-i+1, x21(i,i), ldx21 )
+              if( i < m-q ) then
+                 s = sqrt( stdlib_dznrm2( p-i, x11(i+1,i), 1 )**2+ stdlib_dznrm2( m-p-i, x21(i+1,&
+                           i), 1 )**2 )
+                 phi(i) = atan2( s, c )
+              end if
+           end do
+           ! reduce the bottom-right portion of x11 to [ i 0 ]
+           do i = m - q + 1, p
+              call stdlib_zlacgv( q-i+1, x11(i,i), ldx11 )
+              call stdlib_zlarfgp( q-i+1, x11(i,i), x11(i,i+1), ldx11, tauq1(i) )
+              x11(i,i) = cone
+              call stdlib_zlarf( 'R', p-i, q-i+1, x11(i,i), ldx11, tauq1(i),x11(i+1,i), ldx11, &
+                        work(ilarf) )
+              call stdlib_zlarf( 'R', q-p, q-i+1, x11(i,i), ldx11, tauq1(i),x21(m-q+1,i), ldx21, &
+                        work(ilarf) )
+              call stdlib_zlacgv( q-i+1, x11(i,i), ldx11 )
+           end do
+           ! reduce the bottom-right portion of x21 to [ 0 i ]
+           do i = p + 1, q
+              call stdlib_zlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+              call stdlib_zlarfgp( q-i+1, x21(m-q+i-p,i), x21(m-q+i-p,i+1), ldx21,tauq1(i) )
+                        
+              x21(m-q+i-p,i) = cone
+              call stdlib_zlarf( 'R', q-i, q-i+1, x21(m-q+i-p,i), ldx21, tauq1(i),x21(m-q+i-p+1,i)&
+                        , ldx21, work(ilarf) )
+              call stdlib_zlacgv( q-i+1, x21(m-q+i-p,i), ldx21 )
+           end do
+           return
+     end subroutine stdlib_zunbdb4
+
+     !> ZUNCSD2BY1: computes the CS decomposition of an M-by-Q matrix X with
+     !> orthonormal columns that has been partitioned into a 2-by-1 block
+     !> structure:
+     !> [  I1 0  0 ]
+     !> [  0  C  0 ]
+     !> [ X11 ]   [ U1 |    ] [  0  0  0 ]
+     !> X = [-----] = [---------] [----------] V1**T .
+     !> [ X21 ]   [    | U2 ] [  0  0  0 ]
+     !> [  0  S  0 ]
+     !> [  0  0  I2]
+     !> X11 is P-by-Q. The unitary matrices U1, U2, and V1 are P-by-P,
+     !> (M-P)-by-(M-P), and Q-by-Q, respectively. C and S are R-by-R
+     !> nonnegative diagonal matrices satisfying C^2 + S^2 = I, in which
+     !> R = MIN(P,M-P,Q,M-Q). I1 is a K1-by-K1 identity matrix and I2 is a
+     !> K2-by-K2 identity matrix, where K1 = MAX(Q+P-M,0), K2 = MAX(Q-P,0).
+
+     subroutine stdlib_zuncsd2by1( jobu1, jobu2, jobv1t, m, p, q, x11, ldx11,x21, ldx21, theta, &
+               u1, ldu1, u2, ldu2, v1t,ldv1t, work, lwork, rwork, lrwork, iwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu1, jobu2, jobv1t
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldu1, ldu2, ldv1t, lwork, ldx11, ldx21, m, p, q
+           integer(ilp), intent(in) :: lrwork
+           integer(ilp) :: lrworkmin, lrworkopt
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           real(dp), intent(out) :: theta(*)
+           complex(dp), intent(out) :: u1(ldu1,*), u2(ldu2,*), v1t(ldv1t,*), work(*)
+           complex(dp), intent(inout) :: x11(ldx11,*), x21(ldx21,*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: childinfo, i, ib11d, ib11e, ib12d, ib12e, ib21d, ib21e, ib22d, ib22e, &
+           ibbcsd, iorbdb, iorglq, iorgqr, iphi, itaup1, itaup2, itauq1, j, lbbcsd, lorbdb, &
+           lorglq, lorglqmin, lorglqopt, lorgqr, lorgqrmin, lorgqropt, lworkmin, lworkopt, &
+                     r
+           logical(lk) :: lquery, wantu1, wantu2, wantv1t
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           complex(dp) :: cdum(1,1)
+           ! Intrinsic Function 
+           intrinsic :: int,max,min
+           ! Executable Statements 
+           ! test input arguments
+           info = 0
+           wantu1 = stdlib_lsame( jobu1, 'Y' )
+           wantu2 = stdlib_lsame( jobu2, 'Y' )
+           wantv1t = stdlib_lsame( jobv1t, 'Y' )
+           lquery = ( lwork==-1 ) .or. ( lrwork==-1 )
+           if( m < 0 ) then
+              info = -4
+           else if( p < 0 .or. p > m ) then
+              info = -5
+           else if( q < 0 .or. q > m ) then
+              info = -6
+           else if( ldx11 < max( 1, p ) ) then
+              info = -8
+           else if( ldx21 < max( 1, m-p ) ) then
+              info = -10
+           else if( wantu1 .and. ldu1 < max( 1, p ) ) then
+              info = -13
+           else if( wantu2 .and. ldu2 < max( 1, m - p ) ) then
+              info = -15
+           else if( wantv1t .and. ldv1t < max( 1, q ) ) then
+              info = -17
+           end if
+           r = min( p, m-p, q, m-q )
+           ! compute workspace
+             ! work layout:
+           ! |-----------------------------------------|
+           ! | lworkopt (1)                            |
+           ! |-----------------------------------------|
+           ! | taup1 (max(1,p))                        |
+           ! | taup2 (max(1,m-p))                      |
+           ! | tauq1 (max(1,q))                        |
+           ! |-----------------------------------------|
+           ! | stdlib_zunbdb work | stdlib_zungqr work | stdlib_zunglq work |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |             |             |             |
+           ! |-----------------------------------------|
+             ! rwork layout:
+           ! |------------------|
+           ! | lrworkopt (1)    |
+           ! |------------------|
+           ! | phi (max(1,r-1)) |
+           ! |------------------|
+           ! | b11d (r)         |
+           ! | b11e (r-1)       |
+           ! | b12d (r)         |
+           ! | b12e (r-1)       |
+           ! | b21d (r)         |
+           ! | b21e (r-1)       |
+           ! | b22d (r)         |
+           ! | b22e (r-1)       |
+           ! | stdlib_zbbcsd rwork     |
+           ! |------------------|
+           if( info == 0 ) then
+              iphi = 2
+              ib11d = iphi + max( 1, r-1 )
+              ib11e = ib11d + max( 1, r )
+              ib12d = ib11e + max( 1, r - 1 )
+              ib12e = ib12d + max( 1, r )
+              ib21d = ib12e + max( 1, r - 1 )
+              ib21e = ib21d + max( 1, r )
+              ib22d = ib21e + max( 1, r - 1 )
+              ib22e = ib22d + max( 1, r )
+              ibbcsd = ib22e + max( 1, r - 1 )
+              itaup1 = 2
+              itaup2 = itaup1 + max( 1, p )
+              itauq1 = itaup2 + max( 1, m-p )
+              iorbdb = itauq1 + max( 1, q )
+              iorgqr = itauq1 + max( 1, q )
+              iorglq = itauq1 + max( 1, q )
+              lorgqrmin = 1
+              lorgqropt = 1
+              lorglqmin = 1
+              lorglqopt = 1
+              if( r == q ) then
+                 call stdlib_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work, -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_zungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 endif
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_zungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_zunglq( q-1, q-1, q-1, v1t, ldv1t,cdum, work(1), -1, childinfo )
+                              
+                    lorglqmin = max( lorglqmin, q-1 )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_zbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,dum, u1, ldu1,&
+                  u2, ldu2, v1t, ldv1t, cdum, 1,dum, dum, dum, dum, dum, dum, dum, dum,rwork(1), -&
+                            1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == p ) then
+                 call stdlib_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_zungqr( p-1, p-1, p-1, u1(2,2), ldu1, cdum, work(1),-1, childinfo &
+                              )
+                    lorgqrmin = max( lorgqrmin, p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_zungqr( m-p, m-p, q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_zunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_zbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,dum, v1t, &
+                 ldv1t, cdum, 1, u1, ldu1, u2, ldu2,dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else if( r == m-p ) then
+                 call stdlib_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, work(1), -1, childinfo )
+                 lorbdb = int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_zungqr( p, p, q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_zungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2, cdum,work(1), -1, &
+                              childinfo )
+                    lorgqrmin = max( lorgqrmin, m-p-1 )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_zunglq( q, q, r, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_zbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, dum, cdum,&
+                  1, v1t, ldv1t, u2, ldu2, u1,ldu1, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                            1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              else
+                 call stdlib_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta, dum,cdum, cdum, &
+                           cdum, cdum, work(1), -1, childinfo)
+                 lorbdb = m + int( work(1),KIND=ilp)
+                 if( wantu1 .and. p > 0 ) then
+                    call stdlib_zungqr( p, p, m-q, u1, ldu1, cdum, work(1), -1,childinfo )
+                    lorgqrmin = max( lorgqrmin, p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantu2 .and. m-p > 0 ) then
+                    call stdlib_zungqr( m-p, m-p, m-q, u2, ldu2, cdum, work(1), -1,childinfo )
+                              
+                    lorgqrmin = max( lorgqrmin, m-p )
+                    lorgqropt = max( lorgqropt, int( work(1),KIND=ilp) )
+                 end if
+                 if( wantv1t .and. q > 0 ) then
+                    call stdlib_zunglq( q, q, q, v1t, ldv1t, cdum, work(1), -1,childinfo )
+                    lorglqmin = max( lorglqmin, q )
+                    lorglqopt = max( lorglqopt, int( work(1),KIND=ilp) )
+                 end if
+                 call stdlib_zbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, dum, u2, &
+                 ldu2, u1, ldu1, cdum, 1, v1t,ldv1t, dum, dum, dum, dum, dum, dum, dum, dum,rwork(&
+                           1), -1, childinfo )
+                 lbbcsd = int( rwork(1),KIND=ilp)
+              end if
+              lrworkmin = ibbcsd+lbbcsd-1
+              lrworkopt = lrworkmin
+              rwork(1) = lrworkopt
+              lworkmin = max( iorbdb+lorbdb-1,iorgqr+lorgqrmin-1,iorglq+lorglqmin-1 )
+              lworkopt = max( iorbdb+lorbdb-1,iorgqr+lorgqropt-1,iorglq+lorglqopt-1 )
+              work(1) = lworkopt
+              if( lwork < lworkmin .and. .not.lquery ) then
+                 info = -19
+              end if
+              if( lrwork < lrworkmin .and. .not.lquery ) then
+                 info = -21
+              end if
+           end if
+           if( info /= 0 ) then
+              call stdlib_xerbla( 'ZUNCSD2BY1', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           lorgqr = lwork-iorgqr+1
+           lorglq = lwork-iorglq+1
+           ! handle four cases separately: r = q, r = p, r = m-p, and r = m-q,
+           ! in which r = min(p,m-p,q,m-q)
+           if( r == q ) then
+              ! case 1: r = q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_zunbdb1( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 v1t(1,1) = cone
+                 do j = 2, q
+                    v1t(1,j) = czero
+                    v1t(j,1) = czero
+                 end do
+                 call stdlib_zlacpy( 'U', q-1, q-1, x21(1,2), ldx21, v1t(2,2),ldv1t )
+                 call stdlib_zunglq( q-1, q-1, q-1, v1t(2,2), ldv1t, work(itauq1),work(iorglq), &
+                           lorglq, childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_zbbcsd( jobu1, jobu2, jobv1t, 'N', 'N', m, p, q, theta,rwork(iphi), u1, &
+              ldu1, u2, ldu2, v1t, ldv1t, cdum,1, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+              ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd),lrwork-&
+                        ibbcsd+1, childinfo )
+              ! permute rows and columns to place czero submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == p ) then
+              ! case 2: r = p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_zunbdb2( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 u1(1,1) = cone
+                 do j = 2, p
+                    u1(1,j) = czero
+                    u1(j,1) = czero
+                 end do
+                 call stdlib_zlacpy( 'L', p-1, p-1, x11(2,1), ldx11, u1(2,2), ldu1 )
+                 call stdlib_zungqr( p-1, p-1, p-1, u1(2,2), ldu1, work(itaup1),work(iorgqr), &
+                           lorgqr, childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_zlacpy( 'L', m-p, q, x21, ldx21, u2, ldu2 )
+                 call stdlib_zungqr( m-p, m-p, q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_zlacpy( 'U', p, q, x11, ldx11, v1t, ldv1t )
+                 call stdlib_zunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_zbbcsd( jobv1t, 'N', jobu1, jobu2, 'T', m, q, p, theta,rwork(iphi), v1t,&
+               ldv1t, cdum, 1, u1, ldu1, u2,ldu2, rwork(ib11d), rwork(ib11e), rwork(ib12d),rwork(&
+               ib12e), rwork(ib21d), rwork(ib21e),rwork(ib22d), rwork(ib22e), rwork(ibbcsd), &
+                         lbbcsd,childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > 0 .and. wantu2 ) then
+                 do i = 1, q
+                    iwork(i) = m - p - q + i
+                 end do
+                 do i = q + 1, m - p
+                    iwork(i) = i - q
+                 end do
+                 call stdlib_zlapmt( .false., m-p, m-p, u2, ldu2, iwork )
+              end if
+           else if( r == m-p ) then
+              ! case 3: r = m-p
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_zunbdb3( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+                        itaup1), work(itaup2),work(itauq1), work(iorbdb), lorbdb, childinfo )
+              ! accumulate householder reflectors
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_zlacpy( 'L', p, q, x11, ldx11, u1, ldu1 )
+                 call stdlib_zungqr( p, p, q, u1, ldu1, work(itaup1), work(iorgqr),lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 u2(1,1) = cone
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                    u2(j,1) = czero
+                 end do
+                 call stdlib_zlacpy( 'L', m-p-1, m-p-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_zungqr( m-p-1, m-p-1, m-p-1, u2(2,2), ldu2,work(itaup2), work(iorgqr)&
+                           , lorgqr, childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_zlacpy( 'U', m-p, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_zunglq( q, q, r, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_zbbcsd( 'N', jobv1t, jobu2, jobu1, 'T', m, m-q, m-p,theta, rwork(iphi), &
+              cdum, 1, v1t, ldv1t, u2, ldu2,u1, ldu1, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( q > r ) then
+                 do i = 1, r
+                    iwork(i) = q - r + i
+                 end do
+                 do i = r + 1, q
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_zlapmt( .false., p, q, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_zlapmr( .false., q, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           else
+              ! case 4: r = m-q
+              ! simultaneously bidiagonalize x11 and x21
+              call stdlib_zunbdb4( m, p, q, x11, ldx11, x21, ldx21, theta,rwork(iphi), work(&
+              itaup1), work(itaup2),work(itauq1), work(iorbdb), work(iorbdb+m),lorbdb-m, &
+                        childinfo )
+              ! accumulate householder reflectors
+              if( wantu2 .and. m-p > 0 ) then
+                 call stdlib_zcopy( m-p, work(iorbdb+p), 1, u2, 1 )
+              end if
+              if( wantu1 .and. p > 0 ) then
+                 call stdlib_zcopy( p, work(iorbdb), 1, u1, 1 )
+                 do j = 2, p
+                    u1(1,j) = czero
+                 end do
+                 call stdlib_zlacpy( 'L', p-1, m-q-1, x11(2,1), ldx11, u1(2,2),ldu1 )
+                 call stdlib_zungqr( p, p, m-q, u1, ldu1, work(itaup1),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantu2 .and. m-p > 0 ) then
+                 do j = 2, m-p
+                    u2(1,j) = czero
+                 end do
+                 call stdlib_zlacpy( 'L', m-p-1, m-q-1, x21(2,1), ldx21, u2(2,2),ldu2 )
+                 call stdlib_zungqr( m-p, m-p, m-q, u2, ldu2, work(itaup2),work(iorgqr), lorgqr, &
+                           childinfo )
+              end if
+              if( wantv1t .and. q > 0 ) then
+                 call stdlib_zlacpy( 'U', m-q, q, x21, ldx21, v1t, ldv1t )
+                 call stdlib_zlacpy( 'U', p-(m-q), q-(m-q), x11(m-q+1,m-q+1), ldx11,v1t(m-q+1,m-q+&
+                           1), ldv1t )
+                 call stdlib_zlacpy( 'U', -p+q, q-p, x21(m-q+1,p+1), ldx21,v1t(p+1,p+1), ldv1t )
+                           
+                 call stdlib_zunglq( q, q, q, v1t, ldv1t, work(itauq1),work(iorglq), lorglq, &
+                           childinfo )
+              end if
+              ! simultaneously diagonalize x11 and x21.
+              call stdlib_zbbcsd( jobu2, jobu1, 'N', jobv1t, 'N', m, m-p, m-q,theta, rwork(iphi), &
+              u2, ldu2, u1, ldu1, cdum, 1,v1t, ldv1t, rwork(ib11d), rwork(ib11e),rwork(ib12d), &
+              rwork(ib12e), rwork(ib21d),rwork(ib21e), rwork(ib22d), rwork(ib22e),rwork(ibbcsd), &
+                        lbbcsd, childinfo )
+              ! permute rows and columns to place identity submatrices in
+              ! preferred positions
+              if( p > r ) then
+                 do i = 1, r
+                    iwork(i) = p - r + i
+                 end do
+                 do i = r + 1, p
+                    iwork(i) = i - r
+                 end do
+                 if( wantu1 ) then
+                    call stdlib_zlapmt( .false., p, p, u1, ldu1, iwork )
+                 end if
+                 if( wantv1t ) then
+                    call stdlib_zlapmr( .false., p, q, v1t, ldv1t, iwork )
+                 end if
+              end if
+           end if
+           return
+     end subroutine stdlib_zuncsd2by1
+
+     !> ZUNGBR: generates one of the complex unitary matrices Q or P**H
+     !> determined by ZGEBRD when reducing a complex matrix A to bidiagonal
+     !> form: A = Q * B * P**H.  Q and P**H are defined as products of
+     !> elementary reflectors H(i) or G(i) respectively.
+     !> If VECT = 'Q', A is assumed to have been an M-by-K matrix, and Q
+     !> is of order M:
+     !> if m >= k, Q = H(1) H(2) . . . H(k) and ZUNGBR returns the first n
+     !> columns of Q, where m >= n >= k;
+     !> if m < k, Q = H(1) H(2) . . . H(m-1) and ZUNGBR returns Q as an
+     !> M-by-M matrix.
+     !> If VECT = 'P', A is assumed to have been a K-by-N matrix, and P**H
+     !> is of order N:
+     !> if k < n, P**H = G(k) . . . G(2) G(1) and ZUNGBR returns the first m
+     !> rows of P**H, where n >= m >= k;
+     !> if k >= n, P**H = G(n-1) . . . G(2) G(1) and ZUNGBR returns P**H as
+     !> an N-by-N matrix.
+
+     pure subroutine stdlib_zungbr( vect, m, n, k, a, lda, tau, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantq
+           integer(ilp) :: i, iinfo, j, lwkopt, mn
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantq = stdlib_lsame( vect, 'Q' )
+           mn = min( m, n )
+           lquery = ( lwork==-1 )
+           if( .not.wantq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 .or. ( wantq .and. ( n>m .or. n<min( m,k ) ) ) .or. ( .not.wantq .and. ( &
+                     m>n .or. m<min( n, k ) ) ) ) then
+              info = -3
+           else if( k<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( lwork<max( 1, mn ) .and. .not.lquery ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = 1
+              if( wantq ) then
+                 if( m>=k ) then
+                    call stdlib_zungqr( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( m>1 ) then
+                       call stdlib_zungqr( m-1, m-1, m-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              else
+                 if( k<n ) then
+                    call stdlib_zunglq( m, n, k, a, lda, tau, work, -1, iinfo )
+                 else
+                    if( n>1 ) then
+                       call stdlib_zunglq( n-1, n-1, n-1, a, lda, tau, work, -1,iinfo )
+                    end if
+                 end if
+              end if
+              lwkopt = real( work( 1 ),KIND=dp)
+              lwkopt = max (lwkopt, mn)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGBR', -info )
+              return
+           else if( lquery ) then
+              work( 1 ) = lwkopt
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              work( 1 ) = 1
+              return
+           end if
+           if( wantq ) then
+              ! form q, determined by a call to stdlib_zgebrd to reduce an m-by-k
+              ! matrix
+              if( m>=k ) then
+                 ! if m >= k, assume m >= n >= k
+                 call stdlib_zungqr( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if m < k, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! column to the right, and set the first row and column of q
+                 ! to those of the unit matrix
+                 do j = m, 2, -1
+                    a( 1, j ) = czero
+                    do i = j + 1, m
+                       a( i, j ) = a( i, j-1 )
+                    end do
+                 end do
+                 a( 1, 1 ) = cone
+                 do i = 2, m
+                    a( i, 1 ) = czero
+                 end do
+                 if( m>1 ) then
+                    ! form q(2:m,2:m)
+                    call stdlib_zungqr( m-1, m-1, m-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           else
+              ! form p**h, determined by a call to stdlib_zgebrd to reduce a k-by-n
+              ! matrix
+              if( k<n ) then
+                 ! if k < n, assume k <= m <= n
+                 call stdlib_zunglq( m, n, k, a, lda, tau, work, lwork, iinfo )
+              else
+                 ! if k >= n, assume m = n
+                 ! shift the vectors which define the elementary reflectors cone
+                 ! row downward, and set the first row and column of p**h to
+                 ! those of the unit matrix
+                 a( 1, 1 ) = cone
+                 do i = 2, n
+                    a( i, 1 ) = czero
+                 end do
+                 do j = 2, n
+                    do i = j - 1, 2, -1
+                       a( i, j ) = a( i-1, j )
+                    end do
+                    a( 1, j ) = czero
+                 end do
+                 if( n>1 ) then
+                    ! form p**h(2:n,2:n)
+                    call stdlib_zunglq( n-1, n-1, n-1, a( 2, 2 ), lda, tau, work,lwork, iinfo )
+                              
+                 end if
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zungbr
+
+     !> ZUNGTSQR: generates an M-by-N complex matrix Q_out with orthonormal
+     !> columns, which are the first N columns of a product of comlpex unitary
+     !> matrices of order M which are returned by ZLATSQR
+     !> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+     !> See the documentation for ZLATSQR.
+
+     pure subroutine stdlib_zungtsqr( m, n, mb, nb, a, lda, t, ldt, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, mb, nb
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: t(ldt,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iinfo, ldc, lworkopt, lc, lw, nblocal, j
+           ! Intrinsic Functions 
+           intrinsic :: cmplx,max,min
+           ! Executable Statements 
+           ! test the input parameters
+           lquery  = lwork==-1
+           info = 0
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb<=n ) then
+              info = -3
+           else if( nb<1 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldt<max( 1, min( nb, n ) ) ) then
+              info = -8
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array c(ldc, n) and work(lwork)
+              ! in the call to stdlib_zlamtsqr. see the documentation for stdlib_zlamtsqr.
+              if( lwork<2 .and. (.not.lquery) ) then
+                 info = -10
+              else
+                 ! set block size for column blocks
+                 nblocal = min( nb, n )
+                 ! lwork = -1, then set the size for the array c(ldc,n)
+                 ! in stdlib_zlamtsqr call and set the optimal size of the work array
+                 ! work(lwork) in stdlib_zlamtsqr call.
+                 ldc = m
+                 lc = ldc*n
+                 lw = n * nblocal
+                 lworkopt = lc+lw
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -10
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNGTSQR', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           ! (1) form explicitly the tall-skinny m-by-n left submatrix q1_in
+           ! of m-by-m orthogonal matrix q_in, which is implicitly stored in
+           ! the subdiagonal part of input array a and in the input array t.
+           ! perform by the following operation using the routine stdlib_zlamtsqr.
+               ! q1_in = q_in * ( i ), where i is a n-by-n identity matrix,
+                              ! ( 0 )        0 is a (m-n)-by-n zero matrix.
+           ! (1a) form m-by-n matrix in the array work(1:ldc*n) with ones
+           ! on the diagonal and zeros elsewhere.
+           call stdlib_zlaset( 'F', m, n, czero, cone, work, ldc )
+           ! (1b)  on input, work(1:ldc*n) stores ( i );
+                                                ! ( 0 )
+                 ! on output, work(1:ldc*n) stores q1_in.
+           call stdlib_zlamtsqr( 'L', 'N', m, n, n, mb, nblocal, a, lda, t, ldt,work, ldc, work( &
+                     lc+1 ), lw, iinfo )
+           ! (2) copy the result from the part of the work array (1:m,1:n)
+           ! with the leading dimension ldc that starts at work(1) into
+           ! the output array a(1:m,1:n) column-by-column.
+           do j = 1, n
+              call stdlib_zcopy( m, work( (j-1)*ldc + 1 ), 1, a( 1, j ), 1 )
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_zungtsqr
+
+     !> If VECT = 'Q', ZUNMBR: overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> If VECT = 'P', ZUNMBR overwrites the general complex M-by-N matrix C
+     !> with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      P * C          C * P
+     !> TRANS = 'C':      P**H * C       C * P**H
+     !> Here Q and P**H are the unitary matrices determined by ZGEBRD when
+     !> reducing a complex matrix A to bidiagonal form: A = Q * B * P**H. Q
+     !> and P**H are defined as products of elementary reflectors H(i) and
+     !> G(i) respectively.
+     !> Let nq = m if SIDE = 'L' and nq = n if SIDE = 'R'. Thus nq is the
+     !> order of the unitary matrix Q or P**H that is applied.
+     !> If VECT = 'Q', A is assumed to have been an NQ-by-K matrix:
+     !> if nq >= k, Q = H(1) H(2) . . . H(k);
+     !> if nq < k, Q = H(1) H(2) . . . H(nq-1).
+     !> If VECT = 'P', A is assumed to have been a K-by-NQ matrix:
+     !> if k < nq, P = G(1) G(2) . . . G(k);
+     !> if k >= nq, P = G(1) G(2) . . . G(nq-1).
+
+     pure subroutine stdlib_zunmbr( vect, side, trans, m, n, k, a, lda, tau, c,ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans, vect
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: k, lda, ldc, lwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), c(ldc,*)
+           complex(dp), intent(in) :: tau(*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: applyq, left, lquery, notran
+           character :: transt
+           integer(ilp) :: i1, i2, iinfo, lwkopt, mi, nb, ni, nq, nw
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           applyq = stdlib_lsame( vect, 'Q' )
+           left = stdlib_lsame( side, 'L' )
+           notran = stdlib_lsame( trans, 'N' )
+           lquery = ( lwork==-1 )
+           ! nq is the order of q or p and nw is the minimum dimension of work
+           if( left ) then
+              nq = m
+              nw = max( 1, n )
+           else
+              nq = n
+              nw = max( 1, m )
+           end if
+           if( .not.applyq .and. .not.stdlib_lsame( vect, 'P' ) ) then
+              info = -1
+           else if( .not.left .and. .not.stdlib_lsame( side, 'R' ) ) then
+              info = -2
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'C' ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( k<0 ) then
+              info = -6
+           else if( ( applyq .and. lda<max( 1, nq ) ) .or.( .not.applyq .and. lda<max( 1, min( nq,&
+                      k ) ) ) )then
+              info = -8
+           else if( ldc<max( 1, m ) ) then
+              info = -11
+           else if( lwork<nw .and. .not.lquery ) then
+              info = -13
+           end if
+           if( info==0 ) then
+              if( m>0 .and. n>0 ) then
+                 if( applyq ) then
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'ZUNMQR', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 else
+                    if( left ) then
+                       nb = stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m-1, n, m-1,-1 )
+                    else
+                       nb = stdlib_ilaenv( 1, 'ZUNMLQ', side // trans, m, n-1, n-1,-1 )
+                    end if
+                 end if
+                 lwkopt = nw*nb
+              else
+                 lwkopt = 1
+              end if
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZUNMBR', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 )return
+           if( applyq ) then
+              ! apply q
+              if( nq>=k ) then
+                 ! q was determined by a call to stdlib_zgebrd with nq >= k
+                 call stdlib_zunmqr( side, trans, m, n, k, a, lda, tau, c, ldc,work, lwork, iinfo &
+                           )
+              else if( nq>1 ) then
+                 ! q was determined by a call to stdlib_zgebrd with nq < k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_zunmqr( side, trans, mi, ni, nq-1, a( 2, 1 ), lda, tau,c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           else
+              ! apply p
+              if( notran ) then
+                 transt = 'C'
+              else
+                 transt = 'N'
+              end if
+              if( nq>k ) then
+                 ! p was determined by a call to stdlib_zgebrd with nq > k
+                 call stdlib_zunmlq( side, transt, m, n, k, a, lda, tau, c, ldc,work, lwork, &
+                           iinfo )
+              else if( nq>1 ) then
+                 ! p was determined by a call to stdlib_zgebrd with nq <= k
+                 if( left ) then
+                    mi = m - 1
+                    ni = n
+                    i1 = 2
+                    i2 = 1
+                 else
+                    mi = m
+                    ni = n - 1
+                    i1 = 1
+                    i2 = 2
+                 end if
+                 call stdlib_zunmlq( side, transt, mi, ni, nq-1, a( 1, 2 ), lda,tau, c( i1, i2 ), &
+                           ldc, work, lwork, iinfo )
+              end if
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zunmbr
+
+     !> ZCGESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> ZCGESV first attempts to factorize the matrix in COMPLEX and use this
+     !> factorization within an iterative refinement procedure to produce a
+     !> solution with COMPLEX*16 normwise backward error quality (see below).
+     !> If the approach fails the method switches to a COMPLEX*16
+     !> factorization and solve.
+     !> The iterative refinement is not going to be a winning strategy if
+     !> the ratio COMPLEX performance over COMPLEX*16 performance is too
+     !> small. A reasonable strategy should take the number of right-hand
+     !> sides and the size of the matrix into account. This might be done
+     !> with a call to ILAENV in the future. Up to now, we always try
+     !> iterative refinement.
+     !> The iterative refinement process is stopped if
+     !> ITER > ITERMAX
+     !> or for all the RHS we have:
+     !> RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX
+     !> where
+     !> o ITER is the number of the current iteration in the iterative
+     !> refinement process
+     !> o RNRM is the infinity-norm of the residual
+     !> o XNRM is the infinity-norm of the solution
+     !> o ANRM is the infinity-operator-norm of the matrix A
+     !> o EPS is the machine epsilon returned by DLAMCH('Epsilon')
+     !> The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00
+     !> respectively.
+
+     subroutine stdlib_zcgesv( n, nrhs, a, lda, ipiv, b, ldb, x, ldx, work,swork, rwork, iter, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, iter
+           integer(ilp), intent(in) :: lda, ldb, ldx, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(sp), intent(out) :: swork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(in) :: b(ldb,*)
+           complex(dp), intent(out) :: work(n,*), x(ldx,*)
+        ! =====================================================================
+           ! Parameters 
+           logical(lk), parameter :: doitref = .true.
+           integer(ilp), parameter :: itermax = 30
+           real(dp), parameter :: bwdmax = 1.0e+00_dp
+           
+           
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: i, iiter, ptsa, ptsx
+           real(dp) :: anrm, cte, eps, rnrm, xnrm
+           complex(dp) :: zdum
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,max,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( zdum ) = abs( real( zdum,KIND=dp) ) + abs( aimag( zdum ) )
+           ! Executable Statements 
+           info = 0
+           iter = 0
+           ! test the input parameters.
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldx<max( 1, n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZCGESV', -info )
+              return
+           end if
+           ! quick return if (n==0).
+           if( n==0 )return
+           ! skip single precision iterative refinement if a priori slower
+           ! than double precision factorization.
+           if( .not.doitref ) then
+              iter = -1
+              go to 40
+           end if
+           ! compute some constants.
+           anrm = stdlib_zlange( 'I', n, n, a, lda, rwork )
+           eps = stdlib_dlamch( 'EPSILON' )
+           cte = anrm*eps*sqrt( real( n,KIND=dp) )*bwdmax
+           ! set the indices ptsa, ptsx for referencing sa and sx in swork.
+           ptsa = 1
+           ptsx = ptsa + n*n
+           ! convert b from double precision to single precision and store the
+           ! result in sx.
+           call stdlib_zlag2c( n, nrhs, b, ldb, swork( ptsx ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! convert a from double precision to single precision and store the
+           ! result in sa.
+           call stdlib_zlag2c( n, n, a, lda, swork( ptsa ), n, info )
+           if( info/=0 ) then
+              iter = -2
+              go to 40
+           end if
+           ! compute the lu factorization of sa.
+           call stdlib_cgetrf( n, n, swork( ptsa ), n, ipiv, info )
+           if( info/=0 ) then
+              iter = -3
+              go to 40
+           end if
+           ! solve the system sa*sx = sb.
+           call stdlib_cgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), n, &
+                     info )
+           ! convert sx back to double precision
+           call stdlib_clag2z( n, nrhs, swork( ptsx ), n, x, ldx, info )
+           ! compute r = b - ax (r is work).
+           call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+           call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, cnegone, a,lda, x, ldx, &
+                     cone, work, n )
+           ! check whether the nrhs normwise backward errors satisfy the
+           ! stopping criterion. if yes, set iter=0 and return.
+           do i = 1, nrhs
+              xnrm = cabs1( x( stdlib_izamax( n, x( 1, i ), 1 ), i ) )
+              rnrm = cabs1( work( stdlib_izamax( n, work( 1, i ), 1 ), i ) )
+              if( rnrm>xnrm*cte )go to 10
+           end do
+           ! if we are here, the nrhs normwise backward errors satisfy the
+           ! stopping criterion. we are good to exit.
+           iter = 0
+           return
+           10 continue
+           loop_30: do iiter = 1, itermax
+              ! convert r (in work) from double precision to single precision
+              ! and store the result in sx.
+              call stdlib_zlag2c( n, nrhs, work, n, swork( ptsx ), n, info )
+              if( info/=0 ) then
+                 iter = -2
+                 go to 40
+              end if
+              ! solve the system sa*sx = sr.
+              call stdlib_cgetrs( 'NO TRANSPOSE', n, nrhs, swork( ptsa ), n, ipiv,swork( ptsx ), &
+                        n, info )
+              ! convert sx back to double precision and update the current
+              ! iterate.
+              call stdlib_clag2z( n, nrhs, swork( ptsx ), n, work, n, info )
+              do i = 1, nrhs
+                 call stdlib_zaxpy( n, cone, work( 1, i ), 1, x( 1, i ), 1 )
+              end do
+              ! compute r = b - ax (r is work).
+              call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, work, n )
+              call stdlib_zgemm( 'NO TRANSPOSE', 'NO TRANSPOSE', n, nrhs, n, cnegone,a, lda, x, &
+                        ldx, cone, work, n )
+              ! check whether the nrhs normwise backward errors satisfy the
+              ! stopping criterion. if yes, set iter=iiter>0 and return.
+              do i = 1, nrhs
+                 xnrm = cabs1( x( stdlib_izamax( n, x( 1, i ), 1 ), i ) )
+                 rnrm = cabs1( work( stdlib_izamax( n, work( 1, i ), 1 ), i ) )
+                 if( rnrm>xnrm*cte )go to 20
+              end do
+              ! if we are here, the nrhs normwise backward errors satisfy the
+              ! stopping criterion, we are good to exit.
+              iter = iiter
+              return
+              20 continue
+           end do loop_30
+           ! if we are at this place of the code, this is because we have
+           ! performed iter=itermax iterations and never satisfied the stopping
+           ! criterion, set up the iter flag accordingly and follow up on double
+           ! precision routine.
+           iter = -itermax - 1
+           40 continue
+           ! single-precision iterative refinement failed to converge to a
+           ! satisfactory solution, so we resort to double precision.
+           call stdlib_zgetrf( n, n, a, lda, ipiv, info )
+           if( info/=0 )return
+           call stdlib_zlacpy( 'ALL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, x, ldx,info )
+           return
+     end subroutine stdlib_zcgesv
+
+     !> ZGELQ: computes an LQ factorization of a complex M-by-N matrix A:
+     !> A = ( L 0 ) *  Q
+     !> where:
+     !> Q is a N-by-N orthogonal matrix;
+     !> L is a lower-triangular M-by-M matrix;
+     !> 0 is a M-by-(N-M) zero matrix, if M < N.
+
+     pure subroutine stdlib_zgelq( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks, lwmin, lwopt, lwreq
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'ZGELQ ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'ZGELQ ', ' ', m, n, 2, -1 )
+           else
+             mb = 1
+             nb = n
+           end if
+           if( mb>min( m, n ) .or. mb<1 ) mb = 1
+           if( nb>n .or. nb<=m ) nb = n
+           mintsz = m + 5
+           if ( nb>m .and. n>m ) then
+             if( mod( n - m, nb - m )==0 ) then
+               nblcks = ( n - m ) / ( nb - m )
+             else
+               nblcks = ( n - m ) / ( nb - m ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwmin = max( 1, n )
+              lwopt = max( 1, mb*n )
+           else
+              lwmin = max( 1, m )
+              lwopt = max( 1, mb*m )
+           end if
+           lminws = .false.
+           if( ( tsize<max( 1, mb*m*nblcks + 5 ) .or. lwork<lwopt ).and. ( lwork>=lwmin ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, mb*m*nblcks + 5 ) ) then
+                 lminws = .true.
+                 mb = 1
+                 nb = n
+             end if
+             if( lwork<lwopt ) then
+                 lminws = .true.
+                 mb = 1
+             end if
+           end if
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+              lwreq = max( 1, mb*n )
+           else
+              lwreq = max( 1, mb*m )
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, mb*m*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<lwreq ) .and.( .not.lquery ).and. ( .not.lminws ) ) then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = mb*m*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = lwmin
+             else
+               work( 1 ) = lwreq
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGELQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the lq decomposition
+           if( ( n<=m ) .or. ( nb<=m ) .or. ( nb>=n ) ) then
+             call stdlib_zgelqt( m, n, mb, a, lda, t( 6 ), mb, work, info )
+           else
+             call stdlib_zlaswlq( m, n, mb, nb, a, lda, t( 6 ), mb, work,lwork, info )
+           end if
+           work( 1 ) = lwreq
+           return
+     end subroutine stdlib_zgelq
+
+     !> ZGELSD: computes the minimum-norm solution to a real linear least
+     !> squares problem:
+     !> minimize 2-norm(| b - A*x |)
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The problem is solved in three steps:
+     !> (1) Reduce the coefficient matrix A to bidiagonal form with
+     !> Householder transformations, reducing the original problem
+     !> into a "bidiagonal least squares problem" (BLS)
+     !> (2) Solve the BLS using a divide and conquer approach.
+     !> (3) Apply back all the Householder transformations to solve
+     !> the original least squares problem.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zgelsd( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               iwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), s(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: iascl, ibscl, ie, il, itau, itaup, itauq, ldwork, liwork, lrwork, &
+                     maxmn, maxwrk, minmn, minwrk, mm, mnthr, nlvl, nrwork, nwork, smlsiz
+           real(dp) :: anrm, bignum, bnrm, eps, sfmin, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: int,log,max,min,real
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace.
+           ! (note: comments in the code beginning "workspace:" describe the
+           ! minimal amount of workspace needed at that point in the code,
+           ! as well as the preferred amount for good performance.
+           ! nb refers to the optimal block size for the immediately
+           ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              liwork = 1
+              lrwork = 1
+              if( minmn>0 ) then
+                 smlsiz = stdlib_ilaenv( 9, 'ZGELSD', ' ', 0, 0, 0, 0 )
+                 mnthr = stdlib_ilaenv( 6, 'ZGELSD', ' ', m, n, nrhs, -1 )
+                 nlvl = max( int( log( real( minmn,KIND=dp) / real( smlsiz + 1,KIND=dp) ) /log( &
+                           two ),KIND=ilp) + 1, 0 )
+                 liwork = 3*minmn*nlvl + 11*minmn
+                 mm = m
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns.
+                    mm = n
+                    maxwrk = max( maxwrk, n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n,-1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, nrhs*stdlib_ilaenv( 1, 'ZUNMQR', 'LC', m,nrhs, n, -1 ) )
+                              
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined.
+                    lrwork = 10*n + 2*n*smlsiz + 8*n*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    maxwrk = max( maxwrk, 2*n + ( mm + n )*stdlib_ilaenv( 1,'ZGEBRD', ' ', mm, n, &
+                              -1, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + nrhs*stdlib_ilaenv( 1, 'ZUNMBR','QLC', mm, nrhs, &
+                              n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + ( n - 1 )*stdlib_ilaenv( 1,'ZUNMBR', 'PLN', n, &
+                              nrhs, n, -1 ) )
+                    maxwrk = max( maxwrk, 2*n + n*nrhs )
+                    minwrk = max( 2*n + mm, 2*n + n*nrhs )
+                 end if
+                 if( n>m ) then
+                    lrwork = 10*m + 2*m*smlsiz + 8*m*nlvl + 3*smlsiz*nrhs +max( (smlsiz+1)**2, n*(&
+                              1+nrhs) + 2*nrhs )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                                 ! than rows.
+                       maxwrk = m + m*stdlib_ilaenv( 1, 'ZGELQF', ' ', m, n, -1,-1 )
+                       maxwrk = max( maxwrk, m*m + 4*m + 2*m*stdlib_ilaenv( 1,'ZGEBRD', ' ', m, m,&
+                                  -1, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + nrhs*stdlib_ilaenv( 1,'ZUNMBR', 'QLC', m,&
+                                  nrhs, m, -1 ) )
+                       maxwrk = max( maxwrk, m*m + 4*m + ( m - 1 )*stdlib_ilaenv( 1,'ZUNMLQ', &
+                                 'LC', n, nrhs, m, -1 ) )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m*m + 4*m + m*nrhs )
+           ! xxx: ensure the path 2a case below is triggered.  the workspace
+           ! calculation should use queries for all routines eventually.
+                       maxwrk = max( maxwrk,4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) )
+                    else
+                       ! path 2 - underdetermined.
+                       maxwrk = 2*m + ( n + m )*stdlib_ilaenv( 1, 'ZGEBRD', ' ', m,n, -1, -1 )
+                                 
+                       maxwrk = max( maxwrk, 2*m + nrhs*stdlib_ilaenv( 1, 'ZUNMBR','QLC', m, nrhs,&
+                                  m, -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*stdlib_ilaenv( 1, 'ZUNMBR','PLN', n, nrhs, m,&
+                                  -1 ) )
+                       maxwrk = max( maxwrk, 2*m + m*nrhs )
+                    end if
+                    minwrk = max( 2*m + n, 2*m + m*nrhs )
+                 end if
+              end if
+              minwrk = min( minwrk, maxwrk )
+              work( 1 ) = maxwrk
+              iwork( 1 ) = liwork
+              rwork( 1 ) = lrwork
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible.
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters.
+           eps = stdlib_dlamch( 'P' )
+           sfmin = stdlib_dlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a if max entry outside range [smlnum,bignum].
+           anrm = stdlib_zlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_zlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_dlaset( 'F', minmn, 1, zero, zero, s, 1 )
+              rank = 0
+              go to 10
+           end if
+           ! scale b if max entry outside range [smlnum,bignum].
+           bnrm = stdlib_zlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum.
+              call stdlib_zlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum.
+              call stdlib_zlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! if m < n make sure b(m+1:n,:) = 0
+           if( m<n )call stdlib_zlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+           ! overdetermined case.
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined.
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 nwork = itau + n
+                 ! compute a=q*r.
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need n, prefer n*nb)
+                 call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                           info )
+                 ! multiply b by transpose(q).
+                 ! (rworkspace: need n)
+                 ! (cworkspace: need nrhs, prefer nrhs*nb)
+                 call stdlib_zunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           nwork ), lwork-nwork+1, info )
+                 ! zero out below r.
+                 if( n>1 ) then
+                    call stdlib_zlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                 end if
+              end if
+              itauq = 1
+              itaup = itauq + n
+              nwork = itaup + n
+              ie = 1
+              nrwork = ie + n
+              ! bidiagonalize r in a.
+              ! (rworkspace: need n)
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              call stdlib_zgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r.
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              call stdlib_zunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_zlalsd( 'U', smlsiz, n, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of r.
+              call stdlib_zunmbr( 'P', 'L', 'N', n, nrhs, n, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           else if( n>=mnthr .and. lwork>=4*m+m*m+max( m, 2*m-4, nrhs, n-3*m ) ) then
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm.
+              ldwork = m
+              if( lwork>=max( 4*m+m*lda+max( m, 2*m-4, nrhs, n-3*m ),m*lda+m+m*nrhs ) )ldwork = &
+                        lda
+              itau = 1
+              nwork = m + 1
+              ! compute a=l*q.
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              call stdlib_zgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, info )
+                        
+              il = nwork
+              ! copy l to work(il), zeroing out above its diagonal.
+              call stdlib_zlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_zlaset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize l in work(il).
+              ! (rworkspace: need m)
+              ! (cworkspace: need m*m+4*m, prefer m*m+4*m+2*m*nb)
+              call stdlib_zgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( nwork ),lwork-nwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l.
+              ! (cworkspace: need m*m+4*m+nrhs, prefer m*m+4*m+nrhs*nb)
+              call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_zlalsd( 'U', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of l.
+              call stdlib_zunmbr( 'P', 'L', 'N', m, nrhs, m, work( il ), ldwork,work( itaup ), b, &
+                        ldb, work( nwork ),lwork-nwork+1, info )
+              ! zero out below first m rows of b.
+              call stdlib_zlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              nwork = itau + m
+              ! multiply transpose(q) by b.
+              ! (cworkspace: need nrhs, prefer nrhs*nb)
+              call stdlib_zunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( nwork )&
+                        , lwork-nwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases.
+              itauq = 1
+              itaup = itauq + m
+              nwork = itaup + m
+              ie = 1
+              nrwork = ie + m
+              ! bidiagonalize a.
+              ! (rworkspace: need m)
+              ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+              call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         nwork ), lwork-nwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors.
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+              ! solve the bidiagonal least squares problem.
+              call stdlib_zlalsd( 'L', smlsiz, m, nrhs, s, rwork( ie ), b, ldb,rcond, rank, work( &
+                        nwork ), rwork( nrwork ),iwork, info )
+              if( info/=0 ) then
+                 go to 10
+              end if
+              ! multiply b by right bidiagonalizing vectors of a.
+              call stdlib_zunmbr( 'P', 'L', 'N', n, nrhs, m, a, lda, work( itaup ),b, ldb, work( &
+                        nwork ), lwork-nwork+1, info )
+           end if
+           ! undo scaling.
+           if( iascl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           10 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwork
+           rwork( 1 ) = lrwork
+           return
+     end subroutine stdlib_zgelsd
+
+     !> ZGELSS: computes the minimum norm solution to a complex linear
+     !> least squares problem:
+     !> Minimize 2-norm(| b - A*x |).
+     !> using the singular value decomposition (SVD) of A. A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution matrix
+     !> X.
+     !> The effective rank of A is determined by treating as zero those
+     !> singular values which are less than RCOND times the largest singular
+     !> value.
+
+     subroutine stdlib_zgelss( m, n, nrhs, a, lda, b, ldb, s, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), s(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: bl, chunk, i, iascl, ibscl, ie, il, irwork, itau, itaup, itauq, iwork, &
+                     ldwork, maxmn, maxwrk, minmn, minwrk, mm, mnthr
+           integer(ilp) :: lwork_zgeqrf, lwork_zunmqr, lwork_zgebrd, lwork_zunmbr, lwork_zungbr, &
+                     lwork_zunmlq, lwork_zgelqf
+           real(dp) :: anrm, bignum, bnrm, eps, sfmin, smlnum, thr
+           ! Local Arrays 
+           complex(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           maxmn = max( m, n )
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, maxmn ) ) then
+              info = -7
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace refers
+             ! to real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( minmn>0 ) then
+                 mm = m
+                 mnthr = stdlib_ilaenv( 6, 'ZGELSS', ' ', m, n, nrhs, -1 )
+                 if( m>=n .and. m>=mnthr ) then
+                    ! path 1a - overdetermined, with many more rows than
+                              ! columns
+                    ! compute space needed for stdlib_zgeqrf
+                    call stdlib_zgeqrf( m, n, a, lda, dum(1), dum(1), -1, info )
+                    lwork_zgeqrf = real( dum(1),KIND=dp)
+                    ! compute space needed for stdlib_zunmqr
+                    call stdlib_zunmqr( 'L', 'C', m, nrhs, n, a, lda, dum(1), b,ldb, dum(1), -1, &
+                              info )
+                    lwork_zunmqr = real( dum(1),KIND=dp)
+                    mm = n
+                    maxwrk = max( maxwrk, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', m,n, -1, -1 ) )
+                              
+                    maxwrk = max( maxwrk, n + nrhs*stdlib_ilaenv( 1, 'ZUNMQR', 'LC',m, nrhs, n, -&
+                              1 ) )
+                 end if
+                 if( m>=n ) then
+                    ! path 1 - overdetermined or exactly determined
+                    ! compute space needed for stdlib_zgebrd
+                    call stdlib_zgebrd( mm, n, a, lda, s, s, dum(1), dum(1), dum(1),-1, info )
+                              
+                    lwork_zgebrd = real( dum(1),KIND=dp)
+                    ! compute space needed for stdlib_zunmbr
+                    call stdlib_zunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, dum(1),b, ldb, dum(1),&
+                               -1, info )
+                    lwork_zunmbr = real( dum(1),KIND=dp)
+                    ! compute space needed for stdlib_zungbr
+                    call stdlib_zungbr( 'P', n, n, n, a, lda, dum(1),dum(1), -1, info )
+                    lwork_zungbr = real( dum(1),KIND=dp)
+                    ! compute total workspace needed
+                    maxwrk = max( maxwrk, 2*n + lwork_zgebrd )
+                    maxwrk = max( maxwrk, 2*n + lwork_zunmbr )
+                    maxwrk = max( maxwrk, 2*n + lwork_zungbr )
+                    maxwrk = max( maxwrk, n*nrhs )
+                    minwrk = 2*n + max( nrhs, m )
+                 end if
+                 if( n>m ) then
+                    minwrk = 2*m + max( nrhs, n )
+                    if( n>=mnthr ) then
+                       ! path 2a - underdetermined, with many more columns
+                       ! than rows
+                       ! compute space needed for stdlib_zgelqf
+                       call stdlib_zgelqf( m, n, a, lda, dum(1), dum(1),-1, info )
+                       lwork_zgelqf = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zgebrd
+                       call stdlib_zgebrd( m, m, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_zgebrd = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zunmbr
+                       call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_zunmbr = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zungbr
+                       call stdlib_zungbr( 'P', m, m, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_zungbr = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zunmlq
+                       call stdlib_zunmlq( 'L', 'C', n, nrhs, m, a, lda, dum(1),b, ldb, dum(1), -&
+                                 1, info )
+                       lwork_zunmlq = real( dum(1),KIND=dp)
+                       ! compute total workspace needed
+                       maxwrk = m + lwork_zgelqf
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_zgebrd )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_zunmbr )
+                       maxwrk = max( maxwrk, 3*m + m*m + lwork_zungbr )
+                       if( nrhs>1 ) then
+                          maxwrk = max( maxwrk, m*m + m + m*nrhs )
+                       else
+                          maxwrk = max( maxwrk, m*m + 2*m )
+                       end if
+                       maxwrk = max( maxwrk, m + lwork_zunmlq )
+                    else
+                       ! path 2 - underdetermined
+                       ! compute space needed for stdlib_zgebrd
+                       call stdlib_zgebrd( m, n, a, lda, s, s, dum(1), dum(1),dum(1), -1, info )
+                                 
+                       lwork_zgebrd = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zunmbr
+                       call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, m, a, lda,dum(1), b, ldb, dum(&
+                                 1), -1, info )
+                       lwork_zunmbr = real( dum(1),KIND=dp)
+                       ! compute space needed for stdlib_zungbr
+                       call stdlib_zungbr( 'P', m, n, m, a, lda, dum(1),dum(1), -1, info )
+                       lwork_zungbr = real( dum(1),KIND=dp)
+                       maxwrk = 2*m + lwork_zgebrd
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr )
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr )
+                       maxwrk = max( maxwrk, n*nrhs )
+                    end if
+                 end if
+                 maxwrk = max( minwrk, maxwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery )info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSS', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           eps = stdlib_dlamch( 'P' )
+           sfmin = stdlib_dlamch( 'S' )
+           smlnum = sfmin / eps
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_zlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              call stdlib_dlaset( 'F', minmn, 1, zero, zero, s, minmn )
+              rank = 0
+              go to 70
+           end if
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! overdetermined case
+           if( m>=n ) then
+              ! path 1 - overdetermined or exactly determined
+              mm = m
+              if( m>=mnthr ) then
+                 ! path 1a - overdetermined, with many more rows than columns
+                 mm = n
+                 itau = 1
+                 iwork = itau + n
+                 ! compute a=q*r
+                 ! (cworkspace: need 2*n, prefer n+n*nb)
+                 ! (rworkspace: none)
+                 call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                           info )
+                 ! multiply b by transpose(q)
+                 ! (cworkspace: need n+nrhs, prefer n+nrhs*nb)
+                 ! (rworkspace: none)
+                 call stdlib_zunmqr( 'L', 'C', m, nrhs, n, a, lda, work( itau ), b,ldb, work( &
+                           iwork ), lwork-iwork+1, info )
+                 ! zero out below r
+                 if( n>1 )call stdlib_zlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+              end if
+              ie = 1
+              itauq = 1
+              itaup = itauq + n
+              iwork = itaup + n
+              ! bidiagonalize r in a
+              ! (cworkspace: need 2*n+mm, prefer 2*n+(mm+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_zgebrd( mm, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                        work( iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors of r
+              ! (cworkspace: need 2*n+nrhs, prefer 2*n+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_zunmbr( 'Q', 'L', 'C', mm, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in a
+              ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + n
+              ! perform bidiagonal qr iteration
+                ! multiply b by transpose of left singular vectors
+                ! compute right singular vectors in a
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_zbdsqr( 'U', n, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, n
+                 if( s( i )>thr ) then
+                    call stdlib_zdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_zlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_zgemm( 'C', 'N', n, nrhs, n, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_zlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_zgemm( 'C', 'N', n, bl, n, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_zlacpy( 'G', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_zgemv( 'C', n, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_zcopy( n, work, 1, b, 1 )
+              end if
+           else if( n>=mnthr .and. lwork>=3*m+m*m+max( m, nrhs, n-2*m ) )then
+              ! underdetermined case, m much less than n
+              ! path 2a - underdetermined, with many more columns than rows
+              ! and sufficient workspace for an efficient algorithm
+              ldwork = m
+              if( lwork>=3*m+m*lda+max( m, nrhs, n-2*m ) )ldwork = lda
+              itau = 1
+              iwork = m + 1
+              ! compute a=l*q
+              ! (cworkspace: need 2*m, prefer m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_zgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, info )
+                        
+              il = iwork
+              ! copy l to work(il), zeroing out above it
+              call stdlib_zlacpy( 'L', m, m, a, lda, work( il ), ldwork )
+              call stdlib_zlaset( 'U', m-1, m-1, czero, czero, work( il+ldwork ),ldwork )
+              ie = 1
+              itauq = il + ldwork*m
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize l in work(il)
+              ! (cworkspace: need m*m+4*m, prefer m*m+3*m+2*m*nb)
+              ! (rworkspace: need m)
+              call stdlib_zgebrd( m, m, work( il ), ldwork, s, rwork( ie ),work( itauq ), work( &
+                        itaup ), work( iwork ),lwork-iwork+1, info )
+              ! multiply b by transpose of left bidiagonalizing vectors of l
+              ! (cworkspace: need m*m+3*m+nrhs, prefer m*m+3*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, m, work( il ), ldwork,work( itauq ), b, &
+                        ldb, work( iwork ),lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors of r in work(il)
+              ! (cworkspace: need m*m+4*m-1, prefer m*m+3*m+(m-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zungbr( 'P', m, m, m, work( il ), ldwork, work( itaup ),work( iwork ), &
+                        lwork-iwork+1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration, computing right singular
+              ! vectors of l in work(il) and multiplying b by transpose of
+              ! left singular vectors
+              ! (cworkspace: need m*m)
+              ! (rworkspace: need bdspac)
+              call stdlib_zbdsqr( 'U', m, m, 0, nrhs, s, rwork( ie ), work( il ),ldwork, a, lda, &
+                        b, ldb, rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_zdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_zlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              iwork = il + m*ldwork
+              ! multiply b by right singular vectors of l in work(il)
+              ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs+iwork-1 .and. nrhs>1 ) then
+                 call stdlib_zgemm( 'C', 'N', m, nrhs, m, cone, work( il ), ldwork,b, ldb, czero, &
+                           work( iwork ), ldb )
+                 call stdlib_zlacpy( 'G', m, nrhs, work( iwork ), ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = ( lwork-iwork+1 ) / m
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_zgemm( 'C', 'N', m, bl, m, cone, work( il ), ldwork,b( 1, i ), &
+                              ldb, czero, work( iwork ), m )
+                    call stdlib_zlacpy( 'G', m, bl, work( iwork ), m, b( 1, i ),ldb )
+                 end do
+              else
+                 call stdlib_zgemv( 'C', m, m, cone, work( il ), ldwork, b( 1, 1 ),1, czero, work(&
+                            iwork ), 1 )
+                 call stdlib_zcopy( m, work( iwork ), 1, b( 1, 1 ), 1 )
+              end if
+              ! zero out below first m rows of b
+              call stdlib_zlaset( 'F', n-m, nrhs, czero, czero, b( m+1, 1 ), ldb )
+              iwork = itau + m
+              ! multiply transpose(q) by b
+              ! (cworkspace: need m+nrhs, prefer m+nhrs*nb)
+              ! (rworkspace: none)
+              call stdlib_zunmlq( 'L', 'C', n, nrhs, m, a, lda, work( itau ), b,ldb, work( iwork )&
+                        , lwork-iwork+1, info )
+           else
+              ! path 2 - remaining underdetermined cases
+              ie = 1
+              itauq = 1
+              itaup = itauq + m
+              iwork = itaup + m
+              ! bidiagonalize a
+              ! (cworkspace: need 3*m, prefer 2*m+(m+n)*nb)
+              ! (rworkspace: need n)
+              call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), work(&
+                         iwork ), lwork-iwork+1,info )
+              ! multiply b by transpose of left bidiagonalizing vectors
+              ! (cworkspace: need 2*m+nrhs, prefer 2*m+nrhs*nb)
+              ! (rworkspace: none)
+              call stdlib_zunmbr( 'Q', 'L', 'C', m, nrhs, n, a, lda, work( itauq ),b, ldb, work( &
+                        iwork ), lwork-iwork+1, info )
+              ! generate right bidiagonalizing vectors in a
+              ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+              ! (rworkspace: none)
+              call stdlib_zungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-iwork+&
+                        1, info )
+              irwork = ie + m
+              ! perform bidiagonal qr iteration,
+                 ! computing right singular vectors of a in a and
+                 ! multiplying b by transpose of left singular vectors
+              ! (cworkspace: none)
+              ! (rworkspace: need bdspac)
+              call stdlib_zbdsqr( 'L', m, n, 0, nrhs, s, rwork( ie ), a, lda, dum,1, b, ldb, &
+                        rwork( irwork ), info )
+              if( info/=0 )go to 70
+              ! multiply b by reciprocals of singular values
+              thr = max( rcond*s( 1 ), sfmin )
+              if( rcond<zero )thr = max( eps*s( 1 ), sfmin )
+              rank = 0
+              do i = 1, m
+                 if( s( i )>thr ) then
+                    call stdlib_zdrscl( nrhs, s( i ), b( i, 1 ), ldb )
+                    rank = rank + 1
+                 else
+                    call stdlib_zlaset( 'F', 1, nrhs, czero, czero, b( i, 1 ), ldb )
+                 end if
+              end do
+              ! multiply b by right singular vectors of a
+              ! (cworkspace: need n, prefer n*nrhs)
+              ! (rworkspace: none)
+              if( lwork>=ldb*nrhs .and. nrhs>1 ) then
+                 call stdlib_zgemm( 'C', 'N', n, nrhs, m, cone, a, lda, b, ldb,czero, work, ldb )
+                           
+                 call stdlib_zlacpy( 'G', n, nrhs, work, ldb, b, ldb )
+              else if( nrhs>1 ) then
+                 chunk = lwork / n
+                 do i = 1, nrhs, chunk
+                    bl = min( nrhs-i+1, chunk )
+                    call stdlib_zgemm( 'C', 'N', n, bl, m, cone, a, lda, b( 1, i ),ldb, czero, &
+                              work, n )
+                    call stdlib_zlacpy( 'F', n, bl, work, n, b( 1, i ), ldb )
+                 end do
+              else
+                 call stdlib_zgemv( 'C', m, n, cone, a, lda, b, 1, czero, work, 1 )
+                 call stdlib_zcopy( n, work, 1, b, 1 )
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,info )
+           else if( iascl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgelss
+
+     !> ZGELSY: computes the minimum-norm solution to a complex linear least
+     !> squares problem:
+     !> minimize || A * X - B ||
+     !> using a complete orthogonal factorization of A.  A is an M-by-N
+     !> matrix which may be rank-deficient.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+     !> The routine first computes a QR factorization with column pivoting:
+     !> A * P = Q * [ R11 R12 ]
+     !> [  0  R22 ]
+     !> with R11 defined as the largest leading submatrix whose estimated
+     !> condition number is less than 1/RCOND.  The order of R11, RANK,
+     !> is the effective rank of A.
+     !> Then, R22 is considered to be negligible, and R12 is annihilated
+     !> by unitary transformations from the right, arriving at the
+     !> complete orthogonal factorization:
+     !> A * P = Q * [ T11 0 ] * Z
+     !> [  0  0 ]
+     !> The minimum-norm solution is then
+     !> X = P * Z**H [ inv(T11)*Q1**H*B ]
+     !> [        0         ]
+     !> where Q1 consists of the first RANK columns of Q.
+     !> This routine is basically identical to the original xGELSX except
+     !> three differences:
+     !> o The permutation of matrix B (the right hand side) is faster and
+     !> more simple.
+     !> o The call to the subroutine xGEQPF has been substituted by the
+     !> the call to the subroutine xGEQP3. This subroutine is a Blas-3
+     !> version of the QR factorization with column pivoting.
+     !> o Matrix B (the right hand side) is updated with Blas-3.
+
+     subroutine stdlib_zgelsy( m, n, nrhs, a, lda, b, ldb, jpvt, rcond, rank,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info, rank
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           real(dp), intent(in) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: jpvt(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: imax = 1
+           integer(ilp), parameter :: imin = 2
+           
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iascl, ibscl, ismax, ismin, j, lwkopt, mn, nb, nb1, nb2, nb3, &
+                     nb4
+           real(dp) :: anrm, bignum, bnrm, smax, smaxpr, smin, sminpr, smlnum, wsize
+           complex(dp) :: c1, c2, s1, s2
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,max,min
+           ! Executable Statements 
+           mn = min( m, n )
+           ismin = mn + 1
+           ismax = 2*mn + 1
+           ! test the input arguments.
+           info = 0
+           nb1 = stdlib_ilaenv( 1, 'ZGEQRF', ' ', m, n, -1, -1 )
+           nb2 = stdlib_ilaenv( 1, 'ZGERQF', ' ', m, n, -1, -1 )
+           nb3 = stdlib_ilaenv( 1, 'ZUNMQR', ' ', m, n, nrhs, -1 )
+           nb4 = stdlib_ilaenv( 1, 'ZUNMRQ', ' ', m, n, nrhs, -1 )
+           nb = max( nb1, nb2, nb3, nb4 )
+           lwkopt = max( 1, mn+2*n+nb*( n+1 ), 2*mn+nb*nrhs )
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           lquery = ( lwork==-1 )
+           if( m<0 ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldb<max( 1, m, n ) ) then
+              info = -7
+           else if( lwork<( mn+max( 2*mn, n+1, mn+nrhs ) ) .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGELSY', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+              rank = 0
+              return
+           end if
+           ! get machine parameters
+           smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max entries outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, rwork )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_zlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              rank = 0
+              go to 70
+           end if
+           bnrm = stdlib_zlange( 'M', m, nrhs, b, ldb, rwork )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, smlnum, m, nrhs, b, ldb, info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, bignum, m, nrhs, b, ldb, info )
+              ibscl = 2
+           end if
+           ! compute qr factorization with column pivoting of a:
+              ! a * p = q * r
+           call stdlib_zgeqp3( m, n, a, lda, jpvt, work( 1 ), work( mn+1 ),lwork-mn, rwork, info )
+                     
+           wsize = mn + real( work( mn+1 ),KIND=dp)
+           ! complex workspace: mn+nb*(n+1). real workspace 2*n.
+           ! details of householder rotations stored in work(1:mn).
+           ! determine rank using incremental condition estimation
+           work( ismin ) = cone
+           work( ismax ) = cone
+           smax = abs( a( 1, 1 ) )
+           smin = smax
+           if( abs( a( 1, 1 ) )==zero ) then
+              rank = 0
+              call stdlib_zlaset( 'F', max( m, n ), nrhs, czero, czero, b, ldb )
+              go to 70
+           else
+              rank = 1
+           end if
+           10 continue
+           if( rank<mn ) then
+              i = rank + 1
+              call stdlib_zlaic1( imin, rank, work( ismin ), smin, a( 1, i ),a( i, i ), sminpr, &
+                        s1, c1 )
+              call stdlib_zlaic1( imax, rank, work( ismax ), smax, a( 1, i ),a( i, i ), smaxpr, &
+                        s2, c2 )
+              if( smaxpr*rcond<=sminpr ) then
+                 do i = 1, rank
+                    work( ismin+i-1 ) = s1*work( ismin+i-1 )
+                    work( ismax+i-1 ) = s2*work( ismax+i-1 )
+                 end do
+                 work( ismin+rank ) = c1
+                 work( ismax+rank ) = c2
+                 smin = sminpr
+                 smax = smaxpr
+                 rank = rank + 1
+                 go to 10
+              end if
+           end if
+           ! complex workspace: 3*mn.
+           ! logically partition r = [ r11 r12 ]
+                                   ! [  0  r22 ]
+           ! where r11 = r(1:rank,1:rank)
+           ! [r11,r12] = [ t11, 0 ] * y
+           if( rank<n )call stdlib_ztzrzf( rank, n, a, lda, work( mn+1 ), work( 2*mn+1 ),lwork-&
+                     2*mn, info )
+           ! complex workspace: 2*mn.
+           ! details of householder rotations stored in work(mn+1:2*mn)
+           ! b(1:m,1:nrhs) := q**h * b(1:m,1:nrhs)
+           call stdlib_zunmqr( 'LEFT', 'CONJUGATE TRANSPOSE', m, nrhs, mn, a, lda,work( 1 ), b, &
+                     ldb, work( 2*mn+1 ), lwork-2*mn, info )
+           wsize = max( wsize, 2*mn+real( work( 2*mn+1 ),KIND=dp) )
+           ! complex workspace: 2*mn+nb*nrhs.
+           ! b(1:rank,1:nrhs) := inv(t11) * b(1:rank,1:nrhs)
+           call stdlib_ztrsm( 'LEFT', 'UPPER', 'NO TRANSPOSE', 'NON-UNIT', rank,nrhs, cone, a, &
+                     lda, b, ldb )
+           do j = 1, nrhs
+              do i = rank + 1, n
+                 b( i, j ) = czero
+              end do
+           end do
+           ! b(1:n,1:nrhs) := y**h * b(1:n,1:nrhs)
+           if( rank<n ) then
+              call stdlib_zunmrz( 'LEFT', 'CONJUGATE TRANSPOSE', n, nrhs, rank,n-rank, a, lda, &
+                        work( mn+1 ), b, ldb,work( 2*mn+1 ), lwork-2*mn, info )
+           end if
+           ! complex workspace: 2*mn+nrhs.
+           ! b(1:n,1:nrhs) := p * b(1:n,1:nrhs)
+           do j = 1, nrhs
+              do i = 1, n
+                 work( jpvt( i ) ) = b( i, j )
+              end do
+              call stdlib_zcopy( n, work( 1 ), 1, b( 1, j ), 1 )
+           end do
+           ! complex workspace: n.
+           ! undo scaling
+           if( iascl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, n, nrhs, b, ldb, info )
+              call stdlib_zlascl( 'U', 0, 0, smlnum, anrm, rank, rank, a, lda,info )
+           else if( iascl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, n, nrhs, b, ldb, info )
+              call stdlib_zlascl( 'U', 0, 0, bignum, anrm, rank, rank, a, lda,info )
+           end if
+           if( ibscl==1 ) then
+              call stdlib_zlascl( 'G', 0, 0, smlnum, bnrm, n, nrhs, b, ldb, info )
+           else if( ibscl==2 ) then
+              call stdlib_zlascl( 'G', 0, 0, bignum, bnrm, n, nrhs, b, ldb, info )
+           end if
+           70 continue
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zgelsy
+
+     !> ZGEMLQ: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'C':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by short wide
+     !> LQ factorization (ZGELQ)
+
+     pure subroutine stdlib_zgemlq( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), t(*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * mb
+             mn = m
+           else
+             lw = m * mb
+             mn = n
+           end if
+           if( ( nb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, nb - k ) == 0 ) then
+               nblcks = ( mn - k ) / ( nb - k )
+             else
+               nblcks = ( mn - k ) / ( nb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, k ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEMLQ', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( nb<=k ) .or. ( nb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_zgemlqt( side, trans, m, n, k, mb, a, lda,t( 6 ), mb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_zlamswlq( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),mb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_zgemlq
+
+     !> ZGEMQR: overwrites the general real M-by-N matrix C with
+     !> SIDE = 'L'     SIDE = 'R'
+     !> TRANS = 'N':      Q * C          C * Q
+     !> TRANS = 'T':      Q**H * C       C * Q**H
+     !> where Q is a complex unitary matrix defined as the product
+     !> of blocked elementary reflectors computed by tall skinny
+     !> QR factorization (ZGEQR)
+
+     pure subroutine stdlib_zgemqr( side, trans, m, n, k, a, lda, t, tsize,c, ldc, work, lwork, &
+               info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: side, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, k, tsize, lwork, ldc
+           ! Array Arguments 
+           complex(dp), intent(in) :: a(lda,*), t(*)
+           complex(dp), intent(inout) :: c(ldc,*)
+           complex(dp), intent(out) :: work(*)
+       ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: left, right, tran, notran, lquery
+           integer(ilp) :: mb, nb, lw, nblcks, mn
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           lquery  = lwork==-1
+           notran  = stdlib_lsame( trans, 'N' )
+           tran    = stdlib_lsame( trans, 'C' )
+           left    = stdlib_lsame( side, 'L' )
+           right   = stdlib_lsame( side, 'R' )
+           mb = int( t( 2 ),KIND=ilp)
+           nb = int( t( 3 ),KIND=ilp)
+           if( left ) then
+             lw = n * nb
+             mn = m
+           else
+             lw = mb * nb
+             mn = n
+           end if
+           if( ( mb>k ) .and. ( mn>k ) ) then
+             if( mod( mn - k, mb - k )==0 ) then
+               nblcks = ( mn - k ) / ( mb - k )
+             else
+               nblcks = ( mn - k ) / ( mb - k ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           info = 0
+           if( .not.left .and. .not.right ) then
+             info = -1
+           else if( .not.tran .and. .not.notran ) then
+             info = -2
+           else if( m<0 ) then
+             info = -3
+           else if( n<0 ) then
+             info = -4
+           else if( k<0 .or. k>mn ) then
+             info = -5
+           else if( lda<max( 1, mn ) ) then
+             info = -7
+           else if( tsize<5 ) then
+             info = -9
+           else if( ldc<max( 1, m ) ) then
+             info = -11
+           else if( ( lwork<max( 1, lw ) ) .and. ( .not.lquery ) ) then
+             info = -13
+           end if
+           if( info==0 ) then
+             work( 1 ) = lw
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEMQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n, k )==0 ) then
+             return
+           end if
+           if( ( left .and. m<=k ) .or. ( right .and. n<=k ).or. ( mb<=k ) .or. ( mb>=max( m, n, &
+                     k ) ) ) then
+             call stdlib_zgemqrt( side, trans, m, n, k, nb, a, lda, t( 6 ),nb, c, ldc, work, info &
+                       )
+           else
+             call stdlib_zlamtsqr( side, trans, m, n, k, mb, nb, a, lda, t( 6 ),nb, c, ldc, work, &
+                       lwork, info )
+           end if
+           work( 1 ) = lw
+           return
+     end subroutine stdlib_zgemqr
+
+     !> ZGEQR: computes a QR factorization of a complex M-by-N matrix A:
+     !> A = Q * ( R ),
+     !> ( 0 )
+     !> where:
+     !> Q is a M-by-M orthogonal matrix;
+     !> R is an upper-triangular N-by-N matrix;
+     !> 0 is a (M-N)-by-N zero matrix, if M > N.
+
+     pure subroutine stdlib_zgeqr( m, n, a, lda, t, tsize, work, lwork,info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd. --
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, m, n, tsize, lwork
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(*), work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, lminws, mint, minw
+           integer(ilp) :: mb, nb, mintsz, nblcks
+           ! Intrinsic Functions 
+           intrinsic :: max,min,mod
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( tsize==-1 .or. tsize==-2 .or.lwork==-1 .or. lwork==-2 )
+           mint = .false.
+           minw = .false.
+           if( tsize==-2 .or. lwork==-2 ) then
+             if( tsize/=-1 ) mint = .true.
+             if( lwork/=-1 ) minw = .true.
+           end if
+           ! determine the block size
+           if( min ( m, n )>0 ) then
+             mb = stdlib_ilaenv( 1, 'ZGEQR ', ' ', m, n, 1, -1 )
+             nb = stdlib_ilaenv( 1, 'ZGEQR ', ' ', m, n, 2, -1 )
+           else
+             mb = m
+             nb = 1
+           end if
+           if( mb>m .or. mb<=n ) mb = m
+           if( nb>min( m, n ) .or. nb<1 ) nb = 1
+           mintsz = n + 5
+           if( mb>n .and. m>n ) then
+             if( mod( m - n, mb - n )==0 ) then
+               nblcks = ( m - n ) / ( mb - n )
+             else
+               nblcks = ( m - n ) / ( mb - n ) + 1
+             end if
+           else
+             nblcks = 1
+           end if
+           ! determine if the workspace size satisfies minimal size
+           lminws = .false.
+           if( ( tsize<max( 1, nb*n*nblcks + 5 ) .or. lwork<nb*n ).and. ( lwork>=n ) .and. ( &
+                     tsize>=mintsz ).and. ( .not.lquery ) ) then
+             if( tsize<max( 1, nb*n*nblcks + 5 ) ) then
+               lminws = .true.
+               nb = 1
+               mb = m
+             end if
+             if( lwork<nb*n ) then
+               lminws = .true.
+               nb = 1
+             end if
+           end if
+           if( m<0 ) then
+             info = -1
+           else if( n<0 ) then
+             info = -2
+           else if( lda<max( 1, m ) ) then
+             info = -4
+           else if( tsize<max( 1, nb*n*nblcks + 5 ).and. ( .not.lquery ) .and. ( .not.lminws ) ) &
+                     then
+             info = -6
+           else if( ( lwork<max( 1, n*nb ) ) .and. ( .not.lquery ).and. ( .not.lminws ) ) &
+                     then
+             info = -8
+           end if
+           if( info==0 ) then
+             if( mint ) then
+               t( 1 ) = mintsz
+             else
+               t( 1 ) = nb*n*nblcks + 5
+             end if
+             t( 2 ) = mb
+             t( 3 ) = nb
+             if( minw ) then
+               work( 1 ) = max( 1, n )
+             else
+               work( 1 ) = max( 1, nb*n )
+             end if
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGEQR', -info )
+             return
+           else if( lquery ) then
+             return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+             return
+           end if
+           ! the qr decomposition
+           if( ( m<=n ) .or. ( mb<=n ) .or. ( mb>=m ) ) then
+             call stdlib_zgeqrt( m, n, nb, a, lda, t( 6 ), nb, work, info )
+           else
+             call stdlib_zlatsqr( m, n, mb, nb, a, lda, t( 6 ), nb, work,lwork, info )
+           end if
+           work( 1 ) = max( 1, nb*n )
+           return
+     end subroutine stdlib_zgeqr
+
+     !> ZGESDD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors, by using divide-and-conquer method. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns VT = V**H, not V.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, rwork, iwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), s(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
+           integer(ilp) :: blk, chunk, i, ie, ierr, il, ir, iru, irvt, iscl, itau, itaup, itauq, &
+           iu, ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr1, mnthr2, nrwork,&
+                      nwork, wrkbl
+           integer(ilp) :: lwork_zgebrd_mn, lwork_zgebrd_mm, lwork_zgebrd_nn, lwork_zgelqf_mn, &
+           lwork_zgeqrf_mn, lwork_zungbr_p_mn, lwork_zungbr_p_nn, lwork_zungbr_q_mn, &
+           lwork_zungbr_q_mm, lwork_zunglq_mn, lwork_zunglq_nn, lwork_zungqr_mm, lwork_zungqr_mn, &
+           lwork_zunmbr_prc_mm, lwork_zunmbr_qln_mm, lwork_zunmbr_prc_mn, lwork_zunmbr_qln_mn, &
+                     lwork_zunmbr_prc_nn, lwork_zunmbr_qln_nn
+           real(dp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dum(1)
+           complex(dp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: int,max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info   = 0
+           minmn  = min( m, n )
+           mnthr1 = int( minmn*17.0_dp / 9.0_dp,KIND=ilp)
+           mnthr2 = int( minmn*5.0_dp / 3.0_dp,KIND=ilp)
+           wntqa  = stdlib_lsame( jobz, 'A' )
+           wntqs  = stdlib_lsame( jobz, 'S' )
+           wntqas = wntqa .or. wntqs
+           wntqo  = stdlib_lsame( jobz, 'O' )
+           wntqn  = stdlib_lsame( jobz, 'N' )
+           lquery = ( lwork==-1 )
+           minwrk = 1
+           maxwrk = 1
+           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, m ) ) then
+              info = -5
+           else if( ldu<1 .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
+                     then
+              info = -8
+           else if( ldvt<1 .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
+                     .and. m>=n .and. ldvt<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+             ! note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace allocated at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_dbdsdc,KIND=dp) is
+                 ! bdspac = 3*n*n + 4*n for singular values and vectors;
+                 ! bdspac = 4*n         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_zgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_zgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zgebrd( n, n, cdum(1), n, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_zgebrd_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zgeqrf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_zgeqrf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'P', n, n, n, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'Q', m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_q_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungqr( m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungqr_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zungqr( m, n, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungqr_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'Q', 'L', 'N', m, m, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'Q', 'L', 'N', m, n, n, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_qln_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'Q', 'L', 'N', n, n, n, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_qln_nn = int( cdum(1),KIND=ilp)
+                 if( m>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1 (m >> n, jobz='n')
+                       maxwrk = n + lwork_zgeqrf_mn
+                       maxwrk = max( maxwrk, 2*n + lwork_zgebrd_nn )
+                       minwrk = 3*n
+                    else if( wntqo ) then
+                       ! path 2 (m >> n, jobz='o')
+                       wrkbl = n + lwork_zgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_zungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_prc_nn )
+                       maxwrk = m*n + n*n + wrkbl
+                       minwrk = 2*n*n + 3*n
+                    else if( wntqs ) then
+                       ! path 3 (m >> n, jobz='s')
+                       wrkbl = n + lwork_zgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_zungqr_mn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + 3*n
+                    else if( wntqa ) then
+                       ! path 4 (m >> n, jobz='a')
+                       wrkbl = n + lwork_zgeqrf_mn
+                       wrkbl = max( wrkbl,   n + lwork_zungqr_mm )
+                       wrkbl = max( wrkbl, 2*n + lwork_zgebrd_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_qln_nn )
+                       wrkbl = max( wrkbl, 2*n + lwork_zunmbr_prc_nn )
+                       maxwrk = n*n + wrkbl
+                       minwrk = n*n + max( 3*n, n + m )
+                    end if
+                 else if( m>=mnthr2 ) then
+                    ! path 5 (m >> n, but not as much as mnthr1)
+                    maxwrk = 2*n + lwork_zgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 5o (m >> n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_q_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 5s (m >> n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_q_mn )
+                    else if( wntqa ) then
+                       ! path 5a (m >> n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_p_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_zungbr_q_mm )
+                    end if
+                 else
+                    ! path 6 (m >= n, but not much larger)
+                    maxwrk = 2*n + lwork_zgebrd_mn
+                    minwrk = 2*n + m
+                    if( wntqo ) then
+                       ! path 6o (m >= n, jobz='o')
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_prc_nn )
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_qln_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + n*n
+                    else if( wntqs ) then
+                       ! path 6s (m >= n, jobz='s')
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_qln_mn )
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_prc_nn )
+                    else if( wntqa ) then
+                       ! path 6a (m >= n, jobz='a')
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*n + lwork_zunmbr_prc_nn )
+                    end if
+                 end if
+              else if( minmn>0 ) then
+                 ! there is no complex work space needed for bidiagonal svd
+                 ! the realwork space needed for bidiagonal svd (stdlib_dbdsdc,KIND=dp) is
+                 ! bdspac = 3*m*m + 4*m for singular values and vectors;
+                 ! bdspac = 4*m         for singular values only;
+                 ! not including e, ru, and rvt matrices.
+                 ! compute space preferred for each routine
+                 call stdlib_zgebrd( m, n, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_zgebrd_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zgebrd( m, m, cdum(1), m, dum(1), dum(1), cdum(1),cdum(1), cdum(1), -&
+                           1, ierr )
+                 lwork_zgebrd_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zgelqf( m, n, cdum(1), m, cdum(1), cdum(1), -1, ierr )
+                 lwork_zgelqf_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'P', m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_p_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'P', n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_p_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'Q', m, m, n, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zungbr_q_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zunglq( m, n, m, cdum(1), m, cdum(1), cdum(1),-1, ierr )
+                 lwork_zunglq_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunglq( n, n, m, cdum(1), n, cdum(1), cdum(1),-1, ierr )
+                 lwork_zunglq_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'P', 'R', 'C', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_prc_mm = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'P', 'R', 'C', m, n, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_prc_mn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'P', 'R', 'C', n, n, m, cdum(1), n, cdum(1),cdum(1), n, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_prc_nn = int( cdum(1),KIND=ilp)
+                 call stdlib_zunmbr( 'Q', 'L', 'N', m, m, m, cdum(1), m, cdum(1),cdum(1), m, cdum(&
+                           1), -1, ierr )
+                 lwork_zunmbr_qln_mm = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr1 ) then
+                    if( wntqn ) then
+                       ! path 1t (n >> m, jobz='n')
+                       maxwrk = m + lwork_zgelqf_mn
+                       maxwrk = max( maxwrk, 2*m + lwork_zgebrd_mm )
+                       minwrk = 3*m
+                    else if( wntqo ) then
+                       ! path 2t (n >> m, jobz='o')
+                       wrkbl = m + lwork_zgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_zunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_zgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_prc_mm )
+                       maxwrk = m*n + m*m + wrkbl
+                       minwrk = 2*m*m + 3*m
+                    else if( wntqs ) then
+                       ! path 3t (n >> m, jobz='s')
+                       wrkbl = m + lwork_zgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_zunglq_mn )
+                       wrkbl = max( wrkbl, 2*m + lwork_zgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + 3*m
+                    else if( wntqa ) then
+                       ! path 4t (n >> m, jobz='a')
+                       wrkbl = m + lwork_zgelqf_mn
+                       wrkbl = max( wrkbl,   m + lwork_zunglq_nn )
+                       wrkbl = max( wrkbl, 2*m + lwork_zgebrd_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_qln_mm )
+                       wrkbl = max( wrkbl, 2*m + lwork_zunmbr_prc_mm )
+                       maxwrk = m*m + wrkbl
+                       minwrk = m*m + max( 3*m, m + n )
+                    end if
+                 else if( n>=mnthr2 ) then
+                    ! path 5t (n >> m, but not as much as mnthr1)
+                    maxwrk = 2*m + lwork_zgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 5to (n >> m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_p_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 5ts (n >> m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_p_mn )
+                    else if( wntqa ) then
+                       ! path 5ta (n >> m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_q_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zungbr_p_nn )
+                    end if
+                 else
+                    ! path 6t (n > m, but not much larger)
+                    maxwrk = 2*m + lwork_zgebrd_mn
+                    minwrk = 2*m + n
+                    if( wntqo ) then
+                       ! path 6to (n > m, jobz='o')
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_prc_mn )
+                       maxwrk = maxwrk + m*n
+                       minwrk = minwrk + m*m
+                    else if( wntqs ) then
+                       ! path 6ts (n > m, jobz='s')
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_prc_mn )
+                    else if( wntqa ) then
+                       ! path 6ta (n > m, jobz='a')
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_qln_mm )
+                       maxwrk = max( maxwrk, 2*m + lwork_zunmbr_prc_nn )
+                    end if
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+           end if
+           if( info==0 ) then
+              work( 1 ) = stdlib_droundup_lwork( maxwrk )
+              if( lwork<minwrk .and. .not. lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESDD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = sqrt( stdlib_dlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, dum )
+           if( stdlib_disnan( anrm ) ) then
+               info = -4
+               return
+           end if
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1 (m >> n, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n [tau] + n    [work]
+                    ! cworkspace: prefer n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out below r
+                    call stdlib_zlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + n
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2 (m >> n, jobz='o')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    ir = iu + ldwrku*n
+                    if( lwork >= m*n + n*n + 3*n ) then
+                       ! work(ir) is m by n
+                       ldwrkr = m
+                    else
+                       ldwrkr = ( lwork - n*n - 3*n ) / n
+                    end if
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work( ir ), zeroing out below it
+                    call stdlib_zlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_zlaset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [u] + n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_zgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of r in work(iru) and computing right singular vectors
+                    ! of r in work(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              work( iu ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by the right singular vectors of r
+                    ! cworkspace: need   n*n [u] + n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(iu), storing result in work(ir) and copying to a
+                    ! cworkspace: need   n*n [u] + n*n [r]
+                    ! cworkspace: prefer n*n [u] + m*n [r]
+                    ! rworkspace: need   0
+                    do i = 1, m, ldwrkr
+                       chunk = min( m-i+1, ldwrkr )
+                       call stdlib_zgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( iu ), &
+                                 ldwrku, czero,work( ir ), ldwrkr )
+                       call stdlib_zlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3 (m >> n, jobz='s')
+                    ! n left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    ir = 1
+                    ! work(ir) is n by n
+                    ldwrkr = n
+                    itau = ir + ldwrkr*n
+                    nwork = itau + n
+                    ! compute a=q*r
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy r to work(ir), zeroing out below it
+                    call stdlib_zlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                    call stdlib_zlaset( 'L', n-1, n-1, czero, czero, work( ir+1 ),ldwrkr )
+                    ! generate q in a
+                    ! cworkspace: need   n*n [r] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [r] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in work(ir)
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_zgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [r] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [r] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in a by left singular vectors of r in
+                    ! work(ir), storing result in u
+                    ! cworkspace: need   n*n [r]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
+                    call stdlib_zgemm( 'N', 'N', m, n, n, cone, a, lda, work( ir ),ldwrkr, czero, &
+                              u, ldu )
+                 else if( wntqa ) then
+                    ! path 4 (m >> n, jobz='a')
+                    ! m left singular vectors to be computed in u and
+                    ! n right singular vectors to be computed in vt
+                    iu = 1
+                    ! work(iu) is n by n
+                    ldwrku = n
+                    itau = iu + ldwrku*n
+                    nwork = itau + n
+                    ! compute a=q*r, copying result to u
+                    ! cworkspace: need   n*n [u] + n [tau] + n    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                    ! generate q in u
+                    ! cworkspace: need   n*n [u] + n [tau] + m    [work]
+                    ! cworkspace: prefer n*n [u] + n [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ! produce r in a, zeroing out below it
+                    call stdlib_zlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + n
+                    nwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n      [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + 2*n*nb [work]
+                    ! rworkspace: need   n [e]
+                    call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    iru = ie + n
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix work(iu)
+                    ! overwrite work(iu) by left singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
+                              ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of r
+                    ! cworkspace: need   n*n [u] + 2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer n*n [u] + 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! multiply q in u by left singular vectors of r in
+                    ! work(iu), storing result in a
+                    ! cworkspace: need   n*n [u]
+                    ! rworkspace: need   0
+                    call stdlib_zgemm( 'N', 'N', m, n, n, cone, u, ldu, work( iu ),ldwrku, czero, &
+                              a, lda )
+                    ! copy left singular vectors of a from a to u
+                    call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else if( m>=mnthr2 ) then
+                 ! mnthr2 <= m < mnthr1
+                 ! path 5 (m >> n, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_zungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5n (m >> n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', n, s, rwork( ie ), dum, 1,dum,1,dum, idum, &
+                              rwork( nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    ! path 5o (m >> n, jobz='o')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! generate q in a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work(iu) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=dp) by p**h in vt,
+                    ! storing the result in work(iu), copying to vt
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_zlarcm( n, n, rwork( irvt ), n, vt, ldvt,work( iu ), ldwrku, &
+                              rwork( nrwork ) )
+                    call stdlib_zlacpy( 'F', n, n, work( iu ), ldwrku, vt, ldvt )
+                    ! multiply q in a by realmatrix rwork(iru,KIND=dp), storing the
+                    ! result in work(iu), copying to a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                    ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    do i = 1, m, ldwrku
+                       chunk = min( m-i+1, ldwrku )
+                       call stdlib_zlacrm( chunk, n, a( i, 1 ), lda, rwork( iru ),n, work( iu ), &
+                                 ldwrku, rwork( nrwork ) )
+                       call stdlib_zlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5s (m >> n, jobz='s')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, n, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=dp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_zlarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=dp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_zlacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                 else
+                    ! path 5a (m >> n, jobz='a')
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply realmatrix rwork(irvt,KIND=dp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + 2*n*n [rwork]
+                    call stdlib_zlarcm( n, n, rwork( irvt ), n, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', n, n, a, lda, vt, ldvt )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=dp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                    nrwork = irvt
+                    call stdlib_zlacrm( m, n, u, ldu, rwork( iru ), n, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                 end if
+              else
+                 ! m < mnthr2
+                 ! path 6 (m >= n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ! use stdlib_zunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + n
+                 itauq = 1
+                 itaup = itauq + n
+                 nwork = itaup + n
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*n [tauq, taup] + m        [work]
+                 ! cworkspace: prefer 2*n [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   n [e]
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6n (m >= n, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', n, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    iu = nwork
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    if( lwork >= m*n + 3*n ) then
+                       ! work( iu ) is m by n
+                       ldwrku = m
+                    else
+                       ! work( iu ) is ldwrku by n
+                       ldwrku = ( lwork - 3*n ) / n
+                    end if
+                    nwork = iu + ldwrku*n
+                    ! path 6o (m >= n, jobz='o')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*n ) then
+                       ! path 6o-fast
+                       ! copy realmatrix rwork(iru,KIND=dp) to complex matrix work(iu)
+                       ! overwrite work(iu) by left singular vectors of a, copying
+                       ! to a
+                       ! cworkspace: need   2*n [tauq, taup] + m*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u] + n*nb [work]
+                       ! rworkspace: need   n [e] + n*n [ru]
+                       call stdlib_zlaset( 'F', m, n, czero, czero, work( iu ),ldwrku )
+                       call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, work( iu ),ldwrku )
+                       call stdlib_zunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
+                                 ), ldwrku,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_zlacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
+                    else
+                       ! path 6o-slow
+                       ! generate q in a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u] + n    [work]
+                       ! cworkspace: prefer 2*n [tauq, taup] + n*n [u] + n*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_zungbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=dp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*n [tauq, taup] + n*n [u]
+                       ! cworkspace: prefer 2*n [tauq, taup] + m*n [u]
+                       ! rworkspace: need   n [e] + n*n [ru] + 2*n*n [rwork]
+                       ! rworkspace: prefer n [e] + n*n [ru] + 2*m*n [rwork] < n + 5*n*n since m < 2*n here
+                       nrwork = irvt
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_zlacrm( chunk, n, a( i, 1 ), lda,rwork( iru ), n, work( iu )&
+                                    , ldwrku,rwork( nrwork ) )
+                          call stdlib_zlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6s (m >= n, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_zlaset( 'F', m, n, czero, czero, u, ldu )
+                    call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6a (m >= n, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt] + bdspac
+                    iru = nrwork
+                    irvt = iru + n*n
+                    nrwork = irvt + n*n
+                    call stdlib_dbdsdc( 'U', 'I', n, s, rwork( ie ), rwork( iru ),n, rwork( irvt )&
+                              , n, dum, idum,rwork( nrwork ), iwork, info )
+                    ! set the right corner of u to identity matrix
+                    call stdlib_zlaset( 'F', m, m, czero, czero, u, ldu )
+                    if( m>n ) then
+                       call stdlib_zlaset( 'F', m-n, m-n, czero, cone,u( n+1, n+1 ), ldu )
+                    end if
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_zlacp2( 'F', n, n, rwork( iru ), n, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*n [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*n [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   n [e] + n*n [ru] + n*n [rvt]
+                    call stdlib_zlacp2( 'F', n, n, rwork( irvt ), n, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, n, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr1 ) then
+                 if( wntqn ) then
+                    ! path 1t (n >> m, jobz='n')
+                    ! no singular vectors to be computed
+                    itau = 1
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m [tau] + m    [work]
+                    ! cworkspace: prefer m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_zlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    nrwork = ie + m
+                    ! perform bidiagonal svd, compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_dbdsdc( 'U', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 2t (n >> m, jobz='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ldwkvt = m
+                    ! work(ivt) is m by m
+                    il = ivt + ldwkvt*m
+                    if( lwork >= m*n + m*m + 3*m ) then
+                       ! work(il) m by n
+                       ldwrkl = m
+                       chunk = n
+                    else
+                       ! work(il) is m by chunk
+                       ldwrkl = m
+                       chunk = ( lwork - m*m - 3*m ) / m
+                    end if
+                    itau = il + ldwrkl*chunk
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing about above it
+                    call stdlib_zlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_zgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_dbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix work(iu)
+                    ! overwrite work(iu) by the left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by the right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              work( ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(il) by q
+                    ! in a, storing result in work(il) and copying to a
+                    ! cworkspace: need   m*m [vt] + m*m [l]
+                    ! cworkspace: prefer m*m [vt] + m*n [l]
+                    ! rworkspace: need   0
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_zgemm( 'N', 'N', m, blk, m, cone, work( ivt ), m,a( 1, i ), &
+                                 lda, czero, work( il ),ldwrkl )
+                       call stdlib_zlacpy( 'F', m, blk, work( il ), ldwrkl,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 3t (n >> m, jobz='s')
+                    ! m right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    il = 1
+                    ! work(il) is m by m
+                    ldwrkl = m
+                    itau = il + ldwrkl*m
+                    nwork = itau + m
+                    ! compute a=l*q
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    ! copy l to work(il), zeroing out above it
+                    call stdlib_zlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
+                    call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( il+ldwrkl ), ldwrkl )
+                              
+                    ! generate q in a
+                    ! cworkspace: need   m*m [l] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [l] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork-nwork+&
+                              1, ierr )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in work(il)
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_zgebrd( m, m, work( il ), ldwrkl, s, rwork( ie ),work( itauq ), &
+                              work( itaup ), work( nwork ),lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_dbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
+                              u, ldu, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by left singular vectors of l
+                    ! cworkspace: need   m*m [l] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [l] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', m, m, m, work( il ), ldwrkl,work( itaup ), &
+                              vt, ldvt, work( nwork ),lwork-nwork+1, ierr )
+                    ! copy vt to work(il), multiply right singular vectors of l
+                    ! in work(il) by q in a, storing result in vt
+                    ! cworkspace: need   m*m [l]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
+                    call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( il ), ldwrkl,a, lda, czero, &
+                              vt, ldvt )
+                 else if( wntqa ) then
+                    ! path 4t (n >> m, jobz='a')
+                    ! n right singular vectors to be computed in vt and
+                    ! m left singular vectors to be computed in u
+                    ivt = 1
+                    ! work(ivt) is m by m
+                    ldwkvt = m
+                    itau = ivt + ldwkvt*m
+                    nwork = itau + m
+                    ! compute a=l*q, copying result to vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork-nwork+1, &
+                              ierr )
+                    call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    ! generate q in vt
+                    ! cworkspace: need   m*m [vt] + m [tau] + n    [work]
+                    ! cworkspace: prefer m*m [vt] + m [tau] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! produce l in a, zeroing out above it
+                    call stdlib_zlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = itau
+                    itaup = itauq + m
+                    nwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m      [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + 2*m*nb [work]
+                    ! rworkspace: need   m [e]
+                    call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( nwork ), lwork-nwork+1,ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [ru] + m*m [rvt] + bdspac
+                    iru = ie + m
+                    irvt = iru + m*m
+                    nrwork = irvt + m*m
+                    call stdlib_dbdsdc( 'U', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix work(ivt)
+                    ! overwrite work(ivt) by right singular vectors of l
+                    ! cworkspace: need   m*m [vt] + 2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer m*m [vt] + 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', m, m, m, a, lda,work( itaup ), work( ivt ),&
+                               ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                    ! multiply right singular vectors of l in work(ivt) by
+                    ! q in vt, storing result in a
+                    ! cworkspace: need   m*m [vt]
+                    ! rworkspace: need   0
+                    call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( ivt ), ldwkvt,vt, ldvt, &
+                              czero, a, lda )
+                    ! copy right singular vectors of a from a to vt
+                    call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else if( n>=mnthr2 ) then
+                 ! mnthr2 <= n < mnthr1
+                 ! path 5t (n >> m, but not as much as mnthr1)
+                 ! reduce to bidiagonal form without qr decomposition, use
+                 ! stdlib_zungbr and matrix multiplication to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 5tn (n >> m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_dbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    ivt = nwork
+                    ! path 5to (n >> m, jobz='o')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! generate p**h in a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ldwkvt = m
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       nwork = ivt + ldwkvt*n
+                       chunk = n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(irvt,KIND=dp)
+                    ! storing the result in work(ivt), copying to u
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_zlacrm( m, m, u, ldu, rwork( iru ), m, work( ivt ),ldwkvt, rwork( &
+                              nrwork ) )
+                    call stdlib_zlacpy( 'F', m, m, work( ivt ), ldwkvt, u, ldu )
+                    ! multiply rwork(irvt) by p**h in a, storing the
+                    ! result in work(ivt), copying to a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                    ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    do i = 1, n, chunk
+                       blk = min( n-i+1, chunk )
+                       call stdlib_zlarcm( m, blk, rwork( irvt ), m, a( 1, i ), lda,work( ivt ), &
+                                 ldwkvt, rwork( nrwork ) )
+                       call stdlib_zlacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                 
+                    end do
+                 else if( wntqs ) then
+                    ! path 5ts (n >> m, jobz='s')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', m, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=dp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_zlacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=dp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_zlarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 else
+                    ! path 5ta (n >> m, jobz='a')
+                    ! copy a to u, generate q
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( nwork ), lwork-&
+                              nwork+1, ierr )
+                    ! copy a to vt, generate p**h
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   0
+                    call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', n, n, m, vt, ldvt, work( itaup ),work( nwork ), &
+                              lwork-nwork+1, ierr )
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! multiply q in u by realmatrix rwork(iru,KIND=dp), storing the
+                    ! result in a, copying to u
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + 2*m*m [rwork]
+                    call stdlib_zlacrm( m, m, u, ldu, rwork( iru ), m, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, m, a, lda, u, ldu )
+                    ! multiply realmatrix rwork(irvt,KIND=dp) by p**h in vt,
+                    ! storing the result in a, copying to vt
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                    nrwork = iru
+                    call stdlib_zlarcm( m, n, rwork( irvt ), m, vt, ldvt, a, lda,rwork( nrwork ) )
+                              
+                    call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                 end if
+              else
+                 ! n < mnthr2
+                 ! path 6t (n > m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ! use stdlib_zunmbr to compute singular vectors
+                 ie = 1
+                 nrwork = ie + m
+                 itauq = 1
+                 itaup = itauq + m
+                 nwork = itaup + m
+                 ! bidiagonalize a
+                 ! cworkspace: need   2*m [tauq, taup] + n        [work]
+                 ! cworkspace: prefer 2*m [tauq, taup] + (m+n)*nb [work]
+                 ! rworkspace: need   m [e]
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( nwork ), lwork-nwork+1,ierr )
+                 if( wntqn ) then
+                    ! path 6tn (n > m, jobz='n')
+                    ! compute singular values only
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + bdspac
+                    call stdlib_dbdsdc( 'L', 'N', m, s, rwork( ie ), dum,1,dum,1,dum, idum, rwork(&
+                               nrwork ), iwork, info )
+                 else if( wntqo ) then
+                    ! path 6to (n > m, jobz='o')
+                    ldwkvt = m
+                    ivt = nwork
+                    if( lwork >= m*n + 3*m ) then
+                       ! work( ivt ) is m by n
+                       call stdlib_zlaset( 'F', m, n, czero, czero, work( ivt ),ldwkvt )
+                       nwork = ivt + ldwkvt*n
+                    else
+                       ! work( ivt ) is m by chunk
+                       chunk = ( lwork - 3*m ) / m
+                       nwork = ivt + ldwkvt*chunk
+                    end if
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    if( lwork >= m*n + 3*m ) then
+                       ! path 6to-fast
+                       ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix work(ivt)
+                       ! overwrite work(ivt) by right singular vectors of a,
+                       ! copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*n [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt] + m*nb [work]
+                       ! rworkspace: need   m [e] + m*m [rvt]
+                       call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, work( ivt ),ldwkvt )
+                                 
+                       call stdlib_zunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), work( &
+                                 ivt ), ldwkvt,work( nwork ), lwork-nwork+1, ierr )
+                       call stdlib_zlacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
+                    else
+                       ! path 6to-slow
+                       ! generate p**h in a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt] + m    [work]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*m [vt] + m*nb [work]
+                       ! rworkspace: need   0
+                       call stdlib_zungbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
+                                 lwork-nwork+1, ierr )
+                       ! multiply q in a by realmatrix rwork(iru,KIND=dp), storing the
+                       ! result in work(iu), copying to a
+                       ! cworkspace: need   2*m [tauq, taup] + m*m [vt]
+                       ! cworkspace: prefer 2*m [tauq, taup] + m*n [vt]
+                       ! rworkspace: need   m [e] + m*m [rvt] + 2*m*m [rwork]
+                       ! rworkspace: prefer m [e] + m*m [rvt] + 2*m*n [rwork] < m + 5*m*m since n < 2*m here
+                       nrwork = iru
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_zlarcm( m, blk, rwork( irvt ), m, a( 1, i ),lda, work( ivt )&
+                                    , ldwkvt,rwork( nrwork ) )
+                          call stdlib_zlacpy( 'F', m, blk, work( ivt ), ldwkvt,a( 1, i ), lda )
+                                    
+                       end do
+                    end if
+                 else if( wntqs ) then
+                    ! path 6ts (n > m, jobz='s')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_zlaset( 'F', m, n, czero, czero, vt, ldvt )
+                    call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', m, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 else
+                    ! path 6ta (n > m, jobz='a')
+                    ! perform bidiagonal svd, computing left singular vectors
+                    ! of bidiagonal matrix in rwork(iru) and computing right
+                    ! singular vectors of bidiagonal matrix in rwork(irvt)
+                    ! cworkspace: need   0
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru] + bdspac
+                    irvt = nrwork
+                    iru = irvt + m*m
+                    nrwork = iru + m*m
+                    call stdlib_dbdsdc( 'L', 'I', m, s, rwork( ie ), rwork( iru ),m, rwork( irvt )&
+                              , m, dum, idum,rwork( nrwork ), iwork, info )
+                    ! copy realmatrix rwork(iru,KIND=dp) to complex matrix u
+                    ! overwrite u by left singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + m    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + m*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt] + m*m [ru]
+                    call stdlib_zlacp2( 'F', m, m, rwork( iru ), m, u, ldu )
+                    call stdlib_zunmbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                    ! set all of vt to identity matrix
+                    call stdlib_zlaset( 'F', n, n, czero, cone, vt, ldvt )
+                    ! copy realmatrix rwork(irvt,KIND=dp) to complex matrix vt
+                    ! overwrite vt by right singular vectors of a
+                    ! cworkspace: need   2*m [tauq, taup] + n    [work]
+                    ! cworkspace: prefer 2*m [tauq, taup] + n*nb [work]
+                    ! rworkspace: need   m [e] + m*m [rvt]
+                    call stdlib_zlacp2( 'F', m, m, rwork( irvt ), m, vt, ldvt )
+                    call stdlib_zunmbr( 'P', 'R', 'C', n, n, m, a, lda,work( itaup ), vt, ldvt, &
+                              work( nwork ),lwork-nwork+1, ierr )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = stdlib_droundup_lwork( maxwrk )
+           return
+     end subroutine stdlib_zgesdd
+
+     !> ZGESV: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> The LU decomposition with partial pivoting and row interchanges is
+     !> used to factor A as
+     !> A = P * L * U,
+     !> where P is a permutation matrix, L is unit lower triangular, and U is
+     !> upper triangular.  The factored form of A is then used to solve the
+     !> system of equations A * X = B.
+
+     pure subroutine stdlib_zgesv( n, nrhs, a, lda, ipiv, b, ldb, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+        ! =====================================================================
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           if( n<0 ) then
+              info = -1
+           else if( nrhs<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESV ', -info )
+              return
+           end if
+           ! compute the lu factorization of a.
+           call stdlib_zgetrf( n, n, a, lda, ipiv, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zgetrs( 'NO TRANSPOSE', n, nrhs, a, lda, ipiv, b, ldb,info )
+           end if
+           return
+     end subroutine stdlib_zgesv
+
+     !> ZGESVD: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, optionally computing the left and/or right singular
+     !> vectors. The SVD is written
+     !> A = U * SIGMA * conjugate-transpose(V)
+     !> where SIGMA is an M-by-N matrix which is zero except for its
+     !> min(m,n) diagonal elements, U is an M-by-M unitary matrix, and
+     !> V is an N-by-N unitary matrix.  The diagonal elements of SIGMA
+     !> are the singular values of A; they are real and non-negative, and
+     !> are returned in descending order.  The first min(m,n) columns of
+     !> U and V are the left and right singular vectors of A.
+     !> Note that the routine returns V**H, not V.
+
+     subroutine stdlib_zgesvd( jobu, jobvt, m, n, a, lda, s, u, ldu,vt, ldvt, work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobu, jobvt
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldvt, lwork, m, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), s(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: u(ldu,*), vt(ldvt,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wntua, wntuas, wntun, wntuo, wntus, wntva, wntvas, wntvn, wntvo,&
+                      wntvs
+           integer(ilp) :: blk, chunk, i, ie, ierr, ir, irwork, iscl, itau, itaup, itauq, iu, &
+           iwork, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, ncu, ncvt, nru, nrvt, &
+                     wrkbl
+           integer(ilp) :: lwork_zgeqrf, lwork_zungqr_n, lwork_zungqr_m, lwork_zgebrd, &
+                     lwork_zungbr_p, lwork_zungbr_q, lwork_zgelqf, lwork_zunglq_n, lwork_zunglq_m
+           real(dp) :: anrm, bignum, eps, smlnum
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           complex(dp) :: cdum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,min,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           minmn = min( m, n )
+           wntua = stdlib_lsame( jobu, 'A' )
+           wntus = stdlib_lsame( jobu, 'S' )
+           wntuas = wntua .or. wntus
+           wntuo = stdlib_lsame( jobu, 'O' )
+           wntun = stdlib_lsame( jobu, 'N' )
+           wntva = stdlib_lsame( jobvt, 'A' )
+           wntvs = stdlib_lsame( jobvt, 'S' )
+           wntvas = wntva .or. wntvs
+           wntvo = stdlib_lsame( jobvt, 'O' )
+           wntvn = stdlib_lsame( jobvt, 'N' )
+           lquery = ( lwork==-1 )
+           if( .not.( wntua .or. wntus .or. wntuo .or. wntun ) ) then
+              info = -1
+           else if( .not.( wntva .or. wntvs .or. wntvo .or. wntvn ) .or.( wntvo .and. wntuo ) ) &
+                     then
+              info = -2
+           else if( m<0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldu<1 .or. ( wntuas .and. ldu<m ) ) then
+              info = -9
+           else if( ldvt<1 .or. ( wntva .and. ldvt<n ) .or.( wntvs .and. ldvt<minmn ) ) &
+                     then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to
+             ! real workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              minwrk = 1
+              maxwrk = 1
+              if( m>=n .and. minmn>0 ) then
+                 ! space needed for stdlib_zbdsqr is bdspac = 5*n
+                 mnthr = stdlib_ilaenv( 6, 'ZGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_zgeqrf
+                 call stdlib_zgeqrf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_zgeqrf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zungqr
+                 call stdlib_zungqr( m, n, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_zungqr_n = int( cdum(1),KIND=ilp)
+                 call stdlib_zungqr( m, m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_zungqr_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zgebrd
+                 call stdlib_zgebrd( n, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_zgebrd = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zungbr
+                 call stdlib_zungbr( 'P', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_zungbr_p = int( cdum(1),KIND=ilp)
+                 call stdlib_zungbr( 'Q', n, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                 lwork_zungbr_q = int( cdum(1),KIND=ilp)
+                 if( m>=mnthr ) then
+                    if( wntun ) then
+                       ! path 1 (m much larger than n, jobu='n')
+                       maxwrk = n + lwork_zgeqrf
+                       maxwrk = max( maxwrk, 2*n+lwork_zgebrd )
+                       if( wntvo .or. wntvas )maxwrk = max( maxwrk, 2*n+lwork_zungbr_p )
+                       minwrk = 3*n
+                    else if( wntuo .and. wntvn ) then
+                       ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntuo .and. wntvas ) then
+                       ! path 3 (m much larger than n, jobu='o', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_p )
+                       maxwrk = max( n*n+wrkbl, n*n+m*n )
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntus .and. wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_n )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_p )
+                       maxwrk = 2*n*n + wrkbl
+                       minwrk = 2*n + m
+                    else if( wntua .and. wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s' or
+                       ! 'a')
+                       wrkbl = n + lwork_zgeqrf
+                       wrkbl = max( wrkbl, n+lwork_zungqr_m )
+                       wrkbl = max( wrkbl, 2*n+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_q )
+                       wrkbl = max( wrkbl, 2*n+lwork_zungbr_p )
+                       maxwrk = n*n + wrkbl
+                       minwrk = 2*n + m
+                    end if
+                 else
+                    ! path 10 (m at least n, but not much larger)
+                    call stdlib_zgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_zgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*n + lwork_zgebrd
+                    if( wntus .or. wntuo ) then
+                       call stdlib_zungbr( 'Q', m, n, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_zungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_zungbr_q )
+                    end if
+                    if( wntua ) then
+                       call stdlib_zungbr( 'Q', m, m, n, a, lda, cdum(1),cdum(1), -1, ierr )
+                                 
+                       lwork_zungbr_q = int( cdum(1),KIND=ilp)
+                       maxwrk = max( maxwrk, 2*n+lwork_zungbr_q )
+                    end if
+                    if( .not.wntvn ) then
+                       maxwrk = max( maxwrk, 2*n+lwork_zungbr_p )
+                    end if
+                    minwrk = 2*n + m
+                 end if
+              else if( minmn>0 ) then
+                 ! space needed for stdlib_zbdsqr is bdspac = 5*m
+                 mnthr = stdlib_ilaenv( 6, 'ZGESVD', jobu // jobvt, m, n, 0, 0 )
+                 ! compute space needed for stdlib_zgelqf
+                 call stdlib_zgelqf( m, n, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_zgelqf = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zunglq
+                 call stdlib_zunglq( n, n, m, cdum(1), n, cdum(1), cdum(1), -1,ierr )
+                 lwork_zunglq_n = int( cdum(1),KIND=ilp)
+                 call stdlib_zunglq( m, n, m, a, lda, cdum(1), cdum(1), -1, ierr )
+                 lwork_zunglq_m = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zgebrd
+                 call stdlib_zgebrd( m, m, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, ierr )
+                           
+                 lwork_zgebrd = int( cdum(1),KIND=ilp)
+                  ! compute space needed for stdlib_zungbr p
+                 call stdlib_zungbr( 'P', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_zungbr_p = int( cdum(1),KIND=ilp)
+                 ! compute space needed for stdlib_zungbr q
+                 call stdlib_zungbr( 'Q', m, m, m, a, n, cdum(1),cdum(1), -1, ierr )
+                 lwork_zungbr_q = int( cdum(1),KIND=ilp)
+                 if( n>=mnthr ) then
+                    if( wntvn ) then
+                       ! path 1t(n much larger than m, jobvt='n')
+                       maxwrk = m + lwork_zgelqf
+                       maxwrk = max( maxwrk, 2*m+lwork_zgebrd )
+                       if( wntuo .or. wntuas )maxwrk = max( maxwrk, 2*m+lwork_zungbr_q )
+                       minwrk = 3*m
+                    else if( wntvo .and. wntun ) then
+                       ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvo .and. wntuas ) then
+                       ! path 3t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='o')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_q )
+                       maxwrk = max( m*m+wrkbl, m*m+m*n )
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntvs .and. wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='s')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_m )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_q )
+                       maxwrk = 2*m*m + wrkbl
+                       minwrk = 2*m + n
+                    else if( wntva .and. wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                       ! jobvt='a')
+                       wrkbl = m + lwork_zgelqf
+                       wrkbl = max( wrkbl, m+lwork_zunglq_n )
+                       wrkbl = max( wrkbl, 2*m+lwork_zgebrd )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_p )
+                       wrkbl = max( wrkbl, 2*m+lwork_zungbr_q )
+                       maxwrk = m*m + wrkbl
+                       minwrk = 2*m + n
+                    end if
+                 else
+                    ! path 10t(n greater than m, but not much larger)
+                    call stdlib_zgebrd( m, n, a, lda, s, dum(1), cdum(1),cdum(1), cdum(1), -1, &
+                              ierr )
+                    lwork_zgebrd = int( cdum(1),KIND=ilp)
+                    maxwrk = 2*m + lwork_zgebrd
+                    if( wntvs .or. wntvo ) then
+                      ! compute space needed for stdlib_zungbr p
+                      call stdlib_zungbr( 'P', m, n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_zungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_zungbr_p )
+                    end if
+                    if( wntva ) then
+                      call stdlib_zungbr( 'P', n,  n, m, a, n, cdum(1),cdum(1), -1, ierr )
+                      lwork_zungbr_p = int( cdum(1),KIND=ilp)
+                      maxwrk = max( maxwrk, 2*m+lwork_zungbr_p )
+                    end if
+                    if( .not.wntun ) then
+                       maxwrk = max( maxwrk, 2*m+lwork_zungbr_q )
+                    end if
+                    minwrk = 2*m + n
+                 end if
+              end if
+              maxwrk = max( maxwrk, minwrk )
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( m==0 .or. n==0 ) then
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = sqrt( stdlib_dlamch( 'S' ) ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, dum )
+           iscl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              iscl = 1
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, ierr )
+           else if( anrm>bignum ) then
+              iscl = 1
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, ierr )
+           end if
+           if( m>=n ) then
+              ! a has at least as many rows as columns. if a has sufficiently
+              ! more rows than columns, first reduce using the qr
+              ! decomposition (if sufficient workspace available)
+              if( m>=mnthr ) then
+                 if( wntun ) then
+                    ! path 1 (m much larger than n, jobu='n')
+                    ! no left singular vectors to be computed
+                    itau = 1
+                    iwork = itau + n
+                    ! compute a=q*r
+                    ! (cworkspace: need 2*n, prefer n+n*nb)
+                    ! (rworkspace: need 0)
+                    call stdlib_zgeqrf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out below r
+                    if( n > 1 ) then
+                       call stdlib_zlaset( 'L', n-1, n-1, czero, czero, a( 2, 1 ),lda )
+                    end if
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + n
+                    iwork = itaup + n
+                    ! bidiagonalize r in a
+                    ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                    ! (rworkspace: need n)
+                    call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    ncvt = 0
+                    if( wntvo .or. wntvas ) then
+                       ! if right singular vectors desired, generate p'.
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       ncvt = n
+                    end if
+                    irwork = ie + n
+                    ! perform bidiagonal qr iteration, computing right
+                    ! singular vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'U', n, ncvt, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if right singular vectors desired in vt, copy them there
+                    if( wntvas )call stdlib_zlacpy( 'F', n, n, a, lda, vt, ldvt )
+                 else if( wntuo .and. wntvn ) then
+                    ! path 2 (m much larger than n, jobu='o', jobvt='n')
+                    ! n left singular vectors to be overwritten on a and
+                    ! no right singular vectors to be computed
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n, work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n, work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n, work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to work(ir) and zero out below it
+                       call stdlib_zlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
+                       call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_zgebrd( n, n, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing r
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: need 0)
+                       call stdlib_zungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir)
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum, 1,work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_zgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_zlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                       ! (rworkspace: n)
+                       call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing a
+                       ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a
+                       ! (cworkspace: need 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntuo .and. wntvas ) then
+                    ! path 3 (m much larger than n, jobu='o', jobvt='s' or 'a')
+                    ! n left singular vectors to be overwritten on a and
+                    ! n right singular vectors to be computed in vt
+                    if( lwork>=n*n+3*n ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*n ) then
+                          ! work(iu) is lda by n and work(ir) is lda by n
+                          ldwrku = lda
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+n*n ) then
+                          ! work(iu) is lda by n and work(ir) is n by n
+                          ldwrku = lda
+                          ldwrkr = n
+                       else
+                          ! work(iu) is ldwrku by n and work(ir) is n by n
+                          ldwrku = ( lwork-n*n ) / n
+                          ldwrkr = n
+                       end if
+                       itau = ir + ldwrkr*n
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_zlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt, copying result to work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                       ! (rworkspace: need n)
+                       call stdlib_zgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_zlacpy( 'L', n, n, vt, ldvt, work( ir ), ldwrkr )
+                       ! generate left vectors bidiagonalizing r in work(ir)
+                       ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need n*n+3*n-1, prefer n*n+2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of r in work(ir) and computing right
+                       ! singular vectors of r in vt
+                       ! (cworkspace: need n*n)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( ir ), &
+                                 ldwrkr, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply q in a by left singular vectors of r in
+                       ! work(ir), storing result in work(iu) and copying to a
+                       ! (cworkspace: need n*n+n, prefer n*n+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, m, ldwrku
+                          chunk = min( m-i+1, ldwrku )
+                          call stdlib_zgemm( 'N', 'N', chunk, n, n, cone, a( i, 1 ),lda, work( ir &
+                                    ), ldwrkr, czero,work( iu ), ldwrku )
+                          call stdlib_zlacpy( 'F', chunk, n, work( iu ), ldwrku,a( i, 1 ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + n
+                       ! compute a=q*r
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy r to vt, zeroing out below it
+                       call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                       if( n>1 )call stdlib_zlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), ldvt )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need 2*n, prefer n+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + n
+                       iwork = itaup + n
+                       ! bidiagonalize r in vt
+                       ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                       ! (rworkspace: n)
+                       call stdlib_zgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply q in a by left vectors bidiagonalizing r
+                       ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), a, lda,&
+                                  work( iwork ),lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing r in vt
+                       ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + n
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in a and computing right
+                       ! singular vectors of a in vt
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, cdum,&
+                                  1, rwork( irwork ),info )
+                    end if
+                 else if( wntus ) then
+                    if( wntvn ) then
+                       ! path 4 (m much larger than n, jobu='s', jobvt='n')
+                       ! n left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left vectors bidiagonalizing r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(ir), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, a, lda,work( ir ), ldwrkr, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_zlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 5 (m much larger than n, jobu='s', jobvt='o')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_zgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                          ! copy right singular vectors of r to a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_zlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left vectors bidiagonalizing r
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing r in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 6 (m much larger than n, jobu='s', jobvt='s'
+                               ! or 'a')
+                       ! n left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+3*n ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ! generate q in a
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in a by left singular vectors of r in
+                          ! work(iu), storing result in u
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, a, lda,work( iu ), ldwrku, &
+                                    czero, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, n, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to vt, zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_zlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntua ) then
+                    if( wntvn ) then
+                       ! path 7 (m much larger than n, jobu='a', jobvt='n')
+                       ! m left singular vectors to be computed in u and
+                       ! no right singular vectors to be computed
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(ir) is lda by n
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is n by n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! copy r to work(ir), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( ir ),ldwrkr )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( ir+1 ), ldwrkr )
+                                    
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(ir)
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, 0, n, 0, s, rwork( ie ), cdum,1, work( ir ),&
+                                     ldwrkr, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(ir), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, u, ldu,work( ir ), ldwrkr, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_zlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, 0, m, 0, s, rwork( ie ), cdum,1, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvo ) then
+                       ! path 8 (m much larger than n, jobu='a', jobvt='o')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be overwritten on a
+                       if( lwork>=2*n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*n ) then
+                             ! work(iu) is lda by n and work(ir) is lda by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+n )*n ) then
+                             ! work(iu) is lda by n and work(ir) is n by n
+                             ldwrku = lda
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          else
+                             ! work(iu) is n by n and work(ir) is n by n
+                             ldwrku = n
+                             ir = iu + ldwrku*n
+                             ldwrkr = n
+                          end if
+                          itau = ir + ldwrkr*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n*n+2*n, prefer 2*n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need 2*n*n+n+m, prefer 2*n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*n*n+3*n,
+                                       ! prefer 2*n*n+2*n+2*n*nb)
+                          ! (rworkspace: need   n)
+                          call stdlib_zgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', n, n, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need 2*n*n+3*n, prefer 2*n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   2*n*n+3*n-1,
+                                       ! prefer 2*n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in work(ir)
+                          ! (cworkspace: need 2*n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, n, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    work( iu ),ldwrku, cdum, 1, rwork( irwork ),info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                          ! copy right singular vectors of r from work(ir) to a
+                          call stdlib_zlacpy( 'F', n, n, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! zero out below r in a
+                          if( n > 1 ) then
+                             call stdlib_zlaset( 'L', n-1, n-1, czero, czero,a( 2, 1 ), lda )
+                                       
+                          end if
+                          ! bidiagonalize r in a
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in a
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, a, lda,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in a
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, m, 0, s, rwork( ie ), a,lda, u, ldu, &
+                                    cdum, 1, rwork( irwork ),info )
+                       end if
+                    else if( wntvas ) then
+                       ! path 9 (m much larger than n, jobu='a', jobvt='s'
+                               ! or 'a')
+                       ! m left singular vectors to be computed in u and
+                       ! n right singular vectors to be computed in vt
+                       if( lwork>=n*n+max( n+m, 3*n ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*n ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is n by n
+                             ldwrku = n
+                          end if
+                          itau = iu + ldwrku*n
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need n*n+2*n, prefer n*n+n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n*n+n+m, prefer n*n+n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r to work(iu), zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'L', n-1, n-1, czero, czero,work( iu+1 ), ldwrku )
+                                    
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in work(iu), copying result to vt
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', n, n, work( iu ), ldwrku, vt,ldvt )
+                          ! generate left bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need n*n+3*n, prefer n*n+2*n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', n, n, n, work( iu ), ldwrku,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need   n*n+3*n-1,
+                                       ! prefer n*n+2*n+(n-1)*nb)
+                          ! (rworkspace: need   0)
+                          call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of r in work(iu) and computing
+                          ! right singular vectors of r in vt
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, n, 0, s, rwork( ie ), vt,ldvt, work( iu )&
+                                    , ldwrku, cdum, 1,rwork( irwork ), info )
+                          ! multiply q in u by left singular vectors of r in
+                          ! work(iu), storing result in a
+                          ! (cworkspace: need n*n)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, n, cone, u, ldu,work( iu ), ldwrku, &
+                                    czero, a, lda )
+                          ! copy left singular vectors of a from a to u
+                          call stdlib_zlacpy( 'F', m, n, a, lda, u, ldu )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + n
+                          ! compute a=q*r, copying result to u
+                          ! (cworkspace: need 2*n, prefer n+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgeqrf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                          ! generate q in u
+                          ! (cworkspace: need n+m, prefer n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungqr( m, m, n, u, ldu, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy r from a to vt, zeroing out below it
+                          call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                          if( n>1 )call stdlib_zlaset( 'L', n-1, n-1, czero, czero,vt( 2, 1 ), &
+                                    ldvt )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + n
+                          iwork = itaup + n
+                          ! bidiagonalize r in vt
+                          ! (cworkspace: need 3*n, prefer 2*n+2*n*nb)
+                          ! (rworkspace: need n)
+                          call stdlib_zgebrd( n, n, vt, ldvt, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply q in u by left bidiagonalizing vectors
+                          ! in vt
+                          ! (cworkspace: need 2*n+m, prefer 2*n+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'Q', 'R', 'N', m, n, n, vt, ldvt,work( itauq ), u, &
+                                    ldu, work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in vt
+                          ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ),&
+                                     lwork-iwork+1, ierr )
+                          irwork = ie + n
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', n, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! m < mnthr
+                 ! path 10 (m at least n, but not much larger)
+                 ! reduce to bidiagonal form without qr decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + n
+                 iwork = itaup + n
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*n+m, prefer 2*n+(m+n)*nb)
+                 ! (rworkspace: need n)
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 2*n+ncu, prefer 2*n+ncu*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zlacpy( 'L', m, n, a, lda, u, ldu )
+                    if( wntus )ncu = n
+                    if( wntua )ncu = m
+                    call stdlib_zungbr( 'Q', m, ncu, n, u, ldu, work( itauq ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zlacpy( 'U', n, n, a, lda, vt, ldvt )
+                    call stdlib_zungbr( 'P', n, n, n, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n, prefer 2*n+n*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zungbr( 'Q', m, n, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*n-1, prefer 2*n+(n-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zungbr( 'P', n, n, n, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + n
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'U', n, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           else
+              ! a has more columns than rows. if a has sufficiently more
+              ! columns than rows, first reduce using the lq decomposition (if
+              ! sufficient workspace available)
+              if( n>=mnthr ) then
+                 if( wntvn ) then
+                    ! path 1t(n much larger than m, jobvt='n')
+                    ! no right singular vectors to be computed
+                    itau = 1
+                    iwork = itau + m
+                    ! compute a=l*q
+                    ! (cworkspace: need 2*m, prefer m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zgelqf( m, n, a, lda, work( itau ), work( iwork ),lwork-iwork+1, &
+                              ierr )
+                    ! zero out above l
+                    call stdlib_zlaset( 'U', m-1, m-1, czero, czero, a( 1, 2 ),lda )
+                    ie = 1
+                    itauq = 1
+                    itaup = itauq + m
+                    iwork = itaup + m
+                    ! bidiagonalize l in a
+                    ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                    ! (rworkspace: need m)
+                    call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ), work( itauq ),work( itaup ),&
+                               work( iwork ), lwork-iwork+1,ierr )
+                    if( wntuo .or. wntuas ) then
+                       ! if left singular vectors desired, generate q
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                    end if
+                    irwork = ie + m
+                    nru = 0
+                    if( wntuo .or. wntuas )nru = m
+                    ! perform bidiagonal qr iteration, computing left singular
+                    ! vectors of a in a if desired
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'U', m, 0, nru, 0, s, rwork( ie ), cdum, 1,a, lda, cdum, &
+                              1, rwork( irwork ), info )
+                    ! if left singular vectors desired in u, copy them there
+                    if( wntuas )call stdlib_zlacpy( 'F', m, m, a, lda, u, ldu )
+                 else if( wntvo .and. wntun ) then
+                    ! path 2t(n much larger than m, jobu='n', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! no left singular vectors to be computed
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to work(ir) and zero out above it
+                       call stdlib_zlacpy( 'L', m, m, a, lda, work( ir ), ldwrkr )
+                       call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), ldwrkr )
+                                 
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_zgebrd( m, m, work( ir ), ldwrkr, s, rwork( ie ),work( itauq ),&
+                                  work( itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing l
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                 cdum, 1, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n)
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_zgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_zlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       ie = 1
+                       itauq = 1
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! generate right vectors bidiagonalizing a
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'L', m, n, 0, 0, s, rwork( ie ), a, lda,cdum, 1, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvo .and. wntuas ) then
+                    ! path 3t(n much larger than m, jobu='s' or 'a', jobvt='o')
+                    ! m right singular vectors to be overwritten on a and
+                    ! m left singular vectors to be computed in u
+                    if( lwork>=m*m+3*m ) then
+                       ! sufficient workspace for a fast algorithm
+                       ir = 1
+                       if( lwork>=max( wrkbl, lda*n )+lda*m ) then
+                          ! work(iu) is lda by n and work(ir) is lda by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = lda
+                       else if( lwork>=max( wrkbl, lda*n )+m*m ) then
+                          ! work(iu) is lda by n and work(ir) is m by m
+                          ldwrku = lda
+                          chunk = n
+                          ldwrkr = m
+                       else
+                          ! work(iu) is m by chunk and work(ir) is m by m
+                          ldwrku = m
+                          chunk = ( lwork-m*m ) / m
+                          ldwrkr = m
+                       end if
+                       itau = ir + ldwrkr*m
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing about above it
+                       call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_zlaset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u, copying result to work(ir)
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_zgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       call stdlib_zlacpy( 'U', m, m, u, ldu, work( ir ), ldwrkr )
+                       ! generate right vectors bidiagonalizing l in work(ir)
+                       ! (cworkspace: need m*m+3*m-1, prefer m*m+2*m+(m-1)*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), work( &
+                                 iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of l in u, and computing right
+                       ! singular vectors of l in work(ir)
+                       ! (cworkspace: need m*m)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( ir ), ldwrkr, u, &
+                                 ldu, cdum, 1,rwork( irwork ), info )
+                       iu = itauq
+                       ! multiply right singular vectors of l in work(ir) by q
+                       ! in a, storing result in work(iu) and copying to a
+                       ! (cworkspace: need m*m+m, prefer m*m+m*n))
+                       ! (rworkspace: 0)
+                       do i = 1, n, chunk
+                          blk = min( n-i+1, chunk )
+                          call stdlib_zgemm( 'N', 'N', m, blk, m, cone, work( ir ),ldwrkr, a( 1, &
+                                    i ), lda, czero,work( iu ), ldwrku )
+                          call stdlib_zlacpy( 'F', m, blk, work( iu ), ldwrku,a( 1, i ), lda )
+                                    
+                       end do
+                    else
+                       ! insufficient workspace for a fast algorithm
+                       itau = 1
+                       iwork = itau + m
+                       ! compute a=l*q
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-iwork+&
+                                 1, ierr )
+                       ! copy l to u, zeroing out above it
+                       call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                       call stdlib_zlaset( 'U', m-1, m-1, czero, czero, u( 1, 2 ),ldu )
+                       ! generate q in a
+                       ! (cworkspace: need 2*m, prefer m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                 iwork+1, ierr )
+                       ie = 1
+                       itauq = itau
+                       itaup = itauq + m
+                       iwork = itaup + m
+                       ! bidiagonalize l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                       ! (rworkspace: need m)
+                       call stdlib_zgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                 itaup ),work( iwork ), lwork-iwork+1, ierr )
+                       ! multiply right vectors bidiagonalizing l by q in a
+                       ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), a, lda, &
+                                 work( iwork ),lwork-iwork+1, ierr )
+                       ! generate left vectors bidiagonalizing l in u
+                       ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                       ! (rworkspace: 0)
+                       call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                 lwork-iwork+1, ierr )
+                       irwork = ie + m
+                       ! perform bidiagonal qr iteration, computing left
+                       ! singular vectors of a in u and computing right
+                       ! singular vectors of a in a
+                       ! (cworkspace: 0)
+                       ! (rworkspace: need bdspac)
+                       call stdlib_zbdsqr( 'U', m, n, m, 0, s, rwork( ie ), a, lda,u, ldu, cdum, &
+                                 1, rwork( irwork ), info )
+                    end if
+                 else if( wntvs ) then
+                    if( wntun ) then
+                       ! path 4t(n much larger than m, jobu='n', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right vectors bidiagonalizing l in
+                          ! work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy result to vt
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 5t(n much larger than m, jobu='o', jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out below it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_zgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                          ! copy left singular vectors of l to a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right vectors bidiagonalizing l by q in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors of l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 6t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='s')
+                       ! m right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+3*m ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by n
+                             ldwrku = lda
+                          else
+                             ! work(iu) is lda by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ! generate q in a
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in a, storing result in vt
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, a, lda, &
+                                    czero, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( m, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 else if( wntva ) then
+                    if( wntun ) then
+                       ! path 7t(n much larger than m, jobu='n', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! no left singular vectors to be computed
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          ir = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(ir) is lda by m
+                             ldwrkr = lda
+                          else
+                             ! work(ir) is m by m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! copy l to work(ir), zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( ir ),ldwrkr )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( ir+ldwrkr ), &
+                                    ldwrkr )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(ir)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, work( ir ), ldwrkr, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          ! generate right bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need   m*m+3*m-1,
+                                       ! prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( ir ), ldwrkr,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of l in work(ir)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, 0, 0, s, rwork( ie ),work( ir ), ldwrkr, &
+                                    cdum, 1, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(ir) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( ir ),ldwrkr, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, 0, 0, s, rwork( ie ), vt,ldvt, cdum, 1, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuo ) then
+                       ! path 8t(n much larger than m, jobu='o', jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be overwritten on a
+                       if( lwork>=2*m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+2*lda*m ) then
+                             ! work(iu) is lda by m and work(ir) is lda by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = lda
+                          else if( lwork>=wrkbl+( lda+m )*m ) then
+                             ! work(iu) is lda by m and work(ir) is m by m
+                             ldwrku = lda
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          else
+                             ! work(iu) is m by m and work(ir) is m by m
+                             ldwrku = m
+                             ir = iu + ldwrku*m
+                             ldwrkr = m
+                          end if
+                          itau = ir + ldwrkr*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m*m+2*m, prefer 2*m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need 2*m*m+m+n, prefer 2*m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to
+                          ! work(ir)
+                          ! (cworkspace: need   2*m*m+3*m,
+                                       ! prefer 2*m*m+2*m+2*m*nb)
+                          ! (rworkspace: need   m)
+                          call stdlib_zgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, m, work( iu ), ldwrku,work( ir ), ldwrkr )
+                                    
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need   2*m*m+3*m-1,
+                                       ! prefer 2*m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in work(ir)
+                          ! (cworkspace: need 2*m*m+3*m, prefer 2*m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, work( ir ), ldwrkr,work( itauq ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in work(ir) and computing
+                          ! right singular vectors of l in work(iu)
+                          ! (cworkspace: need 2*m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    work( ir ),ldwrkr, cdum, 1, rwork( irwork ),info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                          ! copy left singular vectors of a from work(ir) to a
+                          call stdlib_zlacpy( 'F', m, m, work( ir ), ldwrkr, a,lda )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! zero out above l in a
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,a( 1, 2 ), lda )
+                          ! bidiagonalize l in a
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, a, lda, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in a by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, a, lda,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in a
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, a, lda, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in a and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    else if( wntuas ) then
+                       ! path 9t(n much larger than m, jobu='s' or 'a',
+                               ! jobvt='a')
+                       ! n right singular vectors to be computed in vt and
+                       ! m left singular vectors to be computed in u
+                       if( lwork>=m*m+max( n+m, 3*m ) ) then
+                          ! sufficient workspace for a fast algorithm
+                          iu = 1
+                          if( lwork>=wrkbl+lda*m ) then
+                             ! work(iu) is lda by m
+                             ldwrku = lda
+                          else
+                             ! work(iu) is m by m
+                             ldwrku = m
+                          end if
+                          itau = iu + ldwrku*m
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need m*m+2*m, prefer m*m+m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m*m+m+n, prefer m*m+m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to work(iu), zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, work( iu ),ldwrku )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,work( iu+ldwrku ), &
+                                    ldwrku )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in work(iu), copying result to u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, work( iu ), ldwrku, s,rwork( ie ), work( &
+                                    itauq ),work( itaup ), work( iwork ),lwork-iwork+1, ierr )
+                          call stdlib_zlacpy( 'L', m, m, work( iu ), ldwrku, u,ldu )
+                          ! generate right bidiagonalizing vectors in work(iu)
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+(m-1)*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'P', m, m, m, work( iu ), ldwrku,work( itaup ), &
+                                    work( iwork ),lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need m*m+3*m, prefer m*m+2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of l in u and computing right
+                          ! singular vectors of l in work(iu)
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, m, m, 0, s, rwork( ie ),work( iu ), ldwrku, &
+                                    u, ldu, cdum, 1,rwork( irwork ), info )
+                          ! multiply right singular vectors of l in work(iu) by
+                          ! q in vt, storing result in a
+                          ! (cworkspace: need m*m)
+                          ! (rworkspace: 0)
+                          call stdlib_zgemm( 'N', 'N', m, n, m, cone, work( iu ),ldwrku, vt, ldvt,&
+                                     czero, a, lda )
+                          ! copy right singular vectors of a from a to vt
+                          call stdlib_zlacpy( 'F', m, n, a, lda, vt, ldvt )
+                       else
+                          ! insufficient workspace for a fast algorithm
+                          itau = 1
+                          iwork = itau + m
+                          ! compute a=l*q, copying result to vt
+                          ! (cworkspace: need 2*m, prefer m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zgelqf( m, n, a, lda, work( itau ),work( iwork ), lwork-&
+                                    iwork+1, ierr )
+                          call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                          ! generate q in vt
+                          ! (cworkspace: need m+n, prefer m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunglq( n, n, m, vt, ldvt, work( itau ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          ! copy l to u, zeroing out above it
+                          call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                          call stdlib_zlaset( 'U', m-1, m-1, czero, czero,u( 1, 2 ), ldu )
+                          ie = 1
+                          itauq = itau
+                          itaup = itauq + m
+                          iwork = itaup + m
+                          ! bidiagonalize l in u
+                          ! (cworkspace: need 3*m, prefer 2*m+2*m*nb)
+                          ! (rworkspace: need m)
+                          call stdlib_zgebrd( m, m, u, ldu, s, rwork( ie ),work( itauq ), work( &
+                                    itaup ),work( iwork ), lwork-iwork+1, ierr )
+                          ! multiply right bidiagonalizing vectors in u by q
+                          ! in vt
+                          ! (cworkspace: need 2*m+n, prefer 2*m+n*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zunmbr( 'P', 'L', 'C', m, n, m, u, ldu,work( itaup ), vt, &
+                                    ldvt,work( iwork ), lwork-iwork+1, ierr )
+                          ! generate left bidiagonalizing vectors in u
+                          ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                          ! (rworkspace: 0)
+                          call stdlib_zungbr( 'Q', m, m, m, u, ldu, work( itauq ),work( iwork ), &
+                                    lwork-iwork+1, ierr )
+                          irwork = ie + m
+                          ! perform bidiagonal qr iteration, computing left
+                          ! singular vectors of a in u and computing right
+                          ! singular vectors of a in vt
+                          ! (cworkspace: 0)
+                          ! (rworkspace: need bdspac)
+                          call stdlib_zbdsqr( 'U', m, n, m, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                                    cdum, 1,rwork( irwork ), info )
+                       end if
+                    end if
+                 end if
+              else
+                 ! n < mnthr
+                 ! path 10t(n greater than m, but not much larger)
+                 ! reduce to bidiagonal form without lq decomposition
+                 ie = 1
+                 itauq = 1
+                 itaup = itauq + m
+                 iwork = itaup + m
+                 ! bidiagonalize a
+                 ! (cworkspace: need 2*m+n, prefer 2*m+(m+n)*nb)
+                 ! (rworkspace: m)
+                 call stdlib_zgebrd( m, n, a, lda, s, rwork( ie ), work( itauq ),work( itaup ), &
+                           work( iwork ), lwork-iwork+1,ierr )
+                 if( wntuas ) then
+                    ! if left singular vectors desired in u, copy result to u
+                    ! and generate left bidiagonalizing vectors in u
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zlacpy( 'L', m, m, a, lda, u, ldu )
+                    call stdlib_zungbr( 'Q', m, m, n, u, ldu, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvas ) then
+                    ! if right singular vectors desired in vt, copy result to
+                    ! vt and generate right bidiagonalizing vectors in vt
+                    ! (cworkspace: need 2*m+nrvt, prefer 2*m+nrvt*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zlacpy( 'U', m, n, a, lda, vt, ldvt )
+                    if( wntva )nrvt = n
+                    if( wntvs )nrvt = m
+                    call stdlib_zungbr( 'P', nrvt, n, m, vt, ldvt, work( itaup ),work( iwork ), &
+                              lwork-iwork+1, ierr )
+                 end if
+                 if( wntuo ) then
+                    ! if left singular vectors desired in a, generate left
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m-1, prefer 2*m+(m-1)*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zungbr( 'Q', m, m, n, a, lda, work( itauq ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 if( wntvo ) then
+                    ! if right singular vectors desired in a, generate right
+                    ! bidiagonalizing vectors in a
+                    ! (cworkspace: need 3*m, prefer 2*m+m*nb)
+                    ! (rworkspace: 0)
+                    call stdlib_zungbr( 'P', m, n, m, a, lda, work( itaup ),work( iwork ), lwork-&
+                              iwork+1, ierr )
+                 end if
+                 irwork = ie + m
+                 if( wntuas .or. wntuo )nru = m
+                 if( wntun )nru = 0
+                 if( wntvas .or. wntvo )ncvt = n
+                 if( wntvn )ncvt = 0
+                 if( ( .not.wntuo ) .and. ( .not.wntvo ) ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, u, ldu, &
+                              cdum, 1, rwork( irwork ),info )
+                 else if( ( .not.wntuo ) .and. wntvo ) then
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in u and computing right singular
+                    ! vectors in a
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), a,lda, u, ldu, cdum,&
+                               1, rwork( irwork ),info )
+                 else
+                    ! perform bidiagonal qr iteration, if desired, computing
+                    ! left singular vectors in a and computing right singular
+                    ! vectors in vt
+                    ! (cworkspace: 0)
+                    ! (rworkspace: need bdspac)
+                    call stdlib_zbdsqr( 'L', m, ncvt, nru, 0, s, rwork( ie ), vt,ldvt, a, lda, &
+                              cdum, 1, rwork( irwork ),info )
+                 end if
+              end if
+           end if
+           ! undo scaling if necessary
+           if( iscl==1 ) then
+              if( anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm>bignum )call stdlib_dlascl( 'G', 0, 0, bignum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+              if( anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn, 1, s, minmn,&
+                        ierr )
+              if( info/=0 .and. anrm<smlnum )call stdlib_dlascl( 'G', 0, 0, smlnum, anrm, minmn-1,&
+                         1,rwork( ie ), minmn, ierr )
+           end if
+           ! return optimal workspace in work(1)
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgesvd
+
+     !> ZCGESVDQ computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     subroutine stdlib_zgesvdq( joba, jobp, jobr, jobu, jobv, m, n, a, lda,s, u, ldu, v, ldv, &
+               numrank, iwork, liwork,cwork, lcwork, rwork, lrwork, info )
+           ! Scalar Arguments 
+           character, intent(in) :: joba, jobp, jobr, jobu, jobv
+           integer(ilp), intent(in) :: m, n, lda, ldu, ldv, liwork, lrwork
+           integer(ilp), intent(out) :: numrank, info
+           integer(ilp), intent(inout) :: lcwork
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: u(ldu,*), v(ldv,*), cwork(*)
+           real(dp), intent(out) :: s(*), rwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           integer(ilp) :: ierr, nr, n1, optratio, p, q
+           integer(ilp) :: lwcon, lwqp3, lwrk_zgelqf, lwrk_zgesvd, lwrk_zgesvd2, lwrk_zgeqp3, &
+           lwrk_zgeqrf, lwrk_zunmlq, lwrk_zunmqr, lwrk_zunmqr2, lwlqf, lwqrf, lwsvd, lwsvd2, &
+                     lwunq, lwunq2, lwunlq, minwrk, minwrk2, optwrk, optwrk2, iminwrk, rminwrk
+           logical(lk) :: accla, acclm, acclh, ascaled, conda, dntwu, dntwv, lquery, lsvc0, lsvec,&
+                      rowprm, rsvec, rtrans, wntua, wntuf, wntur, wntus, wntva, wntvr
+           real(dp) :: big, epsln, rtmp, sconda, sfmin
+           complex(dp) :: ctmp
+           ! Local Arrays
+           complex(dp) :: cdummy(1)
+           real(dp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,min,real,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           wntus  = stdlib_lsame( jobu, 'S' ) .or. stdlib_lsame( jobu, 'U' )
+           wntur  = stdlib_lsame( jobu, 'R' )
+           wntua  = stdlib_lsame( jobu, 'A' )
+           wntuf  = stdlib_lsame( jobu, 'F' )
+           lsvc0  = wntus .or. wntur .or. wntua
+           lsvec  = lsvc0 .or. wntuf
+           dntwu  = stdlib_lsame( jobu, 'N' )
+           wntvr  = stdlib_lsame( jobv, 'R' )
+           wntva  = stdlib_lsame( jobv, 'A' ) .or. stdlib_lsame( jobv, 'V' )
+           rsvec  = wntvr .or. wntva
+           dntwv  = stdlib_lsame( jobv, 'N' )
+           accla  = stdlib_lsame( joba, 'A' )
+           acclm  = stdlib_lsame( joba, 'M' )
+           conda  = stdlib_lsame( joba, 'E' )
+           acclh  = stdlib_lsame( joba, 'H' ) .or. conda
+           rowprm = stdlib_lsame( jobp, 'P' )
+           rtrans = stdlib_lsame( jobr, 'T' )
+           if ( rowprm ) then
+              iminwrk = max( 1, n + m - 1 )
+              rminwrk = max( 2, m, 5*n )
+           else
+              iminwrk = max( 1, n )
+              rminwrk = max( 2, 5*n )
+           end if
+           lquery = (liwork == -1 .or. lcwork == -1 .or. lrwork == -1)
+           info  = 0
+           if ( .not. ( accla .or. acclm .or. acclh ) ) then
+              info = -1
+           else if ( .not.( rowprm .or. stdlib_lsame( jobp, 'N' ) ) ) then
+               info = -2
+           else if ( .not.( rtrans .or. stdlib_lsame( jobr, 'N' ) ) ) then
+               info = -3
+           else if ( .not.( lsvec .or. dntwu ) ) then
+              info = -4
+           else if ( wntur .and. wntva ) then
+              info = -5
+           else if ( .not.( rsvec .or. dntwv )) then
+              info = -5
+           else if ( m<0 ) then
+              info = -6
+           else if ( ( n<0 ) .or. ( n>m ) ) then
+              info = -7
+           else if ( lda<max( 1, m ) ) then
+              info = -9
+           else if ( ldu<1 .or. ( lsvc0 .and. ldu<m ) .or.( wntuf .and. ldu<n ) ) then
+              info = -12
+           else if ( ldv<1 .or. ( rsvec .and. ldv<n ) .or.( conda .and. ldv<n ) ) then
+              info = -14
+           else if ( liwork < iminwrk .and. .not. lquery ) then
+              info = -17
+           end if
+           if ( info == 0 ) then
+              ! Compute The Minimal And The Optimal Workspace Lengths
+              ! [[the expressions for computing the minimal and the optimal
+              ! values of lcwork are written with a lot of redundancy and
+              ! can be simplified. however, this detailed form is easier for
+              ! maintenance and modifications of the code.]]
+              ! Minimal Workspace Length For Stdlib_Zgeqp3 Of An M X N Matrix
+              lwqp3 = n+1
+              ! Minimal Workspace Length For Stdlib_Zunmqr To Build Left Singular Vectors
+              if ( wntus .or. wntur ) then
+                  lwunq  = max( n  , 1 )
+              else if ( wntua ) then
+                  lwunq = max( m , 1 )
+              end if
+              ! Minimal Workspace Length For Stdlib_Zpocon Of An N X N Matrix
+              lwcon = 2 * n
+              ! Stdlib_Zgesvd Of An N X N Matrix
+              lwsvd = max( 3 * n, 1 )
+              if ( lquery ) then
+                  call stdlib_zgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                            
+                  lwrk_zgeqp3 = int( cdummy(1),KIND=ilp)
+                  if ( wntus .or. wntur ) then
+                      call stdlib_zunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmqr = int( cdummy(1),KIND=ilp)
+                  else if ( wntua ) then
+                      call stdlib_zunmqr( 'L', 'N', m, m, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmqr = int( cdummy(1),KIND=ilp)
+                  else
+                      lwrk_zunmqr = 0
+                  end if
+              end if
+              minwrk = 2
+              optwrk = 2
+              if ( .not. (lsvec .or. rsvec ) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If
+                  ! only the singular values are requested
+                  if ( conda ) then
+                     minwrk = max( n+lwqp3, lwcon, lwsvd )
+                  else
+                     minwrk = max( n+lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_zgesvd( 'N', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                rdummy, ierr )
+                      lwrk_zgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                         optwrk = max( n+lwrk_zgeqp3, n+lwcon, lwrk_zgesvd )
+                      else
+                         optwrk = max( n+lwrk_zgeqp3, lwrk_zgesvd )
+                      end if
+                  end if
+              else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd, lwunq )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd, lwunq )
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_zgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     else
+                        call stdlib_zgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                     end if
+                     lwrk_zgesvd = int( cdummy(1),KIND=ilp)
+                     if ( conda ) then
+                         optwrk = n + max( lwrk_zgeqp3, lwcon, lwrk_zgesvd,lwrk_zunmqr )
+                     else
+                         optwrk = n + max( lwrk_zgeqp3, lwrk_zgesvd,lwrk_zunmqr )
+                     end if
+                  end if
+              else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( conda ) then
+                      minwrk = n + max( lwqp3, lwcon, lwsvd )
+                  else
+                      minwrk = n + max( lwqp3, lwsvd )
+                  end if
+                  if ( lquery ) then
+                      if ( rtrans ) then
+                          call stdlib_zgesvd( 'O', 'N', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      else
+                          call stdlib_zgesvd( 'N', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -&
+                                    1, rdummy, ierr )
+                      end if
+                      lwrk_zgesvd = int( cdummy(1),KIND=ilp)
+                      if ( conda ) then
+                          optwrk = n + max( lwrk_zgeqp3, lwcon, lwrk_zgesvd )
+                      else
+                          optwrk = n + max( lwrk_zgeqp3, lwrk_zgesvd )
+                      end if
+                  end if
+              else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( rtrans ) then
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n x n/2 stdlib_zgeqrf
+                         lwqrf  = max( n/2, 1 )
+                         ! .. minimal workspace length for n/2 x n/2 stdlib_zgesvd
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunq2 = max( n, 1 )
+                         minwrk2 = max( lwqp3, n/2+lwqrf, n/2+lwsvd2,n/2+lwunq2, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  else
+                      minwrk = max( lwqp3, lwsvd, lwunq )
+                      if ( conda ) minwrk = max( minwrk, lwcon )
+                      minwrk = minwrk + n
+                      if ( wntva ) then
+                         ! .. minimal workspace length for n/2 x n stdlib_zgelqf
+                         lwlqf  = max( n/2, 1 )
+                         lwsvd2 = max( 3 * (n/2), 1 )
+                         lwunlq = max( n , 1 )
+                         minwrk2 = max( lwqp3, n/2+lwlqf, n/2+lwsvd2,n/2+lwunlq, lwunq )
+                         if ( conda ) minwrk2 = max( minwrk2, lwcon )
+                         minwrk2 = n + minwrk2
+                         minwrk = max( minwrk, minwrk2 )
+                      end if
+                  end if
+                  if ( lquery ) then
+                     if ( rtrans ) then
+                        call stdlib_zgesvd( 'O', 'A', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_zgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_zgeqp3,lwrk_zgesvd,lwrk_zunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                            call stdlib_zgeqrf(n,n/2,u,ldu,cdummy,cdummy,-1,ierr)
+                            lwrk_zgeqrf = int( cdummy(1),KIND=ilp)
+                            call stdlib_zgesvd( 'S', 'O', n/2,n/2, v,ldv, s, u,ldu,v, ldv, cdummy,&
+                                       -1, rdummy, ierr )
+                            lwrk_zgesvd2 = int( cdummy(1),KIND=ilp)
+                            call stdlib_zunmqr( 'R', 'C', n, n, n/2, u, ldu, cdummy,v, ldv, &
+                                      cdummy, -1, ierr )
+                            lwrk_zunmqr2 = int( cdummy(1),KIND=ilp)
+                            optwrk2 = max( lwrk_zgeqp3, n/2+lwrk_zgeqrf,n/2+lwrk_zgesvd2, n/2+&
+                                      lwrk_zunmqr2 )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     else
+                        call stdlib_zgesvd( 'S', 'O', n, n, a, lda, s, u, ldu,v, ldv, cdummy, -1, &
+                                  rdummy, ierr )
+                        lwrk_zgesvd = int( cdummy(1),KIND=ilp)
+                        optwrk = max(lwrk_zgeqp3,lwrk_zgesvd,lwrk_zunmqr)
+                        if ( conda ) optwrk = max( optwrk, lwcon )
+                        optwrk = n + optwrk
+                        if ( wntva ) then
+                           call stdlib_zgelqf(n/2,n,u,ldu,cdummy,cdummy,-1,ierr)
+                           lwrk_zgelqf = int( cdummy(1),KIND=ilp)
+                           call stdlib_zgesvd( 'S','O', n/2,n/2, v, ldv, s, u, ldu,v, ldv, cdummy,&
+                                      -1, rdummy, ierr )
+                           lwrk_zgesvd2 = int( cdummy(1),KIND=ilp)
+                           call stdlib_zunmlq( 'R', 'N', n, n, n/2, u, ldu, cdummy,v, ldv, cdummy,&
+                                     -1,ierr )
+                           lwrk_zunmlq = int( cdummy(1),KIND=ilp)
+                           optwrk2 = max( lwrk_zgeqp3, n/2+lwrk_zgelqf,n/2+lwrk_zgesvd2, n/2+&
+                                     lwrk_zunmlq )
+                            if ( conda ) optwrk2 = max( optwrk2, lwcon )
+                            optwrk2 = n + optwrk2
+                            optwrk = max( optwrk, optwrk2 )
+                        end if
+                     end if
+                  end if
+              end if
+              minwrk = max( 2, minwrk )
+              optwrk = max( 2, optwrk )
+              if ( lcwork < minwrk .and. (.not.lquery) ) info = -19
+           end if
+           if (info == 0 .and. lrwork < rminwrk .and. .not. lquery) then
+              info = -21
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVDQ', -info )
+              return
+           else if ( lquery ) then
+           ! return optimal workspace
+               iwork(1) = iminwrk
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = rminwrk
+               return
+           end if
+           ! quick return if the matrix is void.
+           if( ( m==0 ) .or. ( n==0 ) ) then
+           ! All Output Is Void
+              return
+           end if
+           big = stdlib_dlamch('O')
+           ascaled = .false.
+           if ( rowprm ) then
+                 ! Reordering The Rows In Decreasing Sequence In The
+                 ! ell-infinity norm - this enhances numerical robustness in
+                 ! the case of differently scaled rows.
+                 do p = 1, m
+                     ! rwork(p) = abs( a(p,stdlib_izamax(n,a(p,1),lda)) )
+                     ! [[stdlib_zlange will return nan if an entry of the p-th row is nan]]
+                     rwork(p) = stdlib_zlange( 'M', 1, n, a(p,1), lda, rdummy )
+                     ! .. check for nan's and inf's
+                     if ( ( rwork(p) /= rwork(p) ) .or.( (rwork(p)*zero) /= zero ) ) then
+                         info = -8
+                         call stdlib_xerbla( 'ZGESVDQ', -info )
+                         return
+                     end if
+                 end do
+                 do p = 1, m - 1
+                 q = stdlib_idamax( m-p+1, rwork(p), 1 ) + p - 1
+                 iwork(n+p) = q
+                 if ( p /= q ) then
+                    rtmp     = rwork(p)
+                    rwork(p) = rwork(q)
+                    rwork(q) = rtmp
+                 end if
+                 end do
+                 if ( rwork(1) == zero ) then
+                    ! quick return: a is the m x n zero matrix.
+                    numrank = 0
+                    call stdlib_dlaset( 'G', n, 1, zero, zero, s, n )
+                    if ( wntus ) call stdlib_zlaset('G', m, n, czero, cone, u, ldu)
+                    if ( wntua ) call stdlib_zlaset('G', m, m, czero, cone, u, ldu)
+                    if ( wntva ) call stdlib_zlaset('G', n, n, czero, cone, v, ldv)
+                    if ( wntuf ) then
+                        call stdlib_zlaset( 'G', n, 1, czero, czero, cwork, n )
+                        call stdlib_zlaset( 'G', m, n, czero, cone, u, ldu )
+                    end if
+                    do p = 1, n
+                        iwork(p) = p
+                    end do
+                    if ( rowprm ) then
+                        do p = n + 1, n + m - 1
+                            iwork(p) = p - n
+                        end do
+                    end if
+                    if ( conda ) rwork(1) = -1
+                    rwork(2) = -1
+                    return
+                 end if
+                 if ( rwork(1) > big / sqrt(real(m,KIND=dp)) ) then
+                     ! .. to prevent overflow in the qr factorization, scale the
+                     ! matrix by 1/sqrt(m) if too large entry detected
+                     call stdlib_zlascl('G',0,0,sqrt(real(m,KIND=dp)),one, m,n, a,lda, ierr)
+                               
+                     ascaled = .true.
+                 end if
+                 call stdlib_zlaswp( n, a, lda, 1, m-1, iwork(n+1), 1 )
+           end if
+          ! .. at this stage, preemptive scaling is done only to avoid column
+          ! norms overflows during the qr factorization. the svd procedure should
+          ! have its own scaling to save the singular values from overflows and
+          ! underflows. that depends on the svd procedure.
+           if ( .not.rowprm ) then
+               rtmp = stdlib_zlange( 'M', m, n, a, lda, rwork )
+               if ( ( rtmp /= rtmp ) .or.( (rtmp*zero) /= zero ) ) then
+                    info = -8
+                    call stdlib_xerbla( 'ZGESVDQ', -info )
+                    return
+               end if
+               if ( rtmp > big / sqrt(real(m,KIND=dp)) ) then
+                   ! .. to prevent overflow in the qr factorization, scale the
+                   ! matrix by 1/sqrt(m) if too large entry detected
+                   call stdlib_zlascl('G',0,0, sqrt(real(m,KIND=dp)),one, m,n, a,lda, ierr)
+                             
+                   ascaled = .true.
+               end if
+           end if
+           ! Qr Factorization With Column Pivoting
+           ! a * p = q * [ r ]
+                       ! [ 0 ]
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_zgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lcwork-n,rwork, ierr )
+                     
+          ! if the user requested accuracy level allows truncation in the
+          ! computed upper triangular factor, the matrix r is examined and,
+          ! if possible, replaced with its leading upper trapezoidal part.
+           epsln = stdlib_dlamch('E')
+           sfmin = stdlib_dlamch('S')
+           ! small = sfmin / epsln
+           nr = n
+           if ( accla ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*eps*||a||_f are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*eps*||a||_f.
+              nr = 1
+              rtmp = sqrt(real(n,KIND=dp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) < (rtmp*abs(a(1,1))) ) go to 3002
+                    nr = nr + 1
+              end do
+              3002 continue
+           elseif ( acclm ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r is used as the criterion for being
+              ! close-to-rank-deficient. the threshold is set to epsln=stdlib_dlamch('e').
+              ! [[this can be made more flexible by replacing this hard-coded value
+              ! with a user specified threshold.]] also, the values that underflow
+              ! will be truncated.
+              nr = 1
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < sfmin ) ) go &
+                           to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! Rrqr Not Authorized To Determine Numerical Rank Except In The
+              ! obvious case of zero pivots.
+              ! .. inspect r for exact zeros on the diagonal;
+              ! r(i,i)=0 => r(i:n,i:n)=0.
+              nr = 1
+              do p = 2, n
+                 if ( abs(a(p,p)) == zero ) go to 3502
+                 nr = nr + 1
+              end do
+              3502 continue
+              if ( conda ) then
+                 ! estimate the scaled condition number of a. use the fact that it is
+                 ! the same as the scaled condition number of r.
+                    ! V Is Used As Workspace
+                    call stdlib_zlacpy( 'U', n, n, a, lda, v, ldv )
+                    ! only the leading nr x nr submatrix of the triangular factor
+                    ! is considered. only if nr=n will this give a reliable error
+                    ! bound. however, even for nr < n, this can be used on an
+                    ! expert level and obtain useful information in the sense of
+                    ! perturbation theory.
+                    do p = 1, nr
+                       rtmp = stdlib_dznrm2( p, v(1,p), 1 )
+                       call stdlib_zdscal( p, one/rtmp, v(1,p), 1 )
+                    end do
+                    if ( .not. ( lsvec .or. rsvec ) ) then
+                        call stdlib_zpocon( 'U', nr, v, ldv, one, rtmp,cwork, rwork, ierr )
+                                  
+                    else
+                        call stdlib_zpocon( 'U', nr, v, ldv, one, rtmp,cwork(n+1), rwork, ierr )
+                                  
+                    end if
+                    sconda = one / sqrt(rtmp)
+                 ! for nr=n, sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1),
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+                 ! see the reference [1] for more details.
+              end if
+           endif
+           if ( wntur ) then
+               n1 = nr
+           else if ( wntus .or. wntuf) then
+               n1 = n
+           else if ( wntua ) then
+               n1 = m
+           end if
+           if ( .not. ( rsvec .or. lsvec ) ) then
+      ! .......................................................................
+              ! Only The Singular Values Are Requested
+      ! .......................................................................
+              if ( rtrans ) then
+               ! .. compute the singular values of r**h = [a](1:nr,1:n)**h
+                 ! .. set the lower triangle of [a] to [a](1:nr,1:n)**h and
+                 ! the upper triangle of [a] to zero.
+                 do p = 1, min( n, nr )
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                       a(q,p) = conjg(a(p,q))
+                       if ( q <= nr ) a(p,q) = czero
+                    end do
+                 end do
+                 call stdlib_zgesvd( 'N', 'N', n, nr, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              else
+                 ! .. compute the singular values of r = [a](1:nr,1:n)
+                 if ( nr > 1 )call stdlib_zlaset( 'L', nr-1,nr-1, czero,czero, a(2,1), lda )
+                           
+                 call stdlib_zgesvd( 'N', 'N', nr, n, a, lda, s, u, ldu,v, ldv, cwork, lcwork, &
+                           rwork, info )
+              end if
+           else if ( lsvec .and. ( .not. rsvec) ) then
+      ! .......................................................................
+             ! The Singular Values And The Left Singular Vectors Requested
+      ! .......................................................................""""""""
+              if ( rtrans ) then
+                  ! .. apply stdlib_zgesvd to r**h
+                  ! .. copy r**h into [u] and overwrite [u] with the right singular
+                  ! vectors of r
+                 do p = 1, nr
+                    do q = p, n
+                       u(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_zlaset( 'U', nr-1,nr-1, czero,czero, u(1,2), ldu )
+                           
+                 ! .. the left singular vectors not computed, the nr right singular
+                 ! vectors overwrite [u](1:nr,1:nr) as conjugate transposed. these
+                 ! will be pre-multiplied by q to build the left singular vectors of a.
+                    call stdlib_zgesvd( 'N', 'O', n, nr, u, ldu, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                    end do
+              else
+                  ! Apply Stdlib_Zgesvd To R
+                  ! .. copy r into [u] and overwrite [u] with the left singular vectors
+                  call stdlib_zlacpy( 'U', nr, n, a, lda, u, ldu )
+                  if ( nr > 1 )call stdlib_zlaset( 'L', nr-1, nr-1, czero, czero, u(2,1), ldu )
+                            
+                  ! .. the right singular vectors not computed, the nr left singular
+                  ! vectors overwrite [u](1:nr,1:nr)
+                     call stdlib_zgesvd( 'O', 'N', nr, n, u, ldu, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     ! .. now [u](1:nr,1:nr) contains the nr left singular vectors of
+                     ! r. these will be pre-multiplied by q to build the left singular
+                     ! vectors of a.
+              end if
+                 ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+              if ( ( nr < m ) .and. ( .not.wntuf ) ) then
+                  call stdlib_zlaset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                  if ( nr < n1 ) then
+                     call stdlib_zlaset( 'A',nr,n1-nr,czero,czero,u(1,nr+1), ldu )
+                     call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                  end if
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not.wntuf )call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           else if ( rsvec .and. ( .not. lsvec ) ) then
+      ! .......................................................................
+             ! The Singular Values And The Right Singular Vectors Requested
+      ! .......................................................................
+               if ( rtrans ) then
+                  ! .. apply stdlib_zgesvd to r**h
+                  ! .. copy r**h into v and overwrite v with the left singular vectors
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_zlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite v, the right singular
+                 ! vectors not computed
+                 if ( wntvr .or. ( nr == n ) ) then
+                    call stdlib_zgesvd( 'O', 'N', n, nr, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    do p = 1, nr
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr + 1, n
+                               v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_zlapmt( .false., nr, n, v, ldv, iwork )
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:n,1:nr)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the qr factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                     call stdlib_zlaset('G', n, n-nr, czero, czero, v(1,nr+1), ldv)
+                     call stdlib_zgesvd( 'O', 'N', n, n, v, ldv, s, u, ldu,u, ldu, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     do p = 1, n
+                        v(p,p) = conjg(v(p,p))
+                        do q = p + 1, n
+                           ctmp   = conjg(v(q,p))
+                           v(q,p) = conjg(v(p,q))
+                           v(p,q) = ctmp
+                        end do
+                     end do
+                     call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                 end if
+               else
+                  ! Aply Stdlib_Zgesvd To R
+                  ! Copy R Into V And Overwrite V With The Right Singular Vectors
+                  call stdlib_zlacpy( 'U', nr, n, a, lda, v, ldv )
+                  if ( nr > 1 )call stdlib_zlaset( 'L', nr-1, nr-1, czero, czero, v(2,1), ldv )
+                            
+                  ! .. the right singular vectors overwrite v, the nr left singular
+                  ! vectors stored in u(1:nr,1:nr)
+                  if ( wntvr .or. ( nr == n ) ) then
+                     call stdlib_zgesvd( 'N', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_zlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                  else
+                     ! .. need all n right singular vectors and nr < n
+                     ! [!] this is simple implementation that augments [v](1:nr,1:n)
+                     ! by padding a zero block. in the case nr << n, a more efficient
+                     ! way is to first use the lq factorization. for more details
+                     ! how to implement this, see the " full svd " branch.
+                      call stdlib_zlaset('G', n-nr, n, czero,czero, v(nr+1,1), ldv)
+                      call stdlib_zgesvd( 'N', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                lcwork-n, rwork, info )
+                      call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                  end if
+                  ! .. now [v] contains the adjoint of the matrix of the right singular
+                  ! vectors of a.
+               end if
+           else
+      ! .......................................................................
+             ! Full Svd Requested
+      ! .......................................................................
+              if ( rtrans ) then
+                  ! .. apply stdlib_zgesvd to r**h [[this option is left for r
+                 if ( wntvr .or. ( nr == n ) ) then
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                 do p = 1, nr
+                    do q = p, n
+                       v(q,p) = conjg(a(p,q))
+                    end do
+                 end do
+                 if ( nr > 1 )call stdlib_zlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                           
+                 ! .. the left singular vectors of r**h overwrite [v], the nr right
+                 ! singular vectors of r**h stored in [u](1:nr,1:nr) as conjugate
+                 ! transposed
+                    call stdlib_zgesvd( 'O', 'A', n, nr, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                              lcwork-n, rwork, info )
+                    ! Assemble V
+                    do p = 1, nr
+                       v(p,p) = conjg(v(p,p))
+                       do q = p + 1, nr
+                          ctmp   = conjg(v(q,p))
+                          v(q,p) = conjg(v(p,q))
+                          v(p,q) = ctmp
+                       end do
+                    end do
+                    if ( nr < n ) then
+                        do p = 1, nr
+                           do q = nr+1, n
+                              v(p,q) = conjg(v(q,p))
+                           end do
+                        end do
+                    end if
+                    call stdlib_zlapmt( .false., nr, n, v, ldv, iwork )
+                     do p = 1, nr
+                        u(p,p) = conjg(u(p,p))
+                        do q = p + 1, nr
+                           ctmp   = conjg(u(q,p))
+                           u(q,p) = conjg(u(p,q))
+                           u(p,q) = ctmp
+                        end do
+                     end do
+                     if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_zlaset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_zlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                 else
+                     ! .. need all n right singular vectors and nr < n
+                  ! .. copy r**h into [v] and overwrite [v] with the left singular
+                  ! vectors of r**h
+                     ! [[the optimal ratio n/nr for using qrf instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'zgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                     optratio = 2
+                     if ( optratio*nr > n ) then
+                        do p = 1, nr
+                           do q = p, n
+                              v(q,p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_zlaset('U',nr-1,nr-1, czero,czero, v(1,2),ldv)
+                                  
+                        call stdlib_zlaset('A',n,n-nr,czero,czero,v(1,nr+1),ldv)
+                        call stdlib_zgesvd( 'O', 'A', n, n, v, ldv, s, v, ldv,u, ldu, cwork(n+1), &
+                                  lcwork-n, rwork, info )
+                        do p = 1, n
+                           v(p,p) = conjg(v(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(v(q,p))
+                              v(q,p) = conjg(v(p,q))
+                              v(p,q) = ctmp
+                           end do
+                        end do
+                        call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                    ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x n1), i.e. (m x n) or (m x m).
+                        do p = 1, n
+                           u(p,p) = conjg(u(p,p))
+                           do q = p + 1, n
+                              ctmp   = conjg(u(q,p))
+                              u(q,p) = conjg(u(p,q))
+                              u(p,q) = ctmp
+                           end do
+                        end do
+                        if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_zlaset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_zlaset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_zlaset('A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                        end if
+                     else
+                        ! .. copy r**h into [u] and overwrite [u] with the right
+                        ! singular vectors of r
+                        do p = 1, nr
+                           do q = p, n
+                              u(q,nr+p) = conjg(a(p,q))
+                           end do
+                        end do
+                        if ( nr > 1 )call stdlib_zlaset('U',nr-1,nr-1,czero,czero,u(1,nr+2),ldu)
+                                  
+                        call stdlib_zgeqrf( n, nr, u(1,nr+1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                  lcwork-n-nr, ierr )
+                        do p = 1, nr
+                            do q = 1, n
+                                v(q,p) = conjg(u(p,nr+q))
+                            end do
+                        end do
+                       call stdlib_zlaset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                       call stdlib_zgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v,ldv, cwork(n+nr+&
+                                 1),lcwork-n-nr,rwork, info )
+                       call stdlib_zlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_zlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_zlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_zunmqr('R','C', n, n, nr, u(1,nr+1), ldu,cwork(n+1),v,ldv,&
+                                 cwork(n+nr+1),lcwork-n-nr,ierr)
+                       call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_zlaset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_zlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                          end if
+                       end if
+                     end if
+                 end if
+              else
+                  ! .. apply stdlib_zgesvd to r [[this is the recommended option]]
+                  if ( wntvr .or. ( nr == n ) ) then
+                      ! .. copy r into [v] and overwrite v with the right singular vectors
+                      call stdlib_zlacpy( 'U', nr, n, a, lda, v, ldv )
+                     if ( nr > 1 )call stdlib_zlaset( 'L', nr-1,nr-1, czero,czero, v(2,1), ldv )
+                               
+                     ! .. the right singular vectors of r overwrite [v], the nr left
+                     ! singular vectors of r stored in [u](1:nr,1:nr)
+                     call stdlib_zgesvd( 'S', 'O', nr, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                               lcwork-n, rwork, info )
+                     call stdlib_zlapmt( .false., nr, n, v, ldv, iwork )
+                     ! .. now [v](1:nr,1:n) contains v(1:n,1:nr)**h
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                    if ( ( nr < m ) .and. .not.(wntuf)) then
+                       call stdlib_zlaset('A', m-nr,nr, czero,czero, u(nr+1,1), ldu)
+                       if ( nr < n1 ) then
+                          call stdlib_zlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                       end if
+                    end if
+                  else
+                    ! .. need all n right singular vectors and nr < n
+                    ! The Requested Number Of The Left Singular Vectors
+                     ! is then n1 (n or m)
+                     ! [[the optimal ratio n/nr for using lq instead of padding
+                       ! with zeros. here hard coded to 2; it must be at least
+                       ! two due to work space constraints.]]
+                     ! optratio = stdlib_ilaenv(6, 'zgesvd', 's' // 'o', nr,n,0,0)
+                     ! optratio = max( optratio, 2 )
+                    optratio = 2
+                    if ( optratio * nr > n ) then
+                       call stdlib_zlacpy( 'U', nr, n, a, lda, v, ldv )
+                       if ( nr > 1 )call stdlib_zlaset('L', nr-1,nr-1, czero,czero, v(2,1),ldv)
+                                 
+                    ! .. the right singular vectors of r overwrite [v], the nr left
+                       ! singular vectors of r stored in [u](1:nr,1:nr)
+                       call stdlib_zlaset('A', n-nr,n, czero,czero, v(nr+1,1),ldv)
+                       call stdlib_zgesvd( 'S', 'O', n, n, v, ldv, s, u, ldu,v, ldv, cwork(n+1), &
+                                 lcwork-n, rwork, info )
+                       call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                       ! .. now [v] contains the adjoint of the matrix of the right
+                       ! singular vectors of a. the leading n left singular vectors
+                       ! are in [u](1:n,1:n)
+                       ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                       ! (m x n1), i.e. (m x n) or (m x m).
+                       if ( ( n < m ) .and. .not.(wntuf)) then
+                           call stdlib_zlaset('A',m-n,n,czero,czero,u(n+1,1),ldu)
+                           if ( n < n1 ) then
+                             call stdlib_zlaset('A',n,n1-n,czero,czero,u(1,n+1),ldu)
+                             call stdlib_zlaset( 'A',m-n,n1-n,czero,cone,u(n+1,n+1), ldu )
+                           end if
+                       end if
+                    else
+                       call stdlib_zlacpy( 'U', nr, n, a, lda, u(nr+1,1), ldu )
+                       if ( nr > 1 )call stdlib_zlaset('L',nr-1,nr-1,czero,czero,u(nr+2,1),ldu)
+                                 
+                       call stdlib_zgelqf( nr, n, u(nr+1,1), ldu, cwork(n+1),cwork(n+nr+1), &
+                                 lcwork-n-nr, ierr )
+                       call stdlib_zlacpy('L',nr,nr,u(nr+1,1),ldu,v,ldv)
+                       if ( nr > 1 )call stdlib_zlaset('U',nr-1,nr-1,czero,czero,v(1,2),ldv)
+                                 
+                       call stdlib_zgesvd( 'S', 'O', nr, nr, v, ldv, s, u, ldu,v, ldv, cwork(n+nr+&
+                                 1), lcwork-n-nr, rwork, info )
+                       call stdlib_zlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_zlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_zlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       call stdlib_zunmlq('R','N',n,n,nr,u(nr+1,1),ldu,cwork(n+1),v, ldv, cwork(n+&
+                                 nr+1),lcwork-n-nr,ierr)
+                       call stdlib_zlapmt( .false., n, n, v, ldv, iwork )
+                     ! Assemble The Left Singular Vector Matrix U Of Dimensions
+                    ! (m x nr) or (m x n) or (m x m).
+                       if ( ( nr < m ) .and. .not.(wntuf)) then
+                          call stdlib_zlaset('A',m-nr,nr,czero,czero,u(nr+1,1),ldu)
+                          if ( nr < n1 ) then
+                          call stdlib_zlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                          call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1), ldu )
+                                    
+                          end if
+                       end if
+                    end if
+                  end if
+              ! .. end of the "r**h or r" branch
+              end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! vectors matrix u.
+              if ( .not. wntuf )call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, &
+                        cwork(n+1), lcwork-n, ierr )
+              if ( rowprm .and. .not.wntuf )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(n+1), -&
+                        1 )
+           ! ... end of the "full svd" branch
+           end if
+           ! check whether some singular values are returned as zeros, e.g.
+           ! due to underflow, and update the numerical rank.
+           p = nr
+           do q = p, 1, -1
+               if ( s(q) > zero ) go to 4002
+               nr = nr - 1
+           end do
+           4002 continue
+           ! .. if numerical rank deficiency is detected, the truncated
+           ! singular values are set to zero.
+           if ( nr < n ) call stdlib_dlaset( 'G', n-nr,1, zero,zero, s(nr+1), n )
+           ! .. undo scaling; this may cause overflow in the largest singular
+           ! values.
+           if ( ascaled )call stdlib_dlascl( 'G',0,0, one,sqrt(real(m,KIND=dp)), nr,1, s, n, ierr &
+                     )
+           if ( conda ) rwork(1) = sconda
+           rwork(2) = p - nr
+           ! .. p-nr is the number of singular values that are computed as
+           ! exact zeros in stdlib_zgesvd() applied to the (possibly truncated)
+           ! full row rank triangular (trapezoidal) factor of a.
+           numrank = nr
+           return
+     end subroutine stdlib_zgesvdq
+
+     !> ZGESVX: uses the LU factorization to compute the solution to a complex
+     !> system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+     !> Error bounds on the solution and a condition estimate are also
+     !> provided.
+
+     subroutine stdlib_zgesvx( fact, trans, n, nrhs, a, lda, af, ldaf, ipiv,equed, r, c, b, ldb, &
+               x, ldx, rcond, ferr, berr,work, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(inout) :: equed
+           character, intent(in) :: fact, trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldaf, ldb, ldx, n, nrhs
+           real(dp), intent(out) :: rcond
+           ! Array Arguments 
+           integer(ilp), intent(inout) :: ipiv(*)
+           real(dp), intent(out) :: berr(*), ferr(*), rwork(*)
+           real(dp), intent(inout) :: c(*), r(*)
+           complex(dp), intent(inout) :: a(lda,*), af(ldaf,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*), x(ldx,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: colequ, equil, nofact, notran, rowequ
+           character :: norm
+           integer(ilp) :: i, infequ, j
+           real(dp) :: amax, anorm, bignum, colcnd, rcmax, rcmin, rowcnd, rpvgrw, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,min
+           ! Executable Statements 
+           info = 0
+           nofact = stdlib_lsame( fact, 'N' )
+           equil = stdlib_lsame( fact, 'E' )
+           notran = stdlib_lsame( trans, 'N' )
+           if( nofact .or. equil ) then
+              equed = 'N'
+              rowequ = .false.
+              colequ = .false.
+           else
+              rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+              colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              smlnum = stdlib_dlamch( 'SAFE MINIMUM' )
+              bignum = one / smlnum
+           end if
+           ! test the input parameters.
+           if( .not.nofact .and. .not.equil .and. .not.stdlib_lsame( fact, 'F' ) )then
+              info = -1
+           else if( .not.notran .and. .not.stdlib_lsame( trans, 'T' ) .and. .not.stdlib_lsame( &
+                     trans, 'C' ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldaf<max( 1, n ) ) then
+              info = -8
+           else if( stdlib_lsame( fact, 'F' ) .and. .not.( rowequ .or. colequ .or. stdlib_lsame( &
+                     equed, 'N' ) ) ) then
+              info = -10
+           else
+              if( rowequ ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, r( j ) )
+                    rcmax = max( rcmax, r( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -11
+                 else if( n>0 ) then
+                    rowcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    rowcnd = one
+                 end if
+              end if
+              if( colequ .and. info==0 ) then
+                 rcmin = bignum
+                 rcmax = zero
+                 do j = 1, n
+                    rcmin = min( rcmin, c( j ) )
+                    rcmax = max( rcmax, c( j ) )
+                 end do
+                 if( rcmin<=zero ) then
+                    info = -12
+                 else if( n>0 ) then
+                    colcnd = max( rcmin, smlnum ) / min( rcmax, bignum )
+                 else
+                    colcnd = one
+                 end if
+              end if
+              if( info==0 ) then
+                 if( ldb<max( 1, n ) ) then
+                    info = -14
+                 else if( ldx<max( 1, n ) ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVX', -info )
+              return
+           end if
+           if( equil ) then
+              ! compute row and column scalings to equilibrate the matrix a.
+              call stdlib_zgeequ( n, n, a, lda, r, c, rowcnd, colcnd, amax, infequ )
+              if( infequ==0 ) then
+                 ! equilibrate the matrix.
+                 call stdlib_zlaqge( n, n, a, lda, r, c, rowcnd, colcnd, amax,equed )
+                 rowequ = stdlib_lsame( equed, 'R' ) .or. stdlib_lsame( equed, 'B' )
+                 colequ = stdlib_lsame( equed, 'C' ) .or. stdlib_lsame( equed, 'B' )
+              end if
+           end if
+           ! scale the right hand side.
+           if( notran ) then
+              if( rowequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       b( i, j ) = r( i )*b( i, j )
+                    end do
+                 end do
+              end if
+           else if( colequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    b( i, j ) = c( i )*b( i, j )
+                 end do
+              end do
+           end if
+           if( nofact .or. equil ) then
+              ! compute the lu factorization of a.
+              call stdlib_zlacpy( 'FULL', n, n, a, lda, af, ldaf )
+              call stdlib_zgetrf( n, n, af, ldaf, ipiv, info )
+              ! return if info is non-zero.
+              if( info>0 ) then
+                 ! compute the reciprocal pivot growth factor of the
+                 ! leading rank-deficient info columns of a.
+                 rpvgrw = stdlib_zlantr( 'M', 'U', 'N', info, info, af, ldaf,rwork )
+                 if( rpvgrw==zero ) then
+                    rpvgrw = one
+                 else
+                    rpvgrw = stdlib_zlange( 'M', n, info, a, lda, rwork ) /rpvgrw
+                 end if
+                 rwork( 1 ) = rpvgrw
+                 rcond = zero
+                 return
+              end if
+           end if
+           ! compute the norm of the matrix a and the
+           ! reciprocal pivot growth factor rpvgrw.
+           if( notran ) then
+              norm = '1'
+           else
+              norm = 'I'
+           end if
+           anorm = stdlib_zlange( norm, n, n, a, lda, rwork )
+           rpvgrw = stdlib_zlantr( 'M', 'U', 'N', n, n, af, ldaf, rwork )
+           if( rpvgrw==zero ) then
+              rpvgrw = one
+           else
+              rpvgrw = stdlib_zlange( 'M', n, n, a, lda, rwork ) / rpvgrw
+           end if
+           ! compute the reciprocal of the condition number of a.
+           call stdlib_zgecon( norm, n, af, ldaf, anorm, rcond, work, rwork, info )
+           ! compute the solution matrix x.
+           call stdlib_zlacpy( 'FULL', n, nrhs, b, ldb, x, ldx )
+           call stdlib_zgetrs( trans, n, nrhs, af, ldaf, ipiv, x, ldx, info )
+           ! use iterative refinement to improve the computed solution and
+           ! compute error bounds and backward error estimates for it.
+           call stdlib_zgerfs( trans, n, nrhs, a, lda, af, ldaf, ipiv, b, ldb, x,ldx, ferr, berr, &
+                     work, rwork, info )
+           ! transform the solution matrix x to a solution of the original
+           ! system.
+           if( notran ) then
+              if( colequ ) then
+                 do j = 1, nrhs
+                    do i = 1, n
+                       x( i, j ) = c( i )*x( i, j )
+                    end do
+                 end do
+                 do j = 1, nrhs
+                    ferr( j ) = ferr( j ) / colcnd
+                 end do
+              end if
+           else if( rowequ ) then
+              do j = 1, nrhs
+                 do i = 1, n
+                    x( i, j ) = r( i )*x( i, j )
+                 end do
+              end do
+              do j = 1, nrhs
+                 ferr( j ) = ferr( j ) / rowcnd
+              end do
+           end if
+           ! set info = n+1 if the matrix is singular to working precision.
+           if( rcond<stdlib_dlamch( 'EPSILON' ) )info = n + 1
+           rwork( 1 ) = rpvgrw
+           return
+     end subroutine stdlib_zgesvx
+
+     !> ZGETSLS: solves overdetermined or underdetermined complex linear systems
+     !> involving an M-by-N matrix A, using a tall skinny QR or short wide LQ
+     !> factorization of A.  It is assumed that A has full rank.
+     !> The following options are provided:
+     !> 1. If TRANS = 'N' and m >= n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A*X ||.
+     !> 2. If TRANS = 'N' and m < n:  find the minimum norm solution of
+     !> an underdetermined system A * X = B.
+     !> 3. If TRANS = 'C' and m >= n:  find the minimum norm solution of
+     !> an undetermined system A**T * X = B.
+     !> 4. If TRANS = 'C' and m < n:  find the least squares solution of
+     !> an overdetermined system, i.e., solve the least squares problem
+     !> minimize || B - A**T * X ||.
+     !> Several right hand side vectors b and solution vectors x can be
+     !> handled in a single call; they are stored as the columns of the
+     !> M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+     !> matrix X.
+
+     subroutine stdlib_zgetsls( trans, m, n, nrhs, a, lda, b, ldb,work, lwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: trans
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, m, n, nrhs
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, tran
+           integer(ilp) :: i, iascl, ibscl, j, maxmn, brow, scllen, tszo, tszm, lwo, lwm, lw1, &
+                     lw2, wsizeo, wsizem, info2
+           real(dp) :: anrm, bignum, bnrm, smlnum, dum(1)
+           complex(dp) :: tq(5), workq(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,int
+           ! Executable Statements 
+           ! test the input arguments.
+           info = 0
+           maxmn = max( m, n )
+           tran  = stdlib_lsame( trans, 'C' )
+           lquery = ( lwork==-1 .or. lwork==-2 )
+           if( .not.( stdlib_lsame( trans, 'N' ) .or.stdlib_lsame( trans, 'C' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( nrhs<0 ) then
+              info = -4
+           else if( lda<max( 1, m ) ) then
+              info = -6
+           else if( ldb<max( 1, m, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+           ! determine the optimum and minimum lwork
+            if( m>=n ) then
+              call stdlib_zgeqr( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_zgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_zgeqr( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_zgemqr( 'L', trans, m, nrhs, n, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            else
+              call stdlib_zgelq( m, n, a, lda, tq, -1, workq, -1, info2 )
+              tszo = int( tq( 1 ),KIND=ilp)
+              lwo  = int( workq( 1 ),KIND=ilp)
+              call stdlib_zgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszo, b, ldb, workq, -1, &
+                        info2 )
+              lwo  = max( lwo, int( workq( 1 ),KIND=ilp) )
+              call stdlib_zgelq( m, n, a, lda, tq, -2, workq, -2, info2 )
+              tszm = int( tq( 1 ),KIND=ilp)
+              lwm  = int( workq( 1 ),KIND=ilp)
+              call stdlib_zgemlq( 'L', trans, n, nrhs, m, a, lda, tq,tszm, b, ldb, workq, -1, &
+                        info2 )
+              lwm  = max( lwm, int( workq( 1 ),KIND=ilp) )
+              wsizeo = tszo + lwo
+              wsizem = tszm + lwm
+            end if
+            if( ( lwork<wsizem ).and.( .not.lquery ) ) then
+               info = -10
+            end if
+            work( 1 ) = real( wsizeo,KIND=dp)
+           end if
+           if( info/=0 ) then
+             call stdlib_xerbla( 'ZGETSLS', -info )
+             return
+           end if
+           if( lquery ) then
+             if( lwork==-2 ) work( 1 ) = real( wsizem,KIND=dp)
+             return
+           end if
+           if( lwork<wsizeo ) then
+             lw1 = tszm
+             lw2 = lwm
+           else
+             lw1 = tszo
+             lw2 = lwo
+           end if
+           ! quick return if possible
+           if( min( m, n, nrhs )==0 ) then
+                call stdlib_zlaset( 'FULL', max( m, n ), nrhs, czero, czero,b, ldb )
+                return
+           end if
+           ! get machine parameters
+            smlnum = stdlib_dlamch( 'S' ) / stdlib_dlamch( 'P' )
+            bignum = one / smlnum
+            call stdlib_dlabad( smlnum, bignum )
+           ! scale a, b if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', m, n, a, lda, dum )
+           iascl = 0
+           if( anrm>zero .and. anrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, m, n, a, lda, info )
+              iascl = 1
+           else if( anrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, anrm, bignum, m, n, a, lda, info )
+              iascl = 2
+           else if( anrm==zero ) then
+              ! matrix all zero. return zero solution.
+              call stdlib_zlaset( 'F', maxmn, nrhs, czero, czero, b, ldb )
+              go to 50
+           end if
+           brow = m
+           if ( tran ) then
+             brow = n
+           end if
+           bnrm = stdlib_zlange( 'M', brow, nrhs, b, ldb, dum )
+           ibscl = 0
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              ! scale matrix norm up to smlnum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, smlnum, brow, nrhs, b, ldb,info )
+              ibscl = 1
+           else if( bnrm>bignum ) then
+              ! scale matrix norm down to bignum
+              call stdlib_zlascl( 'G', 0, 0, bnrm, bignum, brow, nrhs, b, ldb,info )
+              ibscl = 2
+           end if
+           if ( m>=n ) then
+              ! compute qr factorization of a
+             call stdlib_zgeqr( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+             if ( .not.tran ) then
+                 ! least-squares problem min || a * x - b ||
+                 ! b(1:m,1:nrhs) := q**t * b(1:m,1:nrhs)
+               call stdlib_zgemqr( 'L' , 'C', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, work(&
+                          1 ), lw2,info )
+                 ! b(1:n,1:nrhs) := inv(r) * b(1:n,1:nrhs)
+               call stdlib_ztrtrs( 'U', 'N', 'N', n, nrhs,a, lda, b, ldb, info )
+               if( info>0 ) then
+                 return
+               end if
+               scllen = n
+             else
+                 ! overdetermined system of equations a**t * x = b
+                 ! b(1:n,1:nrhs) := inv(r**t) * b(1:n,1:nrhs)
+                 call stdlib_ztrtrs( 'U', 'C', 'N', n, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(n+1:m,1:nrhs) = czero
+                 do j = 1, nrhs
+                    do i = n + 1, m
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:m,1:nrhs) := q(1:n,:) * b(1:n,1:nrhs)
+                 call stdlib_zgemqr( 'L', 'N', m, nrhs, n, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 scllen = m
+              end if
+           else
+              ! compute lq factorization of a
+              call stdlib_zgelq( m, n, a, lda, work( lw2+1 ), lw1,work( 1 ), lw2, info )
+              ! workspace at least m, optimally m*nb.
+              if( .not.tran ) then
+                 ! underdetermined system of equations a * x = b
+                 ! b(1:m,1:nrhs) := inv(l) * b(1:m,1:nrhs)
+                 call stdlib_ztrtrs( 'L', 'N', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 ! b(m+1:n,1:nrhs) = 0
+                 do j = 1, nrhs
+                    do i = m + 1, n
+                       b( i, j ) = czero
+                    end do
+                 end do
+                 ! b(1:n,1:nrhs) := q(1:n,:)**t * b(1:m,1:nrhs)
+                 call stdlib_zgemlq( 'L', 'C', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 scllen = n
+              else
+                 ! overdetermined system min || a**t * x - b ||
+                 ! b(1:n,1:nrhs) := q * b(1:n,1:nrhs)
+                 call stdlib_zgemlq( 'L', 'N', n, nrhs, m, a, lda,work( lw2+1 ), lw1, b, ldb, &
+                           work( 1 ), lw2,info )
+                 ! workspace at least nrhs, optimally nrhs*nb
+                 ! b(1:m,1:nrhs) := inv(l**t) * b(1:m,1:nrhs)
+                 call stdlib_ztrtrs( 'L', 'C', 'N', m, nrhs,a, lda, b, ldb, info )
+                 if( info>0 ) then
+                    return
+                 end if
+                 scllen = m
+              end if
+           end if
+           ! undo scaling
+           if( iascl==1 ) then
+             call stdlib_zlascl( 'G', 0, 0, anrm, smlnum, scllen, nrhs, b, ldb,info )
+           else if( iascl==2 ) then
+             call stdlib_zlascl( 'G', 0, 0, anrm, bignum, scllen, nrhs, b, ldb,info )
+           end if
+           if( ibscl==1 ) then
+             call stdlib_zlascl( 'G', 0, 0, smlnum, bnrm, scllen, nrhs, b, ldb,info )
+           else if( ibscl==2 ) then
+             call stdlib_zlascl( 'G', 0, 0, bignum, bnrm, scllen, nrhs, b, ldb,info )
+           end if
+           50 continue
+           work( 1 ) = real( tszo + lwo,KIND=dp)
+           return
+     end subroutine stdlib_zgetsls
+
+     !> ZGETSQRHRT: computes a NB2-sized column blocked QR-factorization
+     !> of a complex M-by-N matrix A with M >= N,
+     !> A = Q * R.
+     !> The routine uses internally a NB1-sized column blocked and MB1-sized
+     !> row blocked TSQR-factorization and perfors the reconstruction
+     !> of the Householder vectors from the TSQR output. The routine also
+     !> converts the R_tsqr factor from the TSQR-factorization output into
+     !> the R factor that corresponds to the Householder QR-factorization,
+     !> A = Q_tsqr * R_tsqr = Q * R.
+     !> The output Q and R factors are stored in the same format as in ZGEQRT
+     !> (Q is in blocked compact WY-representation). See the documentation
+     !> of ZGEQRT for more details on the format.
+
+     pure subroutine stdlib_zgetsqrhrt( m, n, mb1, nb1, nb2, a, lda, t, ldt, work,lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldt, lwork, m, n, nb1, nb2, mb1
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: t(ldt,*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: i, iinfo, j, lw1, lw2, lwt, ldwt, lworkopt, nb1local, nb2local, &
+                     num_all_row_blocks
+           ! Intrinsic Functions 
+           intrinsic :: ceiling,real,cmplx,max,min
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery  = lwork==-1
+           if( m<0 ) then
+              info = -1
+           else if( n<0 .or. m<n ) then
+              info = -2
+           else if( mb1<=n ) then
+              info = -3
+           else if( nb1<1 ) then
+              info = -4
+           else if( nb2<1 ) then
+              info = -5
+           else if( lda<max( 1, m ) ) then
+              info = -7
+           else if( ldt<max( 1,  min( nb2, n ) ) ) then
+              info = -9
+           else
+              ! test the input lwork for the dimension of the array work.
+              ! this workspace is used to store array:
+              ! a) matrix t and work for stdlib_zlatsqr;
+              ! b) n-by-n upper-triangular factor r_tsqr;
+              ! c) matrix t and array work for stdlib_zungtsqr_row;
+              ! d) diagonal d for stdlib_zunhr_col.
+              if( lwork<n*n+1 .and. .not.lquery ) then
+                 info = -11
+              else
+                 ! set block size for column blocks
+                 nb1local = min( nb1, n )
+                 num_all_row_blocks = max( 1,ceiling( real( m - n,KIND=dp) / real( mb1 - n,&
+                           KIND=dp) ) )
+                 ! length and leading dimension of work array to place
+                 ! t array in tsqr.
+                 lwt = num_all_row_blocks * n * nb1local
+                 ldwt = nb1local
+                 ! length of tsqr work array
+                 lw1 = nb1local * n
+                 ! length of stdlib_zungtsqr_row work array.
+                 lw2 = nb1local * max( nb1local, ( n - nb1local ) )
+                 lworkopt = max( lwt + lw1, max( lwt+n*n+lw2, lwt+n*n+n ) )
+                 if( ( lwork<max( 1, lworkopt ) ).and.(.not.lquery) ) then
+                    info = -11
+                 end if
+              end if
+           end if
+           ! handle error in the input parameters and return workspace query.
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGETSQRHRT', -info )
+              return
+           else if ( lquery ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           ! quick return if possible
+           if( min( m, n )==0 ) then
+              work( 1 ) = cmplx( lworkopt,KIND=dp)
+              return
+           end if
+           nb2local = min( nb2, n )
+           ! (1) perform tsqr-factorization of the m-by-n matrix a.
+           call stdlib_zlatsqr( m, n, mb1, nb1local, a, lda, work, ldwt,work(lwt+1), lw1, iinfo )
+                     
+           ! (2) copy the factor r_tsqr stored in the upper-triangular part
+               ! of a into the square matrix in the work array
+               ! work(lwt+1:lwt+n*n) column-by-column.
+           do j = 1, n
+              call stdlib_zcopy( j, a( 1, j ), 1, work( lwt + n*(j-1)+1 ), 1 )
+           end do
+           ! (3) generate a m-by-n matrix q with orthonormal columns from
+           ! the result stored below the diagonal in the array a in place.
+           call stdlib_zungtsqr_row( m, n, mb1, nb1local, a, lda, work, ldwt,work( lwt+n*n+1 ), &
+                     lw2, iinfo )
+           ! (4) perform the reconstruction of householder vectors from
+           ! the matrix q (stored in a) in place.
+           call stdlib_zunhr_col( m, n, nb2local, a, lda, t, ldt,work( lwt+n*n+1 ), iinfo )
+                     
+           ! (5) copy the factor r_tsqr stored in the square matrix in the
+           ! work array work(lwt+1:lwt+n*n) into the upper-triangular
+           ! part of a.
+           ! (6) compute from r_tsqr the factor r_hr corresponding to
+           ! the reconstructed householder vectors, i.e. r_hr = s * r_tsqr.
+           ! this multiplication by the sign matrix s on the left means
+           ! changing the sign of i-th row of the matrix r_tsqr according
+           ! to sign of the i-th diagonal element diag(i) of the matrix s.
+           ! diag is stored in work( lwt+n*n+1 ) from the stdlib_zunhr_col output.
+           ! (5) and (6) can be combined in a single loop, so the rows in a
+           ! are accessed only once.
+           do i = 1, n
+              if( work( lwt+n*n+i )==-cone ) then
+                 do j = i, n
+                    a( i, j ) = -cone * work( lwt+n*(j-1)+i )
+                 end do
+              else
+                 call stdlib_zcopy( n-i+1, work(lwt+n*(i-1)+i), n, a( i, i ), lda )
+              end if
+           end do
+           work( 1 ) = cmplx( lworkopt,KIND=dp)
+           return
+     end subroutine stdlib_zgetsqrhrt
+
+     !> ZGGES: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> ZGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_zgges( jobvsl, jobvsr, sort, selctg, n, a, lda, b, ldb,sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr, work,lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_z) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkmin, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, -1 ) )
+              if( ilvsl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_zggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_zgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           call stdlib_zhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           ! (workspace: none needed)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_ztgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_zggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_zggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_zlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_zlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zgges
+
+     !> ZGGESX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the complex Schur form (S,T),
+     !> and, optionally, the left and/or right matrices of Schur vectors (VSL
+     !> and VSR).  This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL) S (VSR)**H, (VSL) T (VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T; computes
+     !> a reciprocal condition number for the average of the selected
+     !> eigenvalues (RCONDE); and computes a reciprocal condition number for
+     !> the right and left deflating subspaces corresponding to the selected
+     !> eigenvalues (RCONDV). The leading columns of VSL and VSR then form
+     !> an orthonormal basis for the corresponding left and right eigenspaces
+     !> (deflating subspaces).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0 or for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if T is
+     !> upper triangular with non-negative diagonal and S is upper
+     !> triangular.
+
+     subroutine stdlib_zggesx( jobvsl, jobvsr, sort, selctg, sense, n, a, lda,b, ldb, sdim, alpha,&
+      beta, vsl, ldvsl, vsr,ldvsr, rconde, rcondv, work, lwork, rwork,iwork, liwork, bwork, info )
+                
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, liwork, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rconde(2), rcondv(2), rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_z) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantsb, wantse, &
+                     wantsn, wantst, wantsv
+           integer(ilp) :: i, icols, ierr, ihi, ijob, ijobvl, ijobvr, ileft, ilo, iright, irows, &
+                     irwrk, itau, iwrk, liwmin, lwrk, maxwrk, minwrk
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pl, pr, smlnum
+           ! Local Arrays 
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 .or. liwork==-1 )
+           if( wantsn ) then
+              ijob = 0
+           else if( wantse ) then
+              ijob = 1
+           else if( wantsv ) then
+              ijob = 2
+           else if( wantsb ) then
+              ijob = 4
+           end if
+           ! test the input arguments
+           info = 0
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -5
+           else if( n<0 ) then
+              info = -6
+           else if( lda<max( 1, n ) ) then
+              info = -8
+           else if( ldb<max( 1, n ) ) then
+              info = -10
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -15
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -17
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv.)
+           if( info==0 ) then
+              if( n>0) then
+                 minwrk = 2*n
+                 maxwrk = n*(1 + stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+                 maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, -1 ) ) )
+                           
+                 if( ilvsl ) then
+                    maxwrk = max( maxwrk, n*( 1 +stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) ) &
+                              )
+                 end if
+                 lwrk = maxwrk
+                 if( ijob>=1 )lwrk = max( lwrk, n*n/2 )
+              else
+                 minwrk = 1
+                 maxwrk = 1
+                 lwrk   = 1
+              end if
+              work( 1 ) = lwrk
+              if( wantsn .or. n==0 ) then
+                 liwmin = 1
+              else
+                 liwmin = n + 2
+              end if
+              iwork( 1 ) = liwmin
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -21
+              else if( liwork<liwmin  .and. .not.lquery) then
+                 info = -24
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGESX', -info )
+              return
+           else if (lquery) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_zggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvsl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           call stdlib_zgghrd( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           ! (complex workspace: need n)
+           ! (real workspace:    need n)
+           iwrk = itau
+           call stdlib_zhgeqz( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 40
+           end if
+           ! sort eigenvalues alpha/beta and compute the reciprocal of
+           ! condition number(s)
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selctging
+              if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              ! reorder eigenvalues, transform generalized schur vectors, and
+              ! compute reciprocal condition numbers
+              ! (complex workspace: if ijob >= 1, need max(1, 2*sdim*(n-sdim))
+                                  ! otherwise, need 1 )
+              call stdlib_ztgsen( ijob, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pl, pr,dif, work( iwrk ), lwork-iwrk+1, iwork, liwork,ierr &
+                        )
+              if( ijob>=1 )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( ierr==-21 ) then
+                  ! not enough complex workspace
+                 info = -21
+              else
+                 if( ijob==1 .or. ijob==4 ) then
+                    rconde( 1 ) = pl
+                    rconde( 2 ) = pr
+                 end if
+                 if( ijob==2 .or. ijob==4 ) then
+                    rcondv( 1 ) = dif( 1 )
+                    rcondv( 2 ) = dif( 2 )
+                 end if
+                 if( ierr==1 )info = n + 3
+              end if
+           end if
+           ! apply permutation to vsl and vsr
+           ! (workspace: none needed)
+           if( ilvsl )call stdlib_zggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_zggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_zlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_zlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           40 continue
+           work( 1 ) = maxwrk
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zggesx
+
+     !> ZGGEV: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_zggev( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkmin, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(dp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=dp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              lwkmin = max( 1, 2*n )
+              lwkopt = max( 1, n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+              lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, 0 ) )
+              if( ilvl ) then
+                 lwkopt = max( lwkopt, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, -1 ) )
+                           
+              end if
+              work( 1 ) = lwkopt
+              if( lwork<lwkmin .and. .not.lquery )info = -15
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'E' )*stdlib_dlamch( 'B' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ! (real workspace: need 6*n)
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_zggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           ! (complex workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_zgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_zgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_zhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           ! (real workspace: need 2*n)
+           ! (complex workspace: need 2*n)
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_ztgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              ! (workspace: none needed)
+              if( ilvl ) then
+                 call stdlib_zggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_zggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zggev
+
+     !> ZGGEVX: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B) the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> Optionally, it also computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> LSCALE, RSCALE, ABNRM, and BBNRM), reciprocal condition numbers for
+     !> the eigenvalues (RCONDE), and reciprocal condition numbers for the
+     !> right eigenvectors (RCONDV).
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right eigenvector v(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j) .
+     !> The left eigenvector u(j) corresponding to the eigenvalue lambda(j)
+     !> of (A,B) satisfies
+     !> u(j)**H * A  = lambda(j) * u(j)**H * B.
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_zggevx( balanc, jobvl, jobvr, sense, n, a, lda, b, ldb,alpha, beta, vl, &
+     ldvl, vr, ldvr, ilo, ihi,lscale, rscale, abnrm, bbnrm, rconde, rcondv,work, lwork, rwork, &
+               iwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           real(dp), intent(out) :: abnrm, bbnrm
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: lscale(*), rconde(*), rcondv(*), rscale(*), rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery, noscl, wantsb, wantse, wantsn, &
+                     wantsv
+           character :: chtemp
+           integer(ilp) :: i, icols, ierr, ijobvl, ijobvr, in, irows, itau, iwrk, iwrk1, j, jc, &
+                     jr, m, maxwrk, minwrk
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(dp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=dp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           noscl  = stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'P' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.( noscl .or. stdlib_lsame( balanc,'S' ) .or.stdlib_lsame( balanc, 'B' ) ) ) &
+                     then
+              info = -1
+           else if( ijobvl<=0 ) then
+              info = -2
+           else if( ijobvr<=0 ) then
+              info = -3
+           else if( .not.( wantsn .or. wantse .or. wantsb .or. wantsv ) )then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -13
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -15
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! nb refers to the optimal block size for the immediately
+             ! following subroutine, as returned by stdlib_ilaenv. the workspace is
+             ! computed assuming ilo = 1 and ihi = n, the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 minwrk = 2*n
+                 if( wantse ) then
+                    minwrk = 4*n
+                 else if( wantsv .or. wantsb ) then
+                    minwrk = 2*n*( n + 1)
+                 end if
+                 maxwrk = minwrk
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'ZGEQRF', ' ', n, 1, n, 0 ) )
+                           
+                 maxwrk = max( maxwrk,n + n*stdlib_ilaenv( 1, 'ZUNMQR', ' ', n, 1, n, 0 ) )
+                           
+                 if( ilvl ) then
+                    maxwrk = max( maxwrk, n +n*stdlib_ilaenv( 1, 'ZUNGQR', ' ', n, 1, n, 0 ) )
+                              
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -25
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute and/or balance the matrix pair (a,b)
+           ! (real workspace: need 6*n if balanc = 's' or 'b', 1 otherwise)
+           call stdlib_zggbal( balanc, n, a, lda, b, ldb, ilo, ihi, lscale, rscale,rwork, ierr )
+                     
+           ! compute abnrm and bbnrm
+           abnrm = stdlib_zlange( '1', n, n, a, lda, rwork( 1 ) )
+           if( ilascl ) then
+              rwork( 1 ) = abnrm
+              call stdlib_dlascl( 'G', 0, 0, anrmto, anrm, 1, 1, rwork( 1 ), 1,ierr )
+              abnrm = rwork( 1 )
+           end if
+           bbnrm = stdlib_zlange( '1', n, n, b, ldb, rwork( 1 ) )
+           if( ilbscl ) then
+              rwork( 1 ) = bbnrm
+              call stdlib_dlascl( 'G', 0, 0, bnrmto, bnrm, 1, 1, rwork( 1 ), 1,ierr )
+              bbnrm = rwork( 1 )
+           end if
+           ! reduce b to triangular form (qr decomposition of b)
+           ! (complex workspace: need n, prefer n*nb )
+           irows = ihi + 1 - ilo
+           if( ilv .or. .not.wantsn ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the unitary transformation to a
+           ! (complex workspace: need n, prefer n*nb)
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl and/or vr
+           ! (workspace: need n, prefer n*nb)
+           if( ilvl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           if( ilvr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           ! (workspace: none needed)
+           if( ilv .or. .not.wantsn ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_zgghrd( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        ierr )
+           else
+              call stdlib_zgghrd( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur forms and schur vectors)
+           ! (complex workspace: need n)
+           ! (real workspace: need n)
+           iwrk = itau
+           if( ilv .or. .not.wantsn ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_zhgeqz( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 90
+           end if
+           ! compute eigenvectors and estimate condition numbers if desired
+           ! stdlib_ztgevc: (complex workspace: need 2*n )
+                   ! (real workspace:    need 2*n )
+           ! stdlib_ztgsna: (complex workspace: need 2*n*n if sense='v' or 'b')
+                   ! (integer workspace: need n+2 )
+           if( ilv .or. .not.wantsn ) then
+              if( ilv ) then
+                 if( ilvl ) then
+                    if( ilvr ) then
+                       chtemp = 'B'
+                    else
+                       chtemp = 'L'
+                    end if
+                 else
+                    chtemp = 'R'
+                 end if
+                 call stdlib_ztgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, n,&
+                            in, work( iwrk ), rwork,ierr )
+                 if( ierr/=0 ) then
+                    info = n + 2
+                    go to 90
+                 end if
+              end if
+              if( .not.wantsn ) then
+                 ! compute eigenvectors (stdlib_ztgevc) and estimate condition
+                 ! numbers (stdlib_ztgsna). note that the definition of the condition
+                 ! number is not invariant under transformation (u,v) to
+                 ! (q*u, z*v), where (u,v) are eigenvectors of the generalized
+                 ! schur form (s,t), q and z are orthogonal matrices. in order
+                 ! to avoid using extra 2*n*n workspace, we have to
+                 ! re-calculate eigenvectors and estimate the condition numbers
+                 ! one at a time.
+                 do i = 1, n
+                    do j = 1, n
+                       bwork( j ) = .false.
+                    end do
+                    bwork( i ) = .true.
+                    iwrk = n + 1
+                    iwrk1 = iwrk + n
+                    if( wantse .or. wantsb ) then
+                       call stdlib_ztgevc( 'B', 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                                 iwrk ), n, 1, m,work( iwrk1 ), rwork, ierr )
+                       if( ierr/=0 ) then
+                          info = n + 2
+                          go to 90
+                       end if
+                    end if
+                    call stdlib_ztgsna( sense, 'S', bwork, n, a, lda, b, ldb,work( 1 ), n, work( &
+                    iwrk ), n, rconde( i ),rcondv( i ), 1, m, work( iwrk1 ),lwork-iwrk1+1, iwork, &
+                              ierr )
+                 end do
+              end if
+           end if
+           ! undo balancing on vl and vr and normalization
+           ! (workspace: none needed)
+           if( ilvl ) then
+              call stdlib_zggbak( balanc, 'L', n, ilo, ihi, lscale, rscale, n, vl,ldvl, ierr )
+                        
+              loop_50: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vl( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_50
+                 temp = one / temp
+                 do jr = 1, n
+                    vl( jr, jc ) = vl( jr, jc )*temp
+                 end do
+              end do loop_50
+           end if
+           if( ilvr ) then
+              call stdlib_zggbak( balanc, 'R', n, ilo, ihi, lscale, rscale, n, vr,ldvr, ierr )
+                        
+              loop_80: do jc = 1, n
+                 temp = zero
+                 do jr = 1, n
+                    temp = max( temp, abs1( vr( jr, jc ) ) )
+                 end do
+                 if( temp<smlnum )cycle loop_80
+                 temp = one / temp
+                 do jr = 1, n
+                    vr( jr, jc ) = vr( jr, jc )*temp
+                 end do
+              end do loop_80
+           end if
+           ! undo scaling if necessary
+           90 continue
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zggevx
+
+     !> ZHBEV: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.
+
+     subroutine stdlib_zhbev( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lower, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, iscale
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              if( lower ) then
+                 w( 1 ) = real( ab( 1, 1 ),KIND=dp)
+              else
+                 w( 1 ) = real( ab( kd+1, 1 ),KIND=dp)
+              end if
+              if( wantz )z( 1, 1 ) = one
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_zlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_zlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_zhbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           call stdlib_zhbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_zsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              indrwk = inde + n
+              call stdlib_zsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indrwk ), info )
+                        
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           return
+     end subroutine stdlib_zhbev
+
+     !> ZHBEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian band matrix A.  If eigenvectors are desired, it
+     !> uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zhbevd( jobz, uplo, n, kd, ab, ldab, w, z, ldz, work,lwork, rwork, lrwork, &
+               iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: kd, ldab, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indwk2, indwrk, iscale, liwmin, llrwk, llwk2, &
+                     lrwmin, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. liwork==-1 .or. lrwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else
+              if( wantz ) then
+                 lwmin = 2*n**2
+                 lrwmin = 1 + 5*n + 2*n**2
+                 liwmin = 3 + 5*n
+              else
+                 lwmin = n
+                 lrwmin = n
+                 liwmin = 1
+              end if
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( kd<0 ) then
+              info = -4
+           else if( ldab<kd+1 ) then
+              info = -6
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ab( 1, 1 ),KIND=dp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_zlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_zlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+           end if
+           ! call stdlib_zhbtrd to reduce hermitian band matrix to tridiagonal form.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indwrk + 1
+           call stdlib_zhbtrd( jobz, uplo, n, kd, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_zstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_zgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_zlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zhbevd
+
+     !> ZHBEVX: computes selected eigenvalues and, optionally, eigenvectors
+     !> of a complex Hermitian band matrix A.  Eigenvalues and eigenvectors
+     !> can be selected by specifying either a range of values or a range of
+     !> indices for the desired eigenvalues.
+
+     subroutine stdlib_zhbevx( jobz, range, uplo, n, kd, ab, ldab, q, ldq, vl,vu, il, iu, abstol, &
+               m, w, z, ldz, work, rwork,iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, kd, ldab, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*)
+           complex(dp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, lower, test, valeig, wantz
+           character :: order
+           integer(ilp) :: i, iinfo, imax, indd, inde, indee, indibl, indisp, indiwk, indrwk, &
+                     indwrk, iscale, itmp1, j, jj, nsplit
+           real(dp) :: abstll, anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum, tmp1, vll, &
+                     vuu
+           complex(dp) :: ctmp1
+           ! Intrinsic Functions 
+           intrinsic :: real,max,min,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           lower = stdlib_lsame( uplo, 'L' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( kd<0 ) then
+              info = -5
+           else if( ldab<kd+1 ) then
+              info = -7
+           else if( wantz .and. ldq<max( 1, n ) ) then
+              info = -9
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -11
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -12
+                 else if( iu<min( n, il ) .or. iu>n ) then
+                    info = -13
+                 end if
+              end if
+           end if
+           if( info==0 ) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) )info = -18
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBEVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           if( n==1 ) then
+              m = 1
+              if( lower ) then
+                 ctmp1 = ab( 1, 1 )
+              else
+                 ctmp1 = ab( kd+1, 1 )
+              end if
+              tmp1 = real( ctmp1,KIND=dp)
+              if( valeig ) then
+                 if( .not.( vl<tmp1 .and. vu>=tmp1 ) )m = 0
+              end if
+              if( m==1 ) then
+                 w( 1 ) = real( ctmp1,KIND=dp)
+                 if( wantz )z( 1, 1 ) = cone
+              end if
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = min( sqrt( bignum ), one / sqrt( sqrt( safmin ) ) )
+           ! scale matrix to allowable range, if necessary.
+           iscale = 0
+           abstll = abstol
+           if( valeig ) then
+              vll = vl
+              vuu = vu
+           else
+              vll = zero
+              vuu = zero
+           end if
+           anrm = stdlib_zlanhb( 'M', uplo, n, kd, ab, ldab, rwork )
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              if( lower ) then
+                 call stdlib_zlascl( 'B', kd, kd, one, sigma, n, n, ab, ldab, info )
+              else
+                 call stdlib_zlascl( 'Q', kd, kd, one, sigma, n, n, ab, ldab, info )
+              end if
+              if( abstol>0 )abstll = abstol*sigma
+              if( valeig ) then
+                 vll = vl*sigma
+                 vuu = vu*sigma
+              end if
+           end if
+           ! call stdlib_zhbtrd to reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           call stdlib_zhbtrd( jobz, uplo, n, kd, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_zsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if (indeig) then
+              if (il==1 .and. iu==n) then
+                 test = .true.
+              end if
+           end if
+           if ((alleig .or. test) .and. (abstol<=zero)) then
+              call stdlib_dcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              if( .not.wantz ) then
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_dsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_zlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+                 call stdlib_zsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired, stdlib_zstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_dstebz( range, order, n, vll, vuu, il, iu, abstll,rwork( indd ), rwork( &
+           inde ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ),&
+                      info )
+           if( wantz ) then
+              call stdlib_zstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_zstein.
+              do j = 1, m
+                 call stdlib_zcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_zgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           30 continue
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = m
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zhbevx
+
+     !> ZHBGV: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.
+
+     pure subroutine stdlib_zhbgv( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w, z,ldz, work, &
+               rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwrk
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGV ', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_zpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           call stdlib_zhbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork( &
+                     indwrk ), iinfo )
+           ! reduce to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_zhbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_zsteqr.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zsteqr( jobz, n, w, rwork( inde ), z, ldz,rwork( indwrk ), info )
+                        
+           end if
+           return
+     end subroutine stdlib_zhbgv
+
+     !> ZHBGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     pure subroutine stdlib_zhbgvd( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, w,z, ldz, work, &
+               lwork, rwork, lrwork, iwork,liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ka, kb, ldab, ldbb, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: vect
+           integer(ilp) :: iinfo, inde, indwk2, indwrk, liwmin, llrwk, llwk2, lrwmin, &
+                     lwmin
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1+n
+              lrwmin = 1+n
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n**2
+              lrwmin = 1 + 5*n + 2*n**2
+              liwmin = 3 + 5*n
+           else
+              lwmin = n
+              lrwmin = n
+              liwmin = 1
+           end if
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ka<0 ) then
+              info = -4
+           else if( kb<0 .or. kb>ka ) then
+              info = -5
+           else if( ldab<ka+1 ) then
+              info = -7
+           else if( ldbb<kb+1 ) then
+              info = -9
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -12
+           end if
+           if( info==0 ) then
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -14
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -16
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -18
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_zpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           inde = 1
+           indwrk = inde + n
+           indwk2 = 1 + n*n
+           llwk2 = lwork - indwk2 + 2
+           llrwk = lrwork - indwrk + 2
+           call stdlib_zhbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, z, ldz,work, rwork, &
+                     iinfo )
+           ! reduce hermitian band matrix to tridiagonal form.
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_zhbtrd( vect, uplo, n, ka, ab, ldab, w, rwork( inde ), z,ldz, work, iinfo )
+                     
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, call stdlib_zstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zstedc( 'I', n, w, rwork( inde ), work, n, work( indwk2 ),llwk2, rwork( &
+                        indwrk ), llrwk, iwork, liwork,info )
+              call stdlib_zgemm( 'N', 'N', n, n, n, cone, z, ldz, work, n, czero,work( indwk2 ), &
+                        n )
+              call stdlib_zlacpy( 'A', n, n, work( indwk2 ), n, z, ldz )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zhbgvd
+
+     !> ZHBGVX: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite banded eigenproblem, of
+     !> the form A*x=(lambda)*B*x. Here A and B are assumed to be Hermitian
+     !> and banded, and B is also positive definite.  Eigenvalues and
+     !> eigenvectors can be selected by specifying either all eigenvalues,
+     !> a range of values or a range of indices for the desired eigenvalues.
+
+     pure subroutine stdlib_zhbgvx( jobz, range, uplo, n, ka, kb, ab, ldab, bb,ldbb, q, ldq, vl, &
+               vu, il, iu, abstol, m, w, z,ldz, work, rwork, iwork, ifail, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, range, uplo
+           integer(ilp), intent(in) :: il, iu, ka, kb, ldab, ldbb, ldq, ldz, n
+           integer(ilp), intent(out) :: info, m
+           real(dp), intent(in) :: abstol, vl, vu
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ifail(*), iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ab(ldab,*), bb(ldbb,*)
+           complex(dp), intent(out) :: q(ldq,*), work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: alleig, indeig, test, upper, valeig, wantz
+           character :: order, vect
+           integer(ilp) :: i, iinfo, indd, inde, indee, indibl, indisp, indiwk, indrwk, indwrk, &
+                     itmp1, j, jj, nsplit
+           real(dp) :: tmp1
+           ! Intrinsic Functions 
+           intrinsic :: min
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           alleig = stdlib_lsame( range, 'A' )
+           valeig = stdlib_lsame( range, 'V' )
+           indeig = stdlib_lsame( range, 'I' )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( alleig .or. valeig .or. indeig ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ka<0 ) then
+              info = -5
+           else if( kb<0 .or. kb>ka ) then
+              info = -6
+           else if( ldab<ka+1 ) then
+              info = -8
+           else if( ldbb<kb+1 ) then
+              info = -10
+           else if( ldq<1 .or. ( wantz .and. ldq<n ) ) then
+              info = -12
+           else
+              if( valeig ) then
+                 if( n>0 .and. vu<=vl )info = -14
+              else if( indeig ) then
+                 if( il<1 .or. il>max( 1, n ) ) then
+                    info = -15
+                 else if ( iu<min( n, il ) .or. iu>n ) then
+                    info = -16
+                 end if
+              end if
+           end if
+           if( info==0) then
+              if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+                 info = -21
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHBGVX', -info )
+              return
+           end if
+           ! quick return if possible
+           m = 0
+           if( n==0 )return
+           ! form a split cholesky factorization of b.
+           call stdlib_zpbstf( uplo, n, kb, bb, ldbb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem.
+           call stdlib_zhbgst( jobz, uplo, n, ka, kb, ab, ldab, bb, ldbb, q, ldq,work, rwork, &
+                     iinfo )
+           ! solve the standard eigenvalue problem.
+           ! reduce hermitian band matrix to tridiagonal form.
+           indd = 1
+           inde = indd + n
+           indrwk = inde + n
+           indwrk = 1
+           if( wantz ) then
+              vect = 'U'
+           else
+              vect = 'N'
+           end if
+           call stdlib_zhbtrd( vect, uplo, n, ka, ab, ldab, rwork( indd ),rwork( inde ), q, ldq, &
+                     work( indwrk ), iinfo )
+           ! if all eigenvalues are desired and abstol is less than or equal
+           ! to zero, then call stdlib_dsterf or stdlib_zsteqr.  if this fails for some
+           ! eigenvalue, then try stdlib_dstebz.
+           test = .false.
+           if( indeig ) then
+              if( il==1 .and. iu==n ) then
+                 test = .true.
+              end if
+           end if
+           if( ( alleig .or. test ) .and. ( abstol<=zero ) ) then
+              call stdlib_dcopy( n, rwork( indd ), 1, w, 1 )
+              indee = indrwk + 2*n
+              call stdlib_dcopy( n-1, rwork( inde ), 1, rwork( indee ), 1 )
+              if( .not.wantz ) then
+                 call stdlib_dsterf( n, w, rwork( indee ), info )
+              else
+                 call stdlib_zlacpy( 'A', n, n, q, ldq, z, ldz )
+                 call stdlib_zsteqr( jobz, n, w, rwork( indee ), z, ldz,rwork( indrwk ), info )
+                           
+                 if( info==0 ) then
+                    do i = 1, n
+                       ifail( i ) = 0
+                    end do
+                 end if
+              end if
+              if( info==0 ) then
+                 m = n
+                 go to 30
+              end if
+              info = 0
+           end if
+           ! otherwise, call stdlib_dstebz and, if eigenvectors are desired,
+           ! call stdlib_zstein.
+           if( wantz ) then
+              order = 'B'
+           else
+              order = 'E'
+           end if
+           indibl = 1
+           indisp = indibl + n
+           indiwk = indisp + n
+           call stdlib_dstebz( range, order, n, vl, vu, il, iu, abstol,rwork( indd ), rwork( inde &
+           ), m, nsplit, w,iwork( indibl ), iwork( indisp ), rwork( indrwk ),iwork( indiwk ), &
+                     info )
+           if( wantz ) then
+              call stdlib_zstein( n, rwork( indd ), rwork( inde ), m, w,iwork( indibl ), iwork( &
+                        indisp ), z, ldz,rwork( indrwk ), iwork( indiwk ), ifail, info )
+              ! apply unitary matrix used in reduction to tridiagonal
+              ! form to eigenvectors returned by stdlib_zstein.
+              do j = 1, m
+                 call stdlib_zcopy( n, z( 1, j ), 1, work( 1 ), 1 )
+                 call stdlib_zgemv( 'N', n, n, cone, q, ldq, work, 1, czero,z( 1, j ), 1 )
+              end do
+           end if
+           30 continue
+           ! if eigenvalues are not in order, then sort them, along with
+           ! eigenvectors.
+           if( wantz ) then
+              do j = 1, m - 1
+                 i = 0
+                 tmp1 = w( j )
+                 do jj = j + 1, m
+                    if( w( jj )<tmp1 ) then
+                       i = jj
+                       tmp1 = w( jj )
+                    end if
+                 end do
+                 if( i/=0 ) then
+                    itmp1 = iwork( indibl+i-1 )
+                    w( i ) = w( j )
+                    iwork( indibl+i-1 ) = iwork( indibl+j-1 )
+                    w( j ) = tmp1
+                    iwork( indibl+j-1 ) = itmp1
+                    call stdlib_zswap( n, z( 1, i ), 1, z( 1, j ), 1 )
+                    if( info/=0 ) then
+                       itmp1 = ifail( i )
+                       ifail( i ) = ifail( j )
+                       ifail( j ) = itmp1
+                    end if
+                 end if
+              end do
+           end if
+           return
+     end subroutine stdlib_zhbgvx
+
+     !> ZHEEVD: computes all eigenvalues and, optionally, eigenvectors of a
+     !> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
+     !> divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork,lrwork, iwork, liwork,&
+                info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lower, lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwk2, indwrk, iscale, liopt, &
+                     liwmin, llrwk, llwork, llwrk2, lopt, lropt, lrwmin, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lower = stdlib_lsame( uplo, 'L' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( lower .or. stdlib_lsame( uplo, 'U' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 lrwmin = 1
+                 liwmin = 1
+                 lopt = lwmin
+                 lropt = lrwmin
+                 liopt = liwmin
+              else
+                 if( wantz ) then
+                    lwmin = 2*n + n*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n + 1
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+                 lopt = max( lwmin, n +stdlib_ilaenv( 1, 'ZHETRD', uplo, n, -1, -1, -1 ) )
+                 lropt = lrwmin
+                 liopt = liwmin
+              end if
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -8
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -10
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( a( 1, 1 ),KIND=dp)
+              if( wantz )a( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhe( 'M', uplo, n, a, lda, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 )call stdlib_zlascl( uplo, 0, 0, one, sigma, n, n, a, lda, info )
+           ! call stdlib_zhetrd to reduce hermitian matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indwrk = indtau + n
+           indrwk = inde + n
+           indwk2 = indwrk + n*n
+           llwork = lwork - indwrk + 1
+           llwrk2 = lwork - indwk2 + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_zhetrd( uplo, n, a, lda, w, rwork( inde ), work( indtau ),work( indwrk ), &
+                     llwork, iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_zstedc to generate the eigenvector matrix, work(indwrk), of the
+           ! tridiagonal matrix, then call stdlib_zunmtr to multiply it to the
+           ! householder transformations represented as householder vectors in
+           ! a.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zstedc( 'I', n, w, rwork( inde ), work( indwrk ), n,work( indwk2 ), &
+                        llwrk2, rwork( indrwk ), llrwk,iwork, liwork, info )
+              call stdlib_zunmtr( 'L', uplo, 'N', n, n, a, lda, work( indtau ),work( indwrk ), n, &
+                        work( indwk2 ), llwrk2, iinfo )
+              call stdlib_zlacpy( 'A', n, n, work( indwrk ), n, a, lda )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_zheevd
+
+     !> ZHEGVD: computes all the eigenvalues, and optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian and B is also positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zhegvd( itype, jobz, uplo, n, a, lda, b, ldb, w, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, lda, ldb, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: liopt, liwmin, lopt, lropt, lrwmin, lwmin
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( n<=1 ) then
+              lwmin = 1
+              lrwmin = 1
+              liwmin = 1
+           else if( wantz ) then
+              lwmin = 2*n + n*n
+              lrwmin = 1 + 5*n + 2*n*n
+              liwmin = 3 + 5*n
+           else
+              lwmin = n + 1
+              lrwmin = n
+              liwmin = 1
+           end if
+           lopt = lwmin
+           lropt = lrwmin
+           liopt = liwmin
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           end if
+           if( info==0 ) then
+              work( 1 ) = lopt
+              rwork( 1 ) = lropt
+              iwork( 1 ) = liopt
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHEGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_zpotrf( uplo, n, b, ldb, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhegst( itype, uplo, n, a, lda, b, ldb, info )
+           call stdlib_zheevd( jobz, uplo, n, a, lda, w, work, lwork, rwork, lrwork,iwork, liwork,&
+                      info )
+           lopt = max( real( lopt,KIND=dp), real( work( 1 ),KIND=dp) )
+           lropt = max( real( lropt,KIND=dp), real( rwork( 1 ),KIND=dp) )
+           liopt = max( real( liopt,KIND=dp), real( iwork( 1 ),KIND=dp) )
+           if( wantz .and. info==0 ) then
+              ! backtransform eigenvectors to the original problem.
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 call stdlib_ztrsm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 call stdlib_ztrmm( 'LEFT', uplo, trans, 'NON-UNIT', n, n, cone,b, ldb, a, lda )
+                           
+              end if
+           end if
+           work( 1 ) = lopt
+           rwork( 1 ) = lropt
+           iwork( 1 ) = liopt
+           return
+     end subroutine stdlib_zhegvd
+
+     !> ZHPEVD: computes all the eigenvalues and, optionally, eigenvectors of
+     !> a complex Hermitian matrix A in packed storage.  If eigenvectors are
+     !> desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zhpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork,rwork, lrwork, iwork, &
+               liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, wantz
+           integer(ilp) :: iinfo, imax, inde, indrwk, indtau, indwrk, iscale, liwmin, llrwk, &
+                     llwrk, lrwmin, lwmin
+           real(dp) :: anrm, bignum, eps, rmax, rmin, safmin, sigma, smlnum
+           ! Intrinsic Functions 
+           intrinsic :: sqrt
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -1
+           else if( .not.( stdlib_lsame( uplo, 'L' ) .or. stdlib_lsame( uplo, 'U' ) ) )&
+                     then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -9
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -11
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -13
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPEVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           if( n==1 ) then
+              w( 1 ) = real( ap( 1 ),KIND=dp)
+              if( wantz )z( 1, 1 ) = cone
+              return
+           end if
+           ! get machine constants.
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           eps = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin / eps
+           bignum = one / smlnum
+           rmin = sqrt( smlnum )
+           rmax = sqrt( bignum )
+           ! scale matrix to allowable range, if necessary.
+           anrm = stdlib_zlanhp( 'M', uplo, n, ap, rwork )
+           iscale = 0
+           if( anrm>zero .and. anrm<rmin ) then
+              iscale = 1
+              sigma = rmin / anrm
+           else if( anrm>rmax ) then
+              iscale = 1
+              sigma = rmax / anrm
+           end if
+           if( iscale==1 ) then
+              call stdlib_zdscal( ( n*( n+1 ) ) / 2, sigma, ap, 1 )
+           end if
+           ! call stdlib_zhptrd to reduce hermitian packed matrix to tridiagonal form.
+           inde = 1
+           indtau = 1
+           indrwk = inde + n
+           indwrk = indtau + n
+           llwrk = lwork - indwrk + 1
+           llrwk = lrwork - indrwk + 1
+           call stdlib_zhptrd( uplo, n, ap, w, rwork( inde ), work( indtau ),iinfo )
+           ! for eigenvalues only, call stdlib_dsterf.  for eigenvectors, first call
+           ! stdlib_zupgtr to generate the orthogonal matrix, then call stdlib_zstedc.
+           if( .not.wantz ) then
+              call stdlib_dsterf( n, w, rwork( inde ), info )
+           else
+              call stdlib_zstedc( 'I', n, w, rwork( inde ), z, ldz, work( indwrk ),llwrk, rwork( &
+                        indrwk ), llrwk, iwork, liwork,info )
+              call stdlib_zupmtr( 'L', uplo, 'N', n, n, ap, work( indtau ), z, ldz,work( indwrk ),&
+                         iinfo )
+           end if
+           ! if matrix was scaled, then rescale eigenvalues appropriately.
+           if( iscale==1 ) then
+              if( info==0 ) then
+                 imax = n
+              else
+                 imax = info - 1
+              end if
+              call stdlib_dscal( imax, one / sigma, w, 1 )
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zhpevd
+
+     !> ZHPGVD: computes all the eigenvalues and, optionally, the eigenvectors
+     !> of a complex generalized Hermitian-definite eigenproblem, of the form
+     !> A*x=(lambda)*B*x,  A*Bx=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
+     !> B are assumed to be Hermitian, stored in packed format, and B is also
+     !> positive definite.
+     !> If eigenvectors are desired, it uses a divide and conquer algorithm.
+     !> The divide and conquer algorithm makes very mild assumptions about
+     !> floating point arithmetic. It will work on machines with a guard
+     !> digit in add/subtract, or on those binary machines without guard
+     !> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+     !> Cray-2. It could conceivably fail on hexadecimal or decimal machines
+     !> without guard digits, but we know of none.
+
+     subroutine stdlib_zhpgvd( itype, jobz, uplo, n, ap, bp, w, z, ldz, work,lwork, rwork, lrwork,&
+                iwork, liwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobz, uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: itype, ldz, liwork, lrwork, lwork, n
+           ! Array Arguments 
+           integer(ilp), intent(out) :: iwork(*)
+           real(dp), intent(out) :: rwork(*), w(*)
+           complex(dp), intent(inout) :: ap(*), bp(*)
+           complex(dp), intent(out) :: work(*), z(ldz,*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery, upper, wantz
+           character :: trans
+           integer(ilp) :: j, liwmin, lrwmin, lwmin, neig
+           ! Intrinsic Functions 
+           intrinsic :: real,max
+           ! Executable Statements 
+           ! test the input parameters.
+           wantz = stdlib_lsame( jobz, 'V' )
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 .or. lrwork==-1 .or. liwork==-1 )
+           info = 0
+           if( itype<1 .or. itype>3 ) then
+              info = -1
+           else if( .not.( wantz .or. stdlib_lsame( jobz, 'N' ) ) ) then
+              info = -2
+           else if( .not.( upper .or. stdlib_lsame( uplo, 'L' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ldz<1 .or. ( wantz .and. ldz<n ) ) then
+              info = -9
+           end if
+           if( info==0 ) then
+              if( n<=1 ) then
+                 lwmin = 1
+                 liwmin = 1
+                 lrwmin = 1
+              else
+                 if( wantz ) then
+                    lwmin = 2*n
+                    lrwmin = 1 + 5*n + 2*n**2
+                    liwmin = 3 + 5*n
+                 else
+                    lwmin = n
+                    lrwmin = n
+                    liwmin = 1
+                 end if
+              end if
+              work( 1 ) = lwmin
+              rwork( 1 ) = lrwmin
+              iwork( 1 ) = liwmin
+              if( lwork<lwmin .and. .not.lquery ) then
+                 info = -11
+              else if( lrwork<lrwmin .and. .not.lquery ) then
+                 info = -13
+              else if( liwork<liwmin .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHPGVD', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! form a cholesky factorization of b.
+           call stdlib_zpptrf( uplo, n, bp, info )
+           if( info/=0 ) then
+              info = n + info
+              return
+           end if
+           ! transform problem to standard eigenvalue problem and solve.
+           call stdlib_zhpgst( itype, uplo, n, ap, bp, info )
+           call stdlib_zhpevd( jobz, uplo, n, ap, w, z, ldz, work, lwork, rwork,lrwork, iwork, &
+                     liwork, info )
+           lwmin = max( real( lwmin,KIND=dp), real( work( 1 ),KIND=dp) )
+           lrwmin = max( real( lrwmin,KIND=dp), real( rwork( 1 ),KIND=dp) )
+           liwmin = max( real( liwmin,KIND=dp), real( iwork( 1 ),KIND=dp) )
+           if( wantz ) then
+              ! backtransform eigenvectors to the original problem.
+              neig = n
+              if( info>0 )neig = info - 1
+              if( itype==1 .or. itype==2 ) then
+                 ! for a*x=(lambda)*b*x and a*b*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = inv(l)**h *y or inv(u)*y
+                 if( upper ) then
+                    trans = 'N'
+                 else
+                    trans = 'C'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ztpsv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              else if( itype==3 ) then
+                 ! for b*a*x=(lambda)*x;
+                 ! backtransform eigenvectors: x = l*y or u**h *y
+                 if( upper ) then
+                    trans = 'C'
+                 else
+                    trans = 'N'
+                 end if
+                 do j = 1, neig
+                    call stdlib_ztpmv( uplo, trans, 'NON-UNIT', n, bp, z( 1, j ),1 )
+                 end do
+              end if
+           end if
+           work( 1 ) = lwmin
+           rwork( 1 ) = lrwmin
+           iwork( 1 ) = liwmin
+           return
+     end subroutine stdlib_zhpgvd
+
+     !> ZGEES: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left.
+     !> The leading columns of Z then form an orthonormal basis for the
+     !> invariant subspace corresponding to the selected eigenvalues.
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_zgees( jobvs, sort, select, n, a, lda, sdim, w, vs,ldvs, work, lwork, &
+               rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_z) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantst, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, maxwrk, &
+                     minwrk
+           real(dp) :: anrm, bignum, cscale, eps, s, sep, smlnum
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -4
+           else if( lda<max( 1, n ) ) then
+              info = -6
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_zhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_zhseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=dp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEES ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_zlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_zgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_zgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_zlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_zhseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_zlascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues and transform schur vectors
+              ! (cworkspace: none)
+              ! (rworkspace: none)
+              call stdlib_ztrsen( 'N', jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,s, sep, work( &
+                        iwrk ), lwork-iwrk+1, icond )
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_zgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_zlascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_zcopy( n, a, lda+1, w, 1 )
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgees
+
+     !> ZGEESX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues, the Schur form T, and, optionally, the matrix of Schur
+     !> vectors Z.  This gives the Schur factorization A = Z*T*(Z**H).
+     !> Optionally, it also orders the eigenvalues on the diagonal of the
+     !> Schur form so that selected eigenvalues are at the top left;
+     !> computes a reciprocal condition number for the average of the
+     !> selected eigenvalues (RCONDE); and computes a reciprocal condition
+     !> number for the right invariant subspace corresponding to the
+     !> selected eigenvalues (RCONDV).  The leading columns of Z form an
+     !> orthonormal basis for this invariant subspace.
+     !> For further explanation of the reciprocal condition numbers RCONDE
+     !> and RCONDV, see Section 4.10_dp of the LAPACK Users' Guide (where
+     !> these quantities are called s and sep respectively).
+     !> A complex matrix is in Schur form if it is upper triangular.
+
+     subroutine stdlib_zgeesx( jobvs, sort, select, sense, n, a, lda, sdim, w,vs, ldvs, rconde, &
+               rcondv, work, lwork, rwork,bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvs, sense, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldvs, lwork, n
+           real(dp), intent(out) :: rconde, rcondv
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: vs(ldvs,*), w(*), work(*)
+           ! Function Arguments 
+           procedure(stdlib_select_z) :: select
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantsb, wantse, wantsn, wantst, wantsv, wantvs
+           integer(ilp) :: hswork, i, ibal, icond, ierr, ieval, ihi, ilo, itau, iwrk, lwrk, &
+                     maxwrk, minwrk
+           real(dp) :: anrm, bignum, cscale, eps, smlnum
+           ! Local Arrays 
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           wantvs = stdlib_lsame( jobvs, 'V' )
+           wantst = stdlib_lsame( sort, 'S' )
+           wantsn = stdlib_lsame( sense, 'N' )
+           wantse = stdlib_lsame( sense, 'E' )
+           wantsv = stdlib_lsame( sense, 'V' )
+           wantsb = stdlib_lsame( sense, 'B' )
+           lquery = ( lwork==-1 )
+           if( ( .not.wantvs ) .and. ( .not.stdlib_lsame( jobvs, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -2
+           else if( .not.( wantsn .or. wantse .or. wantsv .or. wantsb ) .or.( .not.wantst .and. &
+                     .not.wantsn ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvs<1 .or. ( wantvs .and. ldvs<n ) ) then
+              info = -11
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of real workspace needed at that point in the
+             ! code, as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_zhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.
+             ! if sense = 'e', 'v' or 'b', then the amount of workspace needed
+             ! depends on sdim, which is computed by the routine stdlib_ztrsen later
+             ! in the code.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 lwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 call stdlib_zhseqr( 'S', jobvs, n, 1, n, a, lda, w, vs, ldvs,work, -1, ieval )
+                           
+                 hswork = real( work( 1 ),KIND=dp)
+                 if( .not.wantvs ) then
+                    maxwrk = max( maxwrk, hswork )
+                 else
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    maxwrk = max( maxwrk, hswork )
+                 end if
+                 lwrk = maxwrk
+                 if( .not.wantsn )lwrk = max( lwrk, ( n*n )/2 )
+              end if
+              work( 1 ) = lwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -15
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEESX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_zlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_zgebal( 'P', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = n + itau
+           call stdlib_zgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvs ) then
+              ! copy householder vectors to vs
+              call stdlib_zlacpy( 'L', n, n, a, lda, vs, ldvs )
+              ! generate unitary matrix in vs
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vs, ldvs, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+           end if
+           sdim = 0
+           ! perform qr iteration, accumulating schur vectors in vs if desired
+           ! (cworkspace: need 1, prefer hswork (see comments) )
+           ! (rworkspace: none)
+           iwrk = itau
+           call stdlib_zhseqr( 'S', jobvs, n, ilo, ihi, a, lda, w, vs, ldvs,work( iwrk ), lwork-&
+                     iwrk+1, ieval )
+           if( ieval>0 )info = ieval
+           ! sort eigenvalues if desired
+           if( wantst .and. info==0 ) then
+              if( scalea )call stdlib_zlascl( 'G', 0, 0, cscale, anrm, n, 1, w, n, ierr )
+              do i = 1, n
+                 bwork( i ) = select( w( i ) )
+              end do
+              ! reorder eigenvalues, transform schur vectors, and compute
+              ! reciprocal condition numbers
+              ! (cworkspace: if sense is not 'n', need 2*sdim*(n-sdim)
+                           ! otherwise, need none )
+              ! (rworkspace: none)
+              call stdlib_ztrsen( sense, jobvs, bwork, n, a, lda, vs, ldvs, w, sdim,rconde, &
+                        rcondv, work( iwrk ), lwork-iwrk+1,icond )
+              if( .not.wantsn )maxwrk = max( maxwrk, 2*sdim*( n-sdim ) )
+              if( icond==-14 ) then
+                 ! not enough complex workspace
+                 info = -15
+              end if
+           end if
+           if( wantvs ) then
+              ! undo balancing
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_zgebak( 'P', 'R', n, ilo, ihi, rwork( ibal ), n, vs, ldvs,ierr )
+           end if
+           if( scalea ) then
+              ! undo scaling for the schur form of a
+              call stdlib_zlascl( 'U', 0, 0, cscale, anrm, n, n, a, lda, ierr )
+              call stdlib_zcopy( n, a, lda+1, w, 1 )
+              if( ( wantsv .or. wantsb ) .and. info==0 ) then
+                 dum( 1 ) = rcondv
+                 call stdlib_dlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+                 rcondv = dum( 1 )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgeesx
+
+     !> ZGEEV: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+
+     subroutine stdlib_zgeev( jobvl, jobvr, n, a, lda, w, vl, ldvl, vr, ldvr,work, lwork, rwork, &
+               info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr
+           character :: side
+           integer(ilp) :: hswork, i, ibal, ierr, ihi, ilo, irwork, itau, iwrk, k, lwork_trevc, &
+                     maxwrk, minwrk, nout
+           real(dp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(dp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -1
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -8
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -10
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_zhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 minwrk = 2*n
+                 if( wantvl ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_ztrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_zhseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    call stdlib_ztrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, n + lwork_trevc )
+                    call stdlib_zhseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    call stdlib_zhseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 maxwrk = max( maxwrk, hswork, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -12
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEV ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_zlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix
+           ! (cworkspace: none)
+           ! (rworkspace: need n)
+           ibal = 1
+           call stdlib_zgebal( 'B', n, a, lda, ilo, ihi, rwork( ibal ), ierr )
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_zgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_zlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_zlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_zlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( 'E', 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_zhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need 2*n)
+              irwork = ibal + n
+              call stdlib_ztrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork( irwork ), n, ierr )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_zgebak( 'B', 'L', n, ilo, ihi, rwork( ibal ), n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_dznrm2( n, vl( 1, i ), 1 )
+                 call stdlib_zdscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vl( k, i ),KIND=dp)**2 +aimag( vl( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_idamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_zscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=dp), zero,KIND=dp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              ! (cworkspace: none)
+              ! (rworkspace: need n)
+              call stdlib_zgebak( 'B', 'R', n, ilo, ihi, rwork( ibal ), n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_dznrm2( n, vr( 1, i ), 1 )
+                 call stdlib_zdscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( irwork+k-1 ) = real( vr( k, i ),KIND=dp)**2 +aimag( vr( k, i ) )&
+                              **2
+                 end do
+                 k = stdlib_idamax( n, rwork( irwork ), 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( irwork+k-1 ) )
+                 call stdlib_zscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=dp), zero,KIND=dp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_zlascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info>0 ) then
+                 call stdlib_zlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgeev
+
+     !> ZGEEVX: computes for an N-by-N complex nonsymmetric matrix A, the
+     !> eigenvalues and, optionally, the left and/or right eigenvectors.
+     !> Optionally also, it computes a balancing transformation to improve
+     !> the conditioning of the eigenvalues and eigenvectors (ILO, IHI,
+     !> SCALE, and ABNRM), reciprocal condition numbers for the eigenvalues
+     !> (RCONDE), and reciprocal condition numbers for the right
+     !> eigenvectors (RCONDV).
+     !> The right eigenvector v(j) of A satisfies
+     !> A * v(j) = lambda(j) * v(j)
+     !> where lambda(j) is its eigenvalue.
+     !> The left eigenvector u(j) of A satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H
+     !> where u(j)**H denotes the conjugate transpose of u(j).
+     !> The computed eigenvectors are normalized to have Euclidean norm
+     !> equal to 1 and largest component real.
+     !> Balancing a matrix means permuting the rows and columns to make it
+     !> more nearly upper triangular, and applying a diagonal similarity
+     !> transformation D * A * D**(-1), where D is a diagonal matrix, to
+     !> make its rows and columns closer in norm and the condition numbers
+     !> of its eigenvalues and eigenvectors smaller.  The computed
+     !> reciprocal condition numbers correspond to the balanced matrix.
+     !> Permuting rows and columns will not change the condition numbers
+     !> (in exact arithmetic) but diagonal scaling will.  For further
+     !> explanation of balancing, see section 4.10.2_dp of the LAPACK
+     !> Users' Guide.
+
+     subroutine stdlib_zgeevx( balanc, jobvl, jobvr, sense, n, a, lda, w, vl,ldvl, vr, ldvr, ilo, &
+               ihi, scale, abnrm, rconde,rcondv, work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: balanc, jobvl, jobvr, sense
+           integer(ilp), intent(out) :: ihi, ilo, info
+           integer(ilp), intent(in) :: lda, ldvl, ldvr, lwork, n
+           real(dp), intent(out) :: abnrm
+           ! Array Arguments 
+           real(dp), intent(out) :: rconde(*), rcondv(*), rwork(*), scale(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: vl(ldvl,*), vr(ldvr,*), w(*), work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, scalea, wantvl, wantvr, wntsnb, wntsne, wntsnn, wntsnv
+           character :: job, side
+           integer(ilp) :: hswork, i, icond, ierr, itau, iwrk, k, lwork_trevc, maxwrk, minwrk, &
+                     nout
+           real(dp) :: anrm, bignum, cscale, eps, scl, smlnum
+           complex(dp) :: tmp
+           ! Local Arrays 
+           logical(lk) :: select(1)
+           real(dp) :: dum(1)
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,conjg,aimag,max,sqrt
+           ! Executable Statements 
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           wantvl = stdlib_lsame( jobvl, 'V' )
+           wantvr = stdlib_lsame( jobvr, 'V' )
+           wntsnn = stdlib_lsame( sense, 'N' )
+           wntsne = stdlib_lsame( sense, 'E' )
+           wntsnv = stdlib_lsame( sense, 'V' )
+           wntsnb = stdlib_lsame( sense, 'B' )
+           if( .not.( stdlib_lsame( balanc, 'N' ) .or. stdlib_lsame( balanc, 'S' ) &
+                     .or.stdlib_lsame( balanc, 'P' ) .or. stdlib_lsame( balanc, 'B' ) ) ) then
+              info = -1
+           else if( ( .not.wantvl ) .and. ( .not.stdlib_lsame( jobvl, 'N' ) ) ) then
+              info = -2
+           else if( ( .not.wantvr ) .and. ( .not.stdlib_lsame( jobvr, 'N' ) ) ) then
+              info = -3
+           else if( .not.( wntsnn .or. wntsne .or. wntsnb .or. wntsnv ) .or.( ( wntsne .or. &
+                     wntsnb ) .and. .not.( wantvl .and.wantvr ) ) ) then
+              info = -4
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( wantvl .and. ldvl<n ) ) then
+              info = -10
+           else if( ldvr<1 .or. ( wantvr .and. ldvr<n ) ) then
+              info = -12
+           end if
+           ! compute workspace
+            ! (note: comments in the code beginning "workspace:" describe the
+             ! minimal amount of workspace needed at that point in the code,
+             ! as well as the preferred amount for good performance.
+             ! cworkspace refers to complex workspace, and rworkspace to real
+             ! workspace. nb refers to the optimal block size for the
+             ! immediately following subroutine, as returned by stdlib_ilaenv.
+             ! hswork refers to the workspace preferred by stdlib_zhseqr, as
+             ! calculated below. hswork is computed assuming ilo=1 and ihi=n,
+             ! the worst case.)
+           if( info==0 ) then
+              if( n==0 ) then
+                 minwrk = 1
+                 maxwrk = 1
+              else
+                 maxwrk = n + n*stdlib_ilaenv( 1, 'ZGEHRD', ' ', n, 1, n, 0 )
+                 if( wantvl ) then
+                    call stdlib_ztrevc3( 'L', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_zhseqr( 'S', 'V', n, 1, n, a, lda, w, vl, ldvl,work, -1, info )
+                              
+                 else if( wantvr ) then
+                    call stdlib_ztrevc3( 'R', 'B', select, n, a, lda,vl, ldvl, vr, ldvr,n, nout, &
+                              work, -1, rwork, -1, ierr )
+                    lwork_trevc = int( work(1),KIND=ilp)
+                    maxwrk = max( maxwrk, lwork_trevc )
+                    call stdlib_zhseqr( 'S', 'V', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                              
+                 else
+                    if( wntsnn ) then
+                       call stdlib_zhseqr( 'E', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    else
+                       call stdlib_zhseqr( 'S', 'N', n, 1, n, a, lda, w, vr, ldvr,work, -1, info )
+                                 
+                    end if
+                 end if
+                 hswork = int( work(1),KIND=ilp)
+                 if( ( .not.wantvl ) .and. ( .not.wantvr ) ) then
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                 else
+                    minwrk = 2*n
+                    if( .not.( wntsnn .or. wntsne ) )minwrk = max( minwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, hswork )
+                    maxwrk = max( maxwrk, n + ( n - 1 )*stdlib_ilaenv( 1, 'ZUNGHR',' ', n, 1, n, -&
+                              1 ) )
+                    if( .not.( wntsnn .or. wntsne ) )maxwrk = max( maxwrk, n*n + 2*n )
+                    maxwrk = max( maxwrk, 2*n )
+                 end if
+                 maxwrk = max( maxwrk, minwrk )
+              end if
+              work( 1 ) = maxwrk
+              if( lwork<minwrk .and. .not.lquery ) then
+                 info = -20
+              end if
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGEEVX', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           icond = 0
+           anrm = stdlib_zlange( 'M', n, n, a, lda, dum )
+           scalea = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              scalea = .true.
+              cscale = smlnum
+           else if( anrm>bignum ) then
+              scalea = .true.
+              cscale = bignum
+           end if
+           if( scalea )call stdlib_zlascl( 'G', 0, 0, anrm, cscale, n, n, a, lda, ierr )
+           ! balance the matrix and compute abnrm
+           call stdlib_zgebal( balanc, n, a, lda, ilo, ihi, scale, ierr )
+           abnrm = stdlib_zlange( '1', n, n, a, lda, dum )
+           if( scalea ) then
+              dum( 1 ) = abnrm
+              call stdlib_dlascl( 'G', 0, 0, cscale, anrm, 1, 1, dum, 1, ierr )
+              abnrm = dum( 1 )
+           end if
+           ! reduce to upper hessenberg form
+           ! (cworkspace: need 2*n, prefer n+n*nb)
+           ! (rworkspace: none)
+           itau = 1
+           iwrk = itau + n
+           call stdlib_zgehrd( n, ilo, ihi, a, lda, work( itau ), work( iwrk ),lwork-iwrk+1, ierr &
+                     )
+           if( wantvl ) then
+              ! want left eigenvectors
+              ! copy householder vectors to vl
+              side = 'L'
+              call stdlib_zlacpy( 'L', n, n, a, lda, vl, ldvl )
+              ! generate unitary matrix in vl
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vl, ldvl, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vl
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vl, ldvl,work( iwrk ), lwork-&
+                        iwrk+1, info )
+              if( wantvr ) then
+                 ! want left and right eigenvectors
+                 ! copy schur vectors to vr
+                 side = 'B'
+                 call stdlib_zlacpy( 'F', n, n, vl, ldvl, vr, ldvr )
+              end if
+           else if( wantvr ) then
+              ! want right eigenvectors
+              ! copy householder vectors to vr
+              side = 'R'
+              call stdlib_zlacpy( 'L', n, n, a, lda, vr, ldvr )
+              ! generate unitary matrix in vr
+              ! (cworkspace: need 2*n-1, prefer n+(n-1)*nb)
+              ! (rworkspace: none)
+              call stdlib_zunghr( n, ilo, ihi, vr, ldvr, work( itau ), work( iwrk ),lwork-iwrk+1, &
+                        ierr )
+              ! perform qr iteration, accumulating schur vectors in vr
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( 'S', 'V', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           else
+              ! compute eigenvalues only
+              ! if condition numbers desired, compute schur form
+              if( wntsnn ) then
+                 job = 'E'
+              else
+                 job = 'S'
+              end if
+              ! (cworkspace: need 1, prefer hswork (see comments) )
+              ! (rworkspace: none)
+              iwrk = itau
+              call stdlib_zhseqr( job, 'N', n, ilo, ihi, a, lda, w, vr, ldvr,work( iwrk ), lwork-&
+                        iwrk+1, info )
+           end if
+           ! if info /= 0 from stdlib_zhseqr, then quit
+           if( info/=0 )go to 50
+           if( wantvl .or. wantvr ) then
+              ! compute left and/or right eigenvectors
+              ! (cworkspace: need 2*n, prefer n + 2*n*nb)
+              ! (rworkspace: need n)
+              call stdlib_ztrevc3( side, 'B', select, n, a, lda, vl, ldvl, vr, ldvr,n, nout, work(&
+                         iwrk ), lwork-iwrk+1,rwork, n, ierr )
+           end if
+           ! compute condition numbers if desired
+           ! (cworkspace: need n*n+2*n unless sense = 'e')
+           ! (rworkspace: need 2*n unless sense = 'e')
+           if( .not.wntsnn ) then
+              call stdlib_ztrsna( sense, 'A', select, n, a, lda, vl, ldvl, vr, ldvr,rconde, &
+                        rcondv, n, nout, work( iwrk ), n, rwork,icond )
+           end if
+           if( wantvl ) then
+              ! undo balancing of left eigenvectors
+              call stdlib_zgebak( balanc, 'L', n, ilo, ihi, scale, n, vl, ldvl,ierr )
+              ! normalize left eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_dznrm2( n, vl( 1, i ), 1 )
+                 call stdlib_zdscal( n, scl, vl( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vl( k, i ),KIND=dp)**2 +aimag( vl( k, i ) )**2
+                 end do
+                 k = stdlib_idamax( n, rwork, 1 )
+                 tmp = conjg( vl( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_zscal( n, tmp, vl( 1, i ), 1 )
+                 vl( k, i ) = cmplx( real( vl( k, i ),KIND=dp), zero,KIND=dp)
+              end do
+           end if
+           if( wantvr ) then
+              ! undo balancing of right eigenvectors
+              call stdlib_zgebak( balanc, 'R', n, ilo, ihi, scale, n, vr, ldvr,ierr )
+              ! normalize right eigenvectors and make largest component real
+              do i = 1, n
+                 scl = one / stdlib_dznrm2( n, vr( 1, i ), 1 )
+                 call stdlib_zdscal( n, scl, vr( 1, i ), 1 )
+                 do k = 1, n
+                    rwork( k ) = real( vr( k, i ),KIND=dp)**2 +aimag( vr( k, i ) )**2
+                 end do
+                 k = stdlib_idamax( n, rwork, 1 )
+                 tmp = conjg( vr( k, i ) ) / sqrt( rwork( k ) )
+                 call stdlib_zscal( n, tmp, vr( 1, i ), 1 )
+                 vr( k, i ) = cmplx( real( vr( k, i ),KIND=dp), zero,KIND=dp)
+              end do
+           end if
+           ! undo scaling if necessary
+           50 continue
+           if( scalea ) then
+              call stdlib_zlascl( 'G', 0, 0, cscale, anrm, n-info, 1, w( info+1 ),max( n-info, 1 )&
+                        , ierr )
+              if( info==0 ) then
+                 if( ( wntsnv .or. wntsnb ) .and. icond==0 )call stdlib_dlascl( 'G', 0, 0, cscale,&
+                            anrm, n, 1, rcondv, n,ierr )
+              else
+                 call stdlib_zlascl( 'G', 0, 0, cscale, anrm, ilo-1, 1, w, n, ierr )
+              end if
+           end if
+           work( 1 ) = maxwrk
+           return
+     end subroutine stdlib_zgeevx
+
+     !> ZGEJSV: computes the singular value decomposition (SVD) of a complex M-by-N
+     !> matrix [A], where M >= N. The SVD of [A] is written as
+     !> [A] = [U] * [SIGMA] * [V]^*,
+     !> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+     !> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+     !> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+     !> the singular values of [A]. The columns of [U] and [V] are the left and
+     !> the right singular vectors of [A], respectively. The matrices [U] and [V]
+     !> are computed and stored in the arrays U and V, respectively. The diagonal
+     !> of [SIGMA] is computed and stored in the array SVA.
+
+     pure subroutine stdlib_zgejsv( joba, jobu, jobv, jobr, jobt, jobp,m, n, a, lda, sva, u, ldu, &
+               v, ldv,cwork, lwork, rwork, lrwork, iwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldu, ldv, lwork, lrwork, m, n
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: u(ldu,*), v(ldv,*), cwork(lwork)
+           real(dp), intent(out) :: sva(n), rwork(lrwork)
+           integer(ilp), intent(out) :: iwork(*)
+           character, intent(in) :: joba, jobp, jobr, jobt, jobu, jobv
+        ! ===========================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: ctemp
+           real(dp) :: aapp, aaqq, aatmax, aatmin, big, big1, cond_ok, condr1, condr2, entra, &
+                     entrat, epsln, maxprj, scalem, sconda, sfmin, small, temp1, uscal1, uscal2, xsc
+           integer(ilp) :: ierr, n1, nr, numrank, p, q, warning
+           logical(lk) :: almort, defr, errest, goscal, jracc, kill, lquery, lsvec, l2aber, &
+                     l2kill, l2pert, l2rank, l2tran, noscal, rowpiv, rsvec, transp
+           integer(ilp) :: optwrk, minwrk, minrwrk, miniwrk
+           integer(ilp) :: lwcon, lwlqf, lwqp3, lwqrf, lwunmlq, lwunmqr, lwunmqrm, lwsvdj, &
+                     lwsvdjv, lrwqp3, lrwcon, lrwsvdj, iwoff
+           integer(ilp) :: lwrk_zgelqf, lwrk_zgeqp3, lwrk_zgeqp3n, lwrk_zgeqrf, lwrk_zgesvj, &
+                     lwrk_zgesvjv, lwrk_zgesvju, lwrk_zunmlq, lwrk_zunmqr, lwrk_zunmqrm
+           ! Local Arrays
+           complex(dp) :: cdummy(1)
+           real(dp) :: rdummy(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,cmplx,conjg,log,max,min,real,nint,sqrt
+           ! test the input arguments
+           lsvec  = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           jracc  = stdlib_lsame( jobv, 'J' )
+           rsvec  = stdlib_lsame( jobv, 'V' ) .or. jracc
+           rowpiv = stdlib_lsame( joba, 'F' ) .or. stdlib_lsame( joba, 'G' )
+           l2rank = stdlib_lsame( joba, 'R' )
+           l2aber = stdlib_lsame( joba, 'A' )
+           errest = stdlib_lsame( joba, 'E' ) .or. stdlib_lsame( joba, 'G' )
+           l2tran = stdlib_lsame( jobt, 'T' ) .and. ( m == n )
+           l2kill = stdlib_lsame( jobr, 'R' )
+           defr   = stdlib_lsame( jobr, 'N' )
+           l2pert = stdlib_lsame( jobp, 'P' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if ( .not.(rowpiv .or. l2rank .or. l2aber .or.errest .or. stdlib_lsame( joba, 'C' ) )) &
+                     then
+              info = - 1
+           else if ( .not.( lsvec .or. stdlib_lsame( jobu, 'N' ) .or.( stdlib_lsame( jobu, 'W' ) &
+                     .and. rsvec .and. l2tran ) ) ) then
+              info = - 2
+           else if ( .not.( rsvec .or. stdlib_lsame( jobv, 'N' ) .or.( stdlib_lsame( jobv, 'W' ) &
+                     .and. lsvec .and. l2tran ) ) ) then
+              info = - 3
+           else if ( .not. ( l2kill .or. defr ) )    then
+              info = - 4
+           else if ( .not. ( stdlib_lsame(jobt,'T') .or. stdlib_lsame(jobt,'N') ) ) then
+              info = - 5
+           else if ( .not. ( l2pert .or. stdlib_lsame( jobp, 'N' ) ) ) then
+              info = - 6
+           else if ( m < 0 ) then
+              info = - 7
+           else if ( ( n < 0 ) .or. ( n > m ) ) then
+              info = - 8
+           else if ( lda < m ) then
+              info = - 10
+           else if ( lsvec .and. ( ldu < m ) ) then
+              info = - 13
+           else if ( rsvec .and. ( ldv < n ) ) then
+              info = - 15
+           else
+              ! #:)
+              info = 0
+           end if
+           if ( info == 0 ) then
+               ! Compute The Minimal And The Optimal Workspace Lengths
+               ! [[the expressions for computing the minimal and the optimal
+               ! values of lcwork, lrwork are written with a lot of redundancy and
+               ! can be simplified. however, this verbose form is useful for
+               ! maintenance and modifications of the code.]]
+              ! .. minimal workspace length for stdlib_zgeqp3 of an m x n matrix,
+               ! stdlib_zgeqrf of an n x n matrix, stdlib_zgelqf of an n x n matrix,
+               ! stdlib_zunmlq for computing n x n matrix, stdlib_zunmqr for computing n x n
+               ! matrix, stdlib_zunmqr for computing m x n matrix, respectively.
+               lwqp3 = n+1
+               lwqrf = max( 1, n )
+               lwlqf = max( 1, n )
+               lwunmlq  = max( 1, n )
+               lwunmqr  = max( 1, n )
+               lwunmqrm = max( 1, m )
+              ! Minimal Workspace Length For Stdlib_Zpocon Of An N X N Matrix
+               lwcon = 2 * n
+              ! .. minimal workspace length for stdlib_zgesvj of an n x n matrix,
+               ! without and with explicit accumulation of jacobi rotations
+               lwsvdj  = max( 2 * n, 1 )
+               lwsvdjv = max( 2 * n, 1 )
+               ! .. minimal real workspace length for stdlib_zgeqp3, stdlib_zpocon, stdlib_zgesvj
+               lrwqp3  = 2 * n
+               lrwcon  = n
+               lrwsvdj = n
+               if ( lquery ) then
+                   call stdlib_zgeqp3( m, n, a, lda, iwork, cdummy, cdummy, -1,rdummy, ierr )
+                             
+                   lwrk_zgeqp3 = real( cdummy(1),KIND=dp)
+                   call stdlib_zgeqrf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_zgeqrf = real( cdummy(1),KIND=dp)
+                   call stdlib_zgelqf( n, n, a, lda, cdummy, cdummy,-1, ierr )
+                   lwrk_zgelqf = real( cdummy(1),KIND=dp)
+               end if
+               minwrk  = 2
+               optwrk  = 2
+               miniwrk = n
+               if ( .not. (lsvec .or. rsvec ) ) then
+                   ! Minimal And Optimal Sizes Of The Complex Workspace If
+                   ! only the singular values are requested
+                   if ( errest ) then
+                       minwrk = max( n+lwqp3, n**2+lwcon, n+lwqrf, lwsvdj )
+                   else
+                       minwrk = max( n+lwqp3, n+lwqrf, lwsvdj )
+                   end if
+                   if ( lquery ) then
+                       call stdlib_zgesvj( 'L', 'N', 'N', n, n, a, lda, sva, n, v,ldv, cdummy, -1,&
+                                  rdummy, -1, ierr )
+                       lwrk_zgesvj = real( cdummy(1),KIND=dp)
+                       if ( errest ) then
+                           optwrk = max( n+lwrk_zgeqp3, n**2+lwcon,n+lwrk_zgeqrf, lwrk_zgesvj )
+                                     
+                       else
+                           optwrk = max( n+lwrk_zgeqp3, n+lwrk_zgeqrf,lwrk_zgesvj )
+                       end if
+                   end if
+                   if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                   else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwcon, lrwsvdj )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                   end if
+                   if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( rsvec .and. (.not.lsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the right singular vectors are requested
+                  if ( errest ) then
+                      minwrk = max( n+lwqp3, lwcon, lwsvdj, n+lwlqf,2*n+lwqrf, n+lwsvdj, n+&
+                                lwunmlq )
+                  else
+                      minwrk = max( n+lwqp3, lwsvdj, n+lwlqf, 2*n+lwqrf,n+lwsvdj, n+lwunmlq )
+                                
+                  end if
+                  if ( lquery ) then
+                      call stdlib_zgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_zgesvj = real( cdummy(1),KIND=dp)
+                      call stdlib_zunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmlq = real( cdummy(1),KIND=dp)
+                      if ( errest ) then
+                      optwrk = max( n+lwrk_zgeqp3, lwcon, lwrk_zgesvj,n+lwrk_zgelqf, 2*n+&
+                                lwrk_zgeqrf,n+lwrk_zgesvj,  n+lwrk_zunmlq )
+                      else
+                      optwrk = max( n+lwrk_zgeqp3, lwrk_zgesvj,n+lwrk_zgelqf,2*n+lwrk_zgeqrf, n+&
+                                lwrk_zgesvj,n+lwrk_zunmlq )
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                       if ( errest ) then
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                       end if
+                  else
+                       if ( errest ) then
+                          minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                       else
+                          minrwrk = max( 7, lrwqp3, lrwsvdj )
+                       end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else if ( lsvec .and. (.not.rsvec) ) then
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! singular values and the left singular vectors are requested
+                  if ( errest ) then
+                      minwrk = n + max( lwqp3,lwcon,n+lwqrf,lwsvdj,lwunmqrm )
+                  else
+                      minwrk = n + max( lwqp3, n+lwqrf, lwsvdj, lwunmqrm )
+                  end if
+                  if ( lquery ) then
+                      call stdlib_zgesvj( 'L', 'U', 'N', n,n, u, ldu, sva, n, a,lda, cdummy, -1, &
+                                rdummy, -1, ierr )
+                      lwrk_zgesvj = real( cdummy(1),KIND=dp)
+                      call stdlib_zunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmqrm = real( cdummy(1),KIND=dp)
+                      if ( errest ) then
+                      optwrk = n + max( lwrk_zgeqp3, lwcon, n+lwrk_zgeqrf,lwrk_zgesvj, &
+                                lwrk_zunmqrm )
+                      else
+                      optwrk = n + max( lwrk_zgeqp3, n+lwrk_zgeqrf,lwrk_zgesvj, lwrk_zunmqrm )
+                                
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      if ( errest ) then
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj )
+                      end if
+                  else
+                      if ( errest ) then
+                         minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                      else
+                         minrwrk = max( 7, lrwqp3, lrwsvdj )
+                      end if
+                  end if
+                  if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+               else
+                  ! Minimal And Optimal Sizes Of The Complex Workspace If The
+                  ! full svd is requested
+                  if ( .not. jracc ) then
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon,  2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,   n+&
+                                   lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3,        2*n+n**2+lwcon,2*n+lwqrf,         2*n+&
+                         lwqp3,2*n+n**2+n+lwlqf,  2*n+n**2+n+n**2+lwcon,2*n+n**2+n+lwsvdj, 2*n+&
+                         n**2+n+lwsvdjv,2*n+n**2+n+lwunmqr,2*n+n**2+n+lwunmlq,n+n**2+lwsvdj,      &
+                                   n+lwunmqrm )
+                      end if
+                      miniwrk = miniwrk + n
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  else
+                      if ( errest ) then
+                         minwrk = max( n+lwqp3, n+lwcon, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+&
+                                   lwunmqr,n+lwunmqrm )
+                      else
+                         minwrk = max( n+lwqp3, 2*n+lwqrf,2*n+n**2+lwsvdjv, 2*n+n**2+n+lwunmqr,n+&
+                                   lwunmqrm )
+                      end if
+                      if ( rowpiv .or. l2tran ) miniwrk = miniwrk + m
+                  end if
+                  if ( lquery ) then
+                      call stdlib_zunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmqrm = real( cdummy(1),KIND=dp)
+                      call stdlib_zunmqr( 'L', 'N', n, n, n, a, lda, cdummy, u,ldu, cdummy, -1, &
+                                ierr )
+                      lwrk_zunmqr = real( cdummy(1),KIND=dp)
+                      if ( .not. jracc ) then
+                          call stdlib_zgeqp3( n,n, a, lda, iwork, cdummy,cdummy, -1,rdummy, ierr )
+                                    
+                          lwrk_zgeqp3n = real( cdummy(1),KIND=dp)
+                          call stdlib_zgesvj( 'L', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_zgesvj = real( cdummy(1),KIND=dp)
+                          call stdlib_zgesvj( 'U', 'U', 'N', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_zgesvju = real( cdummy(1),KIND=dp)
+                          call stdlib_zgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_zgesvjv = real( cdummy(1),KIND=dp)
+                          call stdlib_zunmlq( 'L', 'C', n, n, n, a, lda, cdummy,v, ldv, cdummy, -&
+                                    1, ierr )
+                          lwrk_zunmlq = real( cdummy(1),KIND=dp)
+                          if ( errest ) then
+                            optwrk = max( n+lwrk_zgeqp3, n+lwcon,2*n+n**2+lwcon, 2*n+lwrk_zgeqrf,&
+                            2*n+lwrk_zgeqp3n,2*n+n**2+n+lwrk_zgelqf,2*n+n**2+n+n**2+lwcon,2*n+&
+                            n**2+n+lwrk_zgesvj,2*n+n**2+n+lwrk_zgesvjv,2*n+n**2+n+lwrk_zunmqr,2*n+&
+                                      n**2+n+lwrk_zunmlq,n+n**2+lwrk_zgesvju,n+lwrk_zunmqrm )
+                          else
+                            optwrk = max( n+lwrk_zgeqp3,2*n+n**2+lwcon, 2*n+lwrk_zgeqrf,2*n+&
+                            lwrk_zgeqp3n,2*n+n**2+n+lwrk_zgelqf,2*n+n**2+n+n**2+lwcon,2*n+n**2+n+&
+                            lwrk_zgesvj,2*n+n**2+n+lwrk_zgesvjv,2*n+n**2+n+lwrk_zunmqr,2*n+n**2+n+&
+                                      lwrk_zunmlq,n+n**2+lwrk_zgesvju,n+lwrk_zunmqrm )
+                          end if
+                      else
+                          call stdlib_zgesvj( 'L', 'U', 'V', n, n, u, ldu, sva,n, v, ldv, cdummy, &
+                                    -1, rdummy, -1, ierr )
+                          lwrk_zgesvjv = real( cdummy(1),KIND=dp)
+                          call stdlib_zunmqr( 'L', 'N', n, n, n, cdummy, n, cdummy,v, ldv, cdummy,&
+                                     -1, ierr )
+                          lwrk_zunmqr = real( cdummy(1),KIND=dp)
+                          call stdlib_zunmqr( 'L', 'N', m, n, n, a, lda, cdummy, u,ldu, cdummy, -&
+                                    1, ierr )
+                          lwrk_zunmqrm = real( cdummy(1),KIND=dp)
+                          if ( errest ) then
+                             optwrk = max( n+lwrk_zgeqp3, n+lwcon,2*n+lwrk_zgeqrf, 2*n+n**2,2*n+&
+                                       n**2+lwrk_zgesvjv,2*n+n**2+n+lwrk_zunmqr,n+lwrk_zunmqrm )
+                          else
+                             optwrk = max( n+lwrk_zgeqp3, 2*n+lwrk_zgeqrf,2*n+n**2, 2*n+n**2+&
+                                       lwrk_zgesvjv,2*n+n**2+n+lwrk_zunmqr,n+lwrk_zunmqrm )
+                          end if
+                      end if
+                  end if
+                  if ( l2tran .or. rowpiv ) then
+                      minrwrk = max( 7, 2*m,  lrwqp3, lrwsvdj, lrwcon )
+                  else
+                      minrwrk = max( 7, lrwqp3, lrwsvdj, lrwcon )
+                  end if
+               end if
+               minwrk = max( 2, minwrk )
+               optwrk = max( minwrk, optwrk )
+               if ( lwork  < minwrk  .and. (.not.lquery) ) info = - 17
+               if ( lrwork < minrwrk .and. (.not.lquery) ) info = - 19
+           end if
+           if ( info /= 0 ) then
+             ! #:(
+              call stdlib_xerbla( 'ZGEJSV', - info )
+              return
+           else if ( lquery ) then
+               cwork(1) = optwrk
+               cwork(2) = minwrk
+               rwork(1) = minrwrk
+               iwork(1) = max( 4, miniwrk )
+               return
+           end if
+           ! quick return for void matrix (y3k safe)
+       ! #:)
+           if ( ( m == 0 ) .or. ( n == 0 ) ) then
+              iwork(1:4) = 0
+              rwork(1:7) = 0
+              return
+           endif
+           ! determine whether the matrix u should be m x n or m x m
+           if ( lsvec ) then
+              n1 = n
+              if ( stdlib_lsame( jobu, 'F' ) ) n1 = m
+           end if
+           ! set numerical parameters
+      ! !    note: make sure stdlib_dlamch() does not fail on the target architecture.
+           epsln = stdlib_dlamch('EPSILON')
+           sfmin = stdlib_dlamch('SAFEMINIMUM')
+           small = sfmin / epsln
+           big   = stdlib_dlamch('O')
+           ! big   = one / sfmin
+           ! initialize sva(1:n) = diag( ||a e_i||_2 )_1^n
+      ! (!)  if necessary, scale sva() to protect the largest norm from
+           ! overflow. it is possible that this scaling pushes the smallest
+           ! column norm left from the underflow threshold (extreme case).
+           scalem  = one / sqrt(real(m,KIND=dp)*real(n,KIND=dp))
+           noscal  = .true.
+           goscal  = .true.
+           do p = 1, n
+              aapp = zero
+              aaqq = one
+              call stdlib_zlassq( m, a(1,p), 1, aapp, aaqq )
+              if ( aapp > big ) then
+                 info = - 9
+                 call stdlib_xerbla( 'ZGEJSV', -info )
+                 return
+              end if
+              aaqq = sqrt(aaqq)
+              if ( ( aapp < (big / aaqq) ) .and. noscal  ) then
+                 sva(p)  = aapp * aaqq
+              else
+                 noscal  = .false.
+                 sva(p)  = aapp * ( aaqq * scalem )
+                 if ( goscal ) then
+                    goscal = .false.
+                    call stdlib_dscal( p-1, scalem, sva, 1 )
+                 end if
+              end if
+           end do
+           if ( noscal ) scalem = one
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              aapp = max( aapp, sva(p) )
+              if ( sva(p) /= zero ) aaqq = min( aaqq, sva(p) )
+           end do
+           ! quick return for zero m x n matrix
+       ! #:)
+           if ( aapp == zero ) then
+              if ( lsvec ) call stdlib_zlaset( 'G', m, n1, czero, cone, u, ldu )
+              if ( rsvec ) call stdlib_zlaset( 'G', n, n,  czero, cone, v, ldv )
+              rwork(1) = one
+              rwork(2) = one
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              iwork(1) = 0
+              iwork(2) = 0
+              iwork(3) = 0
+              iwork(4) = -1
+              return
+           end if
+           ! issue warning if denormalized column norms detected. override the
+           ! high relative accuracy request. issue licence to kill nonzero columns
+           ! (set them to zero) whose norm is less than sigma_max / big (roughly).
+       ! #:(
+           warning = 0
+           if ( aaqq <= sfmin ) then
+              l2rank = .true.
+              l2kill = .true.
+              warning = 1
+           end if
+           ! quick return for one-column matrix
+       ! #:)
+           if ( n == 1 ) then
+              if ( lsvec ) then
+                 call stdlib_zlascl( 'G',0,0,sva(1),scalem, m,1,a(1,1),lda,ierr )
+                 call stdlib_zlacpy( 'A', m, 1, a, lda, u, ldu )
+                 ! computing all m left singular vectors of the m x 1 matrix
+                 if ( n1 /= n  ) then
+                   call stdlib_zgeqrf( m, n, u,ldu, cwork, cwork(n+1),lwork-n,ierr )
+                   call stdlib_zungqr( m,n1,1, u,ldu,cwork,cwork(n+1),lwork-n,ierr )
+                   call stdlib_zcopy( m, a(1,1), 1, u(1,1), 1 )
+                 end if
+              end if
+              if ( rsvec ) then
+                  v(1,1) = cone
+              end if
+              if ( sva(1) < (big*scalem) ) then
+                 sva(1)  = sva(1) / scalem
+                 scalem  = one
+              end if
+              rwork(1) = one / scalem
+              rwork(2) = one
+              if ( sva(1) /= zero ) then
+                 iwork(1) = 1
+                 if ( ( sva(1) / scalem) >= sfmin ) then
+                    iwork(2) = 1
+                 else
+                    iwork(2) = 0
+                 end if
+              else
+                 iwork(1) = 0
+                 iwork(2) = 0
+              end if
+              iwork(3) = 0
+              iwork(4) = -1
+              if ( errest ) rwork(3) = one
+              if ( lsvec .and. rsvec ) then
+                 rwork(4) = one
+                 rwork(5) = one
+              end if
+              if ( l2tran ) then
+                 rwork(6) = zero
+                 rwork(7) = zero
+              end if
+              return
+           end if
+           transp = .false.
+           aatmax = -one
+           aatmin =  big
+           if ( rowpiv .or. l2tran ) then
+           ! compute the row norms, needed to determine row pivoting sequence
+           ! (in the case of heavily row weighted a, row pivoting is strongly
+           ! advised) and to collect information needed to compare the
+           ! structures of a * a^* and a^* * a (in the case l2tran==.true.).
+              if ( l2tran ) then
+                 do p = 1, m
+                    xsc   = zero
+                    temp1 = one
+                    call stdlib_zlassq( n, a(p,1), lda, xsc, temp1 )
+                    ! stdlib_zlassq gets both the ell_2 and the ell_infinity norm
+                    ! in one pass through the vector
+                    rwork(m+p)  = xsc * scalem
+                    rwork(p)    = xsc * (scalem*sqrt(temp1))
+                    aatmax = max( aatmax, rwork(p) )
+                    if (rwork(p) /= zero)aatmin = min(aatmin,rwork(p))
+                 end do
+              else
+                 do p = 1, m
+                    rwork(m+p) = scalem*abs( a(p,stdlib_izamax(n,a(p,1),lda)) )
+                    aatmax = max( aatmax, rwork(m+p) )
+                    aatmin = min( aatmin, rwork(m+p) )
+                 end do
+              end if
+           end if
+           ! for square matrix a try to determine whether a^*  would be better
+           ! input for the preconditioned jacobi svd, with faster convergence.
+           ! the decision is based on an o(n) function of the vector of column
+           ! and row norms of a, based on the shannon entropy. this should give
+           ! the right choice in most cases when the difference actually matters.
+           ! it may fail and pick the slower converging side.
+           entra  = zero
+           entrat = zero
+           if ( l2tran ) then
+              xsc   = zero
+              temp1 = one
+              call stdlib_dlassq( n, sva, 1, xsc, temp1 )
+              temp1 = one / temp1
+              entra = zero
+              do p = 1, n
+                 big1  = ( ( sva(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entra = entra + big1 * log(big1)
+              end do
+              entra = - entra / log(real(n,KIND=dp))
+              ! now, sva().^2/trace(a^* * a) is a point in the probability simplex.
+              ! it is derived from the diagonal of  a^* * a.  do the same with the
+              ! diagonal of a * a^*, compute the entropy of the corresponding
+              ! probability distribution. note that a * a^* and a^* * a have the
+              ! same trace.
+              entrat = zero
+              do p = 1, m
+                 big1 = ( ( rwork(p) / xsc )**2 ) * temp1
+                 if ( big1 /= zero ) entrat = entrat + big1 * log(big1)
+              end do
+              entrat = - entrat / log(real(m,KIND=dp))
+              ! analyze the entropies and decide a or a^*. smaller entropy
+              ! usually means better input for the algorithm.
+              transp = ( entrat < entra )
+              ! if a^* is better than a, take the adjoint of a. this is allowed
+              ! only for square matrices, m=n.
+              if ( transp ) then
+                 ! in an optimal implementation, this trivial transpose
+                 ! should be replaced with faster transpose.
+                 do p = 1, n - 1
+                    a(p,p) = conjg(a(p,p))
+                    do q = p + 1, n
+                        ctemp = conjg(a(q,p))
+                       a(q,p) = conjg(a(p,q))
+                       a(p,q) = ctemp
+                    end do
+                 end do
+                 a(n,n) = conjg(a(n,n))
+                 do p = 1, n
+                    rwork(m+p) = sva(p)
+                    sva(p)     = rwork(p)
+                    ! previously computed row 2-norms are now column 2-norms
+                    ! of the transposed matrix
+                 end do
+                 temp1  = aapp
+                 aapp   = aatmax
+                 aatmax = temp1
+                 temp1  = aaqq
+                 aaqq   = aatmin
+                 aatmin = temp1
+                 kill   = lsvec
+                 lsvec  = rsvec
+                 rsvec  = kill
+                 if ( lsvec ) n1 = n
+                 rowpiv = .true.
+              end if
+           end if
+           ! end if l2tran
+           ! scale the matrix so that its maximal singular value remains less
+           ! than sqrt(big) -- the matrix is scaled so that its maximal column
+           ! has euclidean norm equal to sqrt(big/n). the only reason to keep
+           ! sqrt(big) instead of big is the fact that stdlib_zgejsv uses lapack and
+           ! blas routines that, in some implementations, are not capable of
+           ! working in the full interval [sfmin,big] and that they may provoke
+           ! overflows in the intermediate results. if the singular values spread
+           ! from sfmin to big, then stdlib_zgesvj will compute them. so, in that case,
+           ! one should use stdlib_zgesvj instead of stdlib_zgejsv.
+           ! >> change in the april 2016 update: allow bigger range, i.e. the
+           ! largest column is allowed up to big/n and stdlib_zgesvj will do the rest.
+           big1   = sqrt( big )
+           temp1  = sqrt( big / real(n,KIND=dp) )
+            ! temp1  = big/real(n,KIND=dp)
+           call stdlib_dlascl( 'G', 0, 0, aapp, temp1, n, 1, sva, n, ierr )
+           if ( aaqq > (aapp * sfmin) ) then
+               aaqq = ( aaqq / aapp ) * temp1
+           else
+               aaqq = ( aaqq * temp1 ) / aapp
+           end if
+           temp1 = temp1 * scalem
+           call stdlib_zlascl( 'G', 0, 0, aapp, temp1, m, n, a, lda, ierr )
+           ! to undo scaling at the end of this procedure, multiply the
+           ! computed singular values with uscal2 / uscal1.
+           uscal1 = temp1
+           uscal2 = aapp
+           if ( l2kill ) then
+              ! l2kill enforces computation of nonzero singular values in
+              ! the restricted range of condition number of the initial a,
+              ! sigma_max(a) / sigma_min(a) approx. sqrt(big)/sqrt(sfmin).
+              xsc = sqrt( sfmin )
+           else
+              xsc = small
+              ! now, if the condition number of a is too big,
+              ! sigma_max(a) / sigma_min(a) > sqrt(big/n) * epsln / sfmin,
+              ! as a precaution measure, the full svd is computed using stdlib_zgesvj
+              ! with accumulated jacobi rotations. this provides numerically
+              ! more robust computation, at the cost of slightly increased run
+              ! time. depending on the concrete implementation of blas and lapack
+              ! (i.e. how they behave in presence of extreme ill-conditioning) the
+              ! implementor may decide to remove this switch.
+              if ( ( aaqq<sqrt(sfmin) ) .and. lsvec .and. rsvec ) then
+                 jracc = .true.
+              end if
+           end if
+           if ( aaqq < xsc ) then
+              do p = 1, n
+                 if ( sva(p) < xsc ) then
+                    call stdlib_zlaset( 'A', m, 1, czero, czero, a(1,p), lda )
+                    sva(p) = zero
+                 end if
+              end do
+           end if
+           ! preconditioning using qr factorization with pivoting
+           if ( rowpiv ) then
+              ! optional row permutation (bjoerck row pivoting):
+              ! a result by cox and higham shows that the bjoerck's
+              ! row pivoting combined with standard column pivoting
+              ! has similar effect as powell-reid complete pivoting.
+              ! the ell-infinity norms of a are made nonincreasing.
+              if ( ( lsvec .and. rsvec ) .and. .not.( jracc ) ) then
+                   iwoff = 2*n
+              else
+                   iwoff = n
+              end if
+              do p = 1, m - 1
+                 q = stdlib_idamax( m-p+1, rwork(m+p), 1 ) + p - 1
+                 iwork(iwoff+p) = q
+                 if ( p /= q ) then
+                    temp1      = rwork(m+p)
+                    rwork(m+p) = rwork(m+q)
+                    rwork(m+q) = temp1
+                 end if
+              end do
+              call stdlib_zlaswp( n, a, lda, 1, m-1, iwork(iwoff+1), 1 )
+           end if
+           ! end of the preparation phase (scaling, optional sorting and
+           ! transposing, optional flushing of small columns).
+           ! preconditioning
+           ! if the full svd is needed, the right singular vectors are computed
+           ! from a matrix equation, and for that we need theoretical analysis
+           ! of the businger-golub pivoting. so we use stdlib_zgeqp3 as the first rr qrf.
+           ! in all other cases the first rr qrf can be chosen by other criteria
+           ! (eg speed by replacing global with restricted window pivoting, such
+           ! as in xgeqpx from toms # 782). good results will be obtained using
+           ! xgeqpx with properly (!) chosen numerical parameters.
+           ! any improvement of stdlib_zgeqp3 improves overall performance of stdlib_zgejsv.
+           ! a * p1 = q1 * [ r1^* 0]^*:
+           do p = 1, n
+              ! All Columns Are Free Columns
+              iwork(p) = 0
+           end do
+           call stdlib_zgeqp3( m, n, a, lda, iwork, cwork, cwork(n+1), lwork-n,rwork, ierr )
+                     
+           ! the upper triangular matrix r1 from the first qrf is inspected for
+           ! rank deficiency and possibilities for deflation, or possible
+           ! ill-conditioning. depending on the user specified flag l2rank,
+           ! the procedure explores possibilities to reduce the numerical
+           ! rank by inspecting the computed upper triangular factor. if
+           ! l2rank or l2aber are up, then stdlib_zgejsv will compute the svd of
+           ! a + da, where ||da|| <= f(m,n)*epsln.
+           nr = 1
+           if ( l2aber ) then
+              ! standard absolute error bound suffices. all sigma_i with
+              ! sigma_i < n*epsln*||a|| are flushed to zero. this is an
+              ! aggressive enforcement of lower numerical rank by introducing a
+              ! backward error of the order of n*epsln*||a||.
+              temp1 = sqrt(real(n,KIND=dp))*epsln
+              do p = 2, n
+                 if ( abs(a(p,p)) >= (temp1*abs(a(1,1))) ) then
+                    nr = nr + 1
+                 else
+                    go to 3002
+                 end if
+              end do
+              3002 continue
+           else if ( l2rank ) then
+              ! .. similarly as above, only slightly more gentle (less aggressive).
+              ! sudden drop on the diagonal of r1 is used as the criterion for
+              ! close-to-rank-deficient.
+              temp1 = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < (epsln*abs(a(p-1,p-1))) ) .or.( abs(a(p,p)) < small ) .or.( &
+                           l2kill .and. (abs(a(p,p)) < temp1) ) ) go to 3402
+                 nr = nr + 1
+              end do
+              3402 continue
+           else
+              ! the goal is high relative accuracy. however, if the matrix
+              ! has high scaled condition number the relative accuracy is in
+              ! general not feasible. later on, a condition number estimator
+              ! will be deployed to estimate the scaled condition number.
+              ! here we just remove the underflowed part of the triangular
+              ! factor. this prevents the situation in which the code is
+              ! working hard to get the accuracy not warranted by the data.
+              temp1  = sqrt(sfmin)
+              do p = 2, n
+                 if ( ( abs(a(p,p)) < small ) .or.( l2kill .and. (abs(a(p,p)) < temp1) ) ) go to &
+                           3302
+                 nr = nr + 1
+              end do
+              3302 continue
+           end if
+           almort = .false.
+           if ( nr == n ) then
+              maxprj = one
+              do p = 2, n
+                 temp1  = abs(a(p,p)) / sva(iwork(p))
+                 maxprj = min( maxprj, temp1 )
+              end do
+              if ( maxprj**2 >= one - real(n,KIND=dp)*epsln ) almort = .true.
+           end if
+           sconda = - one
+           condr1 = - one
+           condr2 = - one
+           if ( errest ) then
+              if ( n == nr ) then
+                 if ( rsvec ) then
+                    ! V Is Available As Workspace
+                    call stdlib_zlacpy( 'U', n, n, a, lda, v, ldv )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_zdscal( p, one/temp1, v(1,p), 1 )
+                    end do
+                    if ( lsvec )then
+                        call stdlib_zpocon( 'U', n, v, ldv, one, temp1,cwork(n+1), rwork, ierr )
+                                  
+                    else
+                        call stdlib_zpocon( 'U', n, v, ldv, one, temp1,cwork, rwork, ierr )
+                                  
+                    end if
+                 else if ( lsvec ) then
+                    ! U Is Available As Workspace
+                    call stdlib_zlacpy( 'U', n, n, a, lda, u, ldu )
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+                       call stdlib_zdscal( p, one/temp1, u(1,p), 1 )
+                    end do
+                    call stdlib_zpocon( 'U', n, u, ldu, one, temp1,cwork(n+1), rwork, ierr )
+                              
+                 else
+                    call stdlib_zlacpy( 'U', n, n, a, lda, cwork, n )
+      ! []            call stdlib_zlacpy( 'u', n, n, a, lda, cwork(n+1), n )
+                    ! change: here index shifted by n to the left, cwork(1:n)
+                    ! not needed for sigma only computation
+                    do p = 1, n
+                       temp1 = sva(iwork(p))
+      ! []               call stdlib_zdscal( p, one/temp1, cwork(n+(p-1)*n+1), 1 )
+                       call stdlib_zdscal( p, one/temp1, cwork((p-1)*n+1), 1 )
+                    end do
+                 ! The Columns Of R Are Scaled To Have Unit Euclidean Lengths
+      ! []               call stdlib_zpocon( 'u', n, cwork(n+1), n, one, temp1,
+      ! []     $              cwork(n+n*n+1), rwork, ierr )
+                    call stdlib_zpocon( 'U', n, cwork, n, one, temp1,cwork(n*n+1), rwork, ierr )
+                              
+                 end if
+                 if ( temp1 /= zero ) then
+                    sconda = one / sqrt(temp1)
+                 else
+                    sconda = - one
+                 end if
+                 ! sconda is an estimate of sqrt(||(r^* * r)^(-1)||_1).
+                 ! n^(-1/4) * sconda <= ||r^(-1)||_2 <= n^(1/4) * sconda
+              else
+                 sconda = - one
+              end if
+           end if
+           l2pert = l2pert .and. ( abs( a(1,1)/a(nr,nr) ) > sqrt(big1) )
+           ! if there is no violent scaling, artificial perturbation is not needed.
+           ! phase 3:
+           if ( .not. ( rsvec .or. lsvec ) ) then
+               ! singular values only
+               ! .. transpose a(1:nr,1:n)
+              do p = 1, min( n-1, nr )
+                 call stdlib_zcopy( n-p, a(p,p+1), lda, a(p+1,p), 1 )
+                 call stdlib_zlacgv( n-p+1, a(p,p), 1 )
+              end do
+              if ( nr == n ) a(n,n) = conjg(a(n,n))
+              ! the following two do-loops introduce small relative perturbation
+              ! into the strict upper triangle of the lower triangular matrix.
+              ! small entries below the main diagonal are also changed.
+              ! this modification is useful if the computing environment does not
+              ! provide/allow flush to zero underflow, for it prevents many
+              ! annoying denormalized numbers in case of strongly scaled matrices.
+              ! the perturbation is structured so that it does not introduce any
+              ! new perturbation of the singular values, and it does not destroy
+              ! the job done by the preconditioner.
+              ! the licence for this perturbation is in the variable l2pert, which
+              ! should be .false. if flush to zero underflow is active.
+              if ( .not. almort ) then
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=dp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=dp)
+                       do p = 1, n
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                     a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'U', nr-1,nr-1, czero,czero, a(1,2),lda )
+                 end if
+                  ! Second Preconditioning Using The Qr Factorization
+                 call stdlib_zgeqrf( n,nr, a,lda, cwork, cwork(n+1),lwork-n, ierr )
+                 ! And Transpose Upper To Lower Triangular
+                 do p = 1, nr - 1
+                    call stdlib_zcopy( nr-p, a(p,p+1), lda, a(p+1,p), 1 )
+                    call stdlib_zlacgv( nr-p+1, a(p,p), 1 )
+                 end do
+           end if
+                 ! row-cyclic jacobi svd algorithm with column pivoting
+                 ! .. again some perturbation (a "background noise") is added
+                 ! to drown denormals
+                 if ( l2pert ) then
+                    ! xsc = sqrt(small)
+                    xsc = epsln / real(n,KIND=dp)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs(a(q,q)),zero,KIND=dp)
+                       do p = 1, nr
+                          if ( ( (p>q) .and. (abs(a(p,q))<=temp1) ).or. ( p < q ) )a(p,q) = &
+                                    ctemp
+           ! $                   a(p,q) = temp1 * ( a(p,q) / abs(a(p,q)) )
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'U', nr-1, nr-1, czero, czero, a(1,2), lda )
+                 end if
+                 ! .. and one-sided jacobi rotations are started on a lower
+                 ! triangular matrix (plus perturbation which is ignored in
+                 ! the part which destroys triangular form (confusing?!))
+                 call stdlib_zgesvj( 'L', 'N', 'N', nr, nr, a, lda, sva,n, v, ldv, cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+           else if ( ( rsvec .and. ( .not. lsvec ) .and. ( .not. jracc ) ).or.( jracc .and. ( &
+                     .not. lsvec ) .and. ( nr /= n ) ) ) then
+              ! -> singular values and right singular vectors <-
+              if ( almort ) then
+                 ! In This Case Nr Equals N
+                 do p = 1, nr
+                    call stdlib_zcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_zlacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_zlaset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_zgesvj( 'L','U','N', n, nr, v, ldv, sva, nr, a, lda,cwork, lwork, &
+                           rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+              else
+              ! .. two more qr factorizations ( one qrf is not enough, two require
+              ! accumulated product of jacobi rotations, three are perfect )
+                 call stdlib_zlaset( 'L', nr-1,nr-1, czero, czero, a(2,1), lda )
+                 call stdlib_zgelqf( nr,n, a, lda, cwork, cwork(n+1), lwork-n, ierr)
+                 call stdlib_zlacpy( 'L', nr, nr, a, lda, v, ldv )
+                 call stdlib_zlaset( 'U', nr-1,nr-1, czero, czero, v(1,2), ldv )
+                 call stdlib_zgeqrf( nr, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                           
+                 do p = 1, nr
+                    call stdlib_zcopy( nr-p+1, v(p,p), ldv, v(p,p), 1 )
+                    call stdlib_zlacgv( nr-p+1, v(p,p), 1 )
+                 end do
+                 call stdlib_zlaset('U', nr-1, nr-1, czero, czero, v(1,2), ldv)
+                 call stdlib_zgesvj( 'L', 'U','N', nr, nr, v,ldv, sva, nr, u,ldu, cwork(n+1), &
+                           lwork-n, rwork, lrwork, info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 if ( nr < n ) then
+                    call stdlib_zlaset( 'A',n-nr, nr, czero,czero, v(nr+1,1),  ldv )
+                    call stdlib_zlaset( 'A',nr, n-nr, czero,czero, v(1,nr+1),  ldv )
+                    call stdlib_zlaset( 'A',n-nr,n-nr,czero,cone, v(nr+1,nr+1),ldv )
+                 end if
+              call stdlib_zunmlq( 'L', 'C', n, n, nr, a, lda, cwork,v, ldv, cwork(n+1), lwork-n, &
+                        ierr )
+              end if
+               ! Permute The Rows Of V
+               ! do 8991 p = 1, n
+                  ! call stdlib_zcopy( n, v(p,1), ldv, a(iwork(p),1), lda )
+                  8991 continue
+               ! call stdlib_zlacpy( 'all', n, n, a, lda, v, ldv )
+              call stdlib_zlapmr( .false., n, n, v, ldv, iwork )
+               if ( transp ) then
+                 call stdlib_zlacpy( 'A', n, n, v, ldv, u, ldu )
+               end if
+           else if ( jracc .and. (.not. lsvec) .and. ( nr== n ) ) then
+              call stdlib_zlaset( 'L', n-1,n-1, czero, czero, a(2,1), lda )
+              call stdlib_zgesvj( 'U','N','V', n, n, a, lda, sva, n, v, ldv,cwork, lwork, rwork, &
+                        lrwork, info )
+               scalem  = rwork(1)
+               numrank = nint(rwork(2),KIND=ilp)
+               call stdlib_zlapmr( .false., n, n, v, ldv, iwork )
+           else if ( lsvec .and. ( .not. rsvec ) ) then
+              ! Singular Values And Left Singular Vectors                 
+              ! Second Preconditioning Step To Avoid Need To Accumulate
+              ! jacobi rotations in the jacobi iterations.
+              do p = 1, nr
+                 call stdlib_zcopy( n-p+1, a(p,p), lda, u(p,p), 1 )
+                 call stdlib_zlacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_zlaset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_zgeqrf( n, nr, u, ldu, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              do p = 1, nr - 1
+                 call stdlib_zcopy( nr-p, u(p,p+1), ldu, u(p+1,p), 1 )
+                 call stdlib_zlacgv( n-p+1, u(p,p), 1 )
+              end do
+              call stdlib_zlaset( 'U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              call stdlib_zgesvj( 'L', 'U', 'N', nr,nr, u, ldu, sva, nr, a,lda, cwork(n+1), lwork-&
+                        n, rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < m ) then
+                 call stdlib_zlaset( 'A',  m-nr, nr,czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_zlaset( 'A',nr, n1-nr, czero, czero, u(1,nr+1),ldu )
+                    call stdlib_zlaset( 'A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu )
+                 end if
+              end if
+              call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+              if ( rowpiv )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              do p = 1, n1
+                 xsc = one / stdlib_dznrm2( m, u(1,p), 1 )
+                 call stdlib_zdscal( m, xsc, u(1,p), 1 )
+              end do
+              if ( transp ) then
+                 call stdlib_zlacpy( 'A', n, n, u, ldu, v, ldv )
+              end if
+           else
+              ! Full Svd 
+              if ( .not. jracc ) then
+              if ( .not. almort ) then
+                 ! second preconditioning step (qrf [with pivoting])
+                 ! note that the composition of transpose, qrf and transpose is
+                 ! equivalent to an lqf call. since in many libraries the qrf
+                 ! seems to be better optimized than the lqf, we do explicit
+                 ! transpose and use the qrf. this is subject to changes in an
+                 ! optimized implementation of stdlib_zgejsv.
+                 do p = 1, nr
+                    call stdlib_zcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                    call stdlib_zlacgv( n-p+1, v(p,p), 1 )
+                 end do
+                 ! The Following Two Loops Perturb Small Entries To Avoid
+                 ! denormals in the second qr factorization, where they are
+                 ! as good as zeros. this is done to avoid painfully slow
+                 ! computation with denormals. the relative size of the perturbation
+                 ! is a parameter that can be changed by the implementer.
+                 ! this perturbation device will be obsolete on machines with
+                 ! properly implemented arithmetic.
+                 ! to switch it off, set l2pert=.false. to remove it from  the
+                 ! code, remove the action under l2pert=.true., leave the else part.
+                 ! the following two loops should be blocked and fused with the
+                 ! transposed copy above.
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 1, nr
+                       ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=dp)
+                       do p = 1, n
+                          if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                    ctemp
+           ! $                   v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                          if ( p < q ) v(p,q) = - v(p,q)
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+                 end if
+                 ! estimate the row scaled condition number of r1
+                 ! (if r1 is rectangular, n > nr, then the condition number
+                 ! of the leading nr x nr submatrix is estimated.)
+                 call stdlib_zlacpy( 'L', nr, nr, v, ldv, cwork(2*n+1), nr )
+                 do p = 1, nr
+                    temp1 = stdlib_dznrm2(nr-p+1,cwork(2*n+(p-1)*nr+p),1)
+                    call stdlib_zdscal(nr-p+1,one/temp1,cwork(2*n+(p-1)*nr+p),1)
+                 end do
+                 call stdlib_zpocon('L',nr,cwork(2*n+1),nr,one,temp1,cwork(2*n+nr*nr+1),rwork,&
+                           ierr)
+                 condr1 = one / sqrt(temp1)
+                 ! Here Need A Second Opinion On The Condition Number
+                 ! Then Assume Worst Case Scenario
+                 ! r1 is ok for inverse <=> condr1 < real(n,KIND=dp)
+                 ! more conservative    <=> condr1 < sqrt(real(n,KIND=dp))
+                 cond_ok = sqrt(sqrt(real(nr,KIND=dp)))
+      ! [tp]       cond_ok is a tuning parameter.
+                 if ( condr1 < cond_ok ) then
+                    ! .. the second qrf without pivoting. note: in an optimized
+                    ! implementation, this qrf should be implemented as the qrf
+                    ! of a lower triangular matrix.
+                    ! r1^* = q2 * r2
+                    call stdlib_zgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                              
+                    if ( l2pert ) then
+                       xsc = sqrt(small)/epsln
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=dp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    if ( nr /= n )call stdlib_zlacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                              
+                    ! .. save ...
+                 ! This Transposed Copy Should Be Better Than Naive
+                    do p = 1, nr - 1
+                       call stdlib_zcopy( nr-p, v(p,p+1), ldv, v(p+1,p), 1 )
+                       call stdlib_zlacgv(nr-p+1, v(p,p), 1 )
+                    end do
+                    v(nr,nr)=conjg(v(nr,nr))
+                    condr2 = condr1
+                 else
+                    ! .. ill-conditioned case: second qrf with pivoting
+                    ! note that windowed pivoting would be equally good
+                    ! numerically, and more run-time efficient. so, in
+                    ! an optimal implementation, the next call to stdlib_zgeqp3
+                    ! should be replaced with eg. call zgeqpx (acm toms #782)
+                    ! with properly (carefully) chosen parameters.
+                    ! r1^* * p2 = q2 * r2
+                    do p = 1, nr
+                       iwork(n+p) = 0
+                    end do
+                    call stdlib_zgeqp3( n, nr, v, ldv, iwork(n+1), cwork(n+1),cwork(2*n+1), lwork-&
+                              2*n, rwork, ierr )
+      ! *               call stdlib_zgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),
+      ! *     $              lwork-2*n, ierr )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=dp)
+                             if ( abs(v(q,p)) <= temp1 )v(q,p) = ctemp
+           ! $                     v(q,p) = temp1 * ( v(q,p) / abs(v(q,p)) )
+                          end do
+                       end do
+                    end if
+                    call stdlib_zlacpy( 'A', n, nr, v, ldv, cwork(2*n+1), n )
+                    if ( l2pert ) then
+                       xsc = sqrt(small)
+                       do p = 2, nr
+                          do q = 1, p - 1
+                             ctemp=cmplx(xsc*min(abs(v(p,p)),abs(v(q,q))),zero,KIND=dp)
+                              ! v(p,q) = - temp1*( v(q,p) / abs(v(q,p)) )
+                             v(p,q) = - ctemp
+                          end do
+                       end do
+                    else
+                       call stdlib_zlaset( 'L',nr-1,nr-1,czero,czero,v(2,1),ldv )
+                    end if
+                    ! now, compute r2 = l3 * q3, the lq factorization.
+                    call stdlib_zgelqf( nr, nr, v, ldv, cwork(2*n+n*nr+1),cwork(2*n+n*nr+nr+1), &
+                              lwork-2*n-n*nr-nr, ierr )
+                    ! And Estimate The Condition Number
+                    call stdlib_zlacpy( 'L',nr,nr,v,ldv,cwork(2*n+n*nr+nr+1),nr )
+                    do p = 1, nr
+                       temp1 = stdlib_dznrm2( p, cwork(2*n+n*nr+nr+p), nr )
+                       call stdlib_zdscal( p, one/temp1, cwork(2*n+n*nr+nr+p), nr )
+                    end do
+                    call stdlib_zpocon( 'L',nr,cwork(2*n+n*nr+nr+1),nr,one,temp1,cwork(2*n+n*nr+&
+                              nr+nr*nr+1),rwork,ierr )
+                    condr2 = one / sqrt(temp1)
+                    if ( condr2 >= cond_ok ) then
+                       ! Save The Householder Vectors Used For Q3
+                       ! (this overwrites the copy of r2, as it will not be
+                       ! needed in this branch, but it does not overwritte the
+                       ! huseholder vectors of q2.).
+                       call stdlib_zlacpy( 'U', nr, nr, v, ldv, cwork(2*n+1), n )
+                       ! And The Rest Of The Information On Q3 Is In
+                       ! work(2*n+n*nr+1:2*n+n*nr+n)
+                    end if
+                 end if
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do q = 2, nr
+                       ctemp = xsc * v(q,q)
+                       do p = 1, q - 1
+                           ! v(p,q) = - temp1*( v(p,q) / abs(v(p,q)) )
+                          v(p,q) = - ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'U', nr-1,nr-1, czero,czero, v(1,2), ldv )
+                 end if
+              ! second preconditioning finished; continue with jacobi svd
+              ! the input matrix is lower trinagular.
+              ! recover the right singular vectors as solution of a well
+              ! conditioned triangular matrix equation.
+                 if ( condr1 < cond_ok ) then
+                    call stdlib_zgesvj( 'L','U','N',nr,nr,v,ldv,sva,nr,u, ldu,cwork(2*n+n*nr+nr+1)&
+                              ,lwork-2*n-n*nr-nr,rwork,lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_zcopy(  nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_zdscal( nr, sva(p),    v(1,p), 1 )
+                    end do
+              ! Pick The Right Matrix Equation And Solve It
+                    if ( nr == n ) then
+       ! :))             .. best case, r1 is inverted. the solution of this matrix
+                       ! equation is q2*v2 = the product of the jacobi rotations
+                       ! used in stdlib_zgesvj, premultiplied with the orthogonal matrix
+                       ! from the second qr factorization.
+                       call stdlib_ztrsm('L','U','N','N', nr,nr,cone, a,lda, v,ldv)
+                    else
+                       ! .. r1 is well conditioned, but non-square. adjoint of r2
+                       ! is inverted to get the product of the jacobi rotations
+                       ! used in stdlib_zgesvj. the q-factor from the second qr
+                       ! factorization is then built in explicitly.
+                       call stdlib_ztrsm('L','U','C','N',nr,nr,cone,cwork(2*n+1),n,v,ldv)
+                       if ( nr < n ) then
+                       call stdlib_zlaset('A',n-nr,nr,czero,czero,v(nr+1,1),ldv)
+                       call stdlib_zlaset('A',nr,n-nr,czero,czero,v(1,nr+1),ldv)
+                       call stdlib_zlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                       end if
+                       call stdlib_zunmqr('L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(&
+                                 2*n+n*nr+nr+1),lwork-2*n-n*nr-nr,ierr)
+                    end if
+                 else if ( condr2 < cond_ok ) then
+                    ! the matrix r2 is inverted. the solution of the matrix equation
+                    ! is q3^* * v3 = the product of the jacobi rotations (appplied to
+                    ! the lower triangular l3 from the lq factorization of
+                    ! r2=l3*q3), pre-multiplied with the transposed q3.
+                    call stdlib_zgesvj( 'L', 'U', 'N', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    do p = 1, nr
+                       call stdlib_zcopy( nr, v(1,p), 1, u(1,p), 1 )
+                       call stdlib_zdscal( nr, sva(p),    u(1,p), 1 )
+                    end do
+                    call stdlib_ztrsm('L','U','N','N',nr,nr,cone,cwork(2*n+1),n,u,ldu)
+                    ! Apply The Permutation From The Second Qr Factorization
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                    if ( nr < n ) then
+                       call stdlib_zlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_zlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_zlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_zunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                 else
+                    ! last line of defense.
+       ! #:(          this is a rather pathological case: no scaled condition
+                    ! improvement after two pivoted qr factorizations. other
+                    ! possibility is that the rank revealing qr factorization
+                    ! or the condition estimator has failed, or the cond_ok
+                    ! is set very close to one (which is unnecessary). normally,
+                    ! this branch should never be executed, but in rare cases of
+                    ! failure of the rrqr or condition estimator, the last line of
+                    ! defense ensures that stdlib_zgejsv completes the task.
+                    ! compute the full svd of l3 using stdlib_zgesvj with explicit
+                    ! accumulation of jacobi rotations.
+                    call stdlib_zgesvj( 'L', 'U', 'V', nr, nr, v, ldv, sva, nr, u,ldu, cwork(2*n+&
+                              n*nr+nr+1), lwork-2*n-n*nr-nr,rwork, lrwork, info )
+                    scalem  = rwork(1)
+                    numrank = nint(rwork(2),KIND=ilp)
+                    if ( nr < n ) then
+                       call stdlib_zlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                       call stdlib_zlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                       call stdlib_zlaset('A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv)
+                    end if
+                    call stdlib_zunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+&
+                              n*nr+nr+1),lwork-2*n-n*nr-nr,ierr )
+                    call stdlib_zunmlq( 'L', 'C', nr, nr, nr, cwork(2*n+1), n,cwork(2*n+n*nr+1), &
+                              u, ldu, cwork(2*n+n*nr+nr+1),lwork-2*n-n*nr-nr, ierr )
+                    do q = 1, nr
+                       do p = 1, nr
+                          cwork(2*n+n*nr+nr+iwork(n+p)) = u(p,q)
+                       end do
+                       do p = 1, nr
+                          u(p,q) = cwork(2*n+n*nr+nr+p)
+                       end do
+                    end do
+                 end if
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=dp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_dznrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_zdscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+                 if ( nr < m ) then
+                    call stdlib_zlaset('A', m-nr, nr, czero, czero, u(nr+1,1), ldu)
+                    if ( nr < n1 ) then
+                       call stdlib_zlaset('A',nr,n1-nr,czero,czero,u(1,nr+1),ldu)
+                       call stdlib_zlaset('A',m-nr,n1-nr,czero,cone,u(nr+1,nr+1),ldu)
+                    end if
+                 end if
+                 ! the q matrix from the first qrf is built into the left singular
+                 ! matrix u. this applies to all cases.
+                 call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 ! the columns of u are normalized. the cost is o(m*n) flops.
+                 temp1 = sqrt(real(m,KIND=dp)) * epsln
+                 do p = 1, nr
+                    xsc = one / stdlib_dznrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_zdscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 ! if the initial qrf is computed with row pivoting, the left
+                 ! singular vectors must be adjusted.
+                 if ( rowpiv )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              else
+              ! The Initial Matrix A Has Almost Orthogonal Columns And
+              ! the second qrf is not needed
+                 call stdlib_zlacpy( 'U', n, n, a, lda, cwork(n+1), n )
+                 if ( l2pert ) then
+                    xsc = sqrt(small)
+                    do p = 2, n
+                       ctemp = xsc * cwork( n + (p-1)*n + p )
+                       do q = 1, p - 1
+                           ! cwork(n+(q-1)*n+p)=-temp1 * ( cwork(n+(p-1)*n+q) /
+           ! $                                        abs(cwork(n+(p-1)*n+q)) )
+                          cwork(n+(q-1)*n+p)=-ctemp
+                       end do
+                    end do
+                 else
+                    call stdlib_zlaset( 'L',n-1,n-1,czero,czero,cwork(n+2),n )
+                 end if
+                 call stdlib_zgesvj( 'U', 'U', 'N', n, n, cwork(n+1), n, sva,n, u, ldu, cwork(n+&
+                           n*n+1), lwork-n-n*n, rwork, lrwork,info )
+                 scalem  = rwork(1)
+                 numrank = nint(rwork(2),KIND=ilp)
+                 do p = 1, n
+                    call stdlib_zcopy( n, cwork(n+(p-1)*n+1), 1, u(1,p), 1 )
+                    call stdlib_zdscal( n, sva(p), cwork(n+(p-1)*n+1), 1 )
+                 end do
+                 call stdlib_ztrsm( 'L', 'U', 'N', 'N', n, n,cone, a, lda, cwork(n+1), n )
+                 do p = 1, n
+                    call stdlib_zcopy( n, cwork(n+p), n, v(iwork(p),1), ldv )
+                 end do
+                 temp1 = sqrt(real(n,KIND=dp))*epsln
+                 do p = 1, n
+                    xsc = one / stdlib_dznrm2( n, v(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_zdscal( n, xsc,&
+                               v(1,p), 1 )
+                 end do
+                 ! assemble the left singular vector matrix u (m x n).
+                 if ( n < m ) then
+                    call stdlib_zlaset( 'A',  m-n, n, czero, czero, u(n+1,1), ldu )
+                    if ( n < n1 ) then
+                       call stdlib_zlaset('A',n,  n1-n, czero, czero,  u(1,n+1),ldu)
+                       call stdlib_zlaset( 'A',m-n,n1-n, czero, cone,u(n+1,n+1),ldu)
+                    end if
+                 end if
+                 call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-&
+                           n, ierr )
+                 temp1 = sqrt(real(m,KIND=dp))*epsln
+                 do p = 1, n1
+                    xsc = one / stdlib_dznrm2( m, u(1,p), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_zdscal( m, xsc,&
+                               u(1,p), 1 )
+                 end do
+                 if ( rowpiv )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              ! end of the  >> almost orthogonal case <<  in the full svd
+              else
+              ! this branch deploys a preconditioned jacobi svd with explicitly
+              ! accumulated rotations. it is included as optional, mainly for
+              ! experimental purposes. it does perform well, and can also be used.
+              ! in this implementation, this branch will be automatically activated
+              ! if the  condition number sigma_max(a) / sigma_min(a) is predicted
+              ! to be greater than the overflow threshold. this is because the
+              ! a posteriori computation of the singular vectors assumes robust
+              ! implementation of blas and some lapack procedures, capable of working
+              ! in presence of extreme values, e.g. when the singular values spread from
+              ! the underflow to the overflow threshold.
+              do p = 1, nr
+                 call stdlib_zcopy( n-p+1, a(p,p), lda, v(p,p), 1 )
+                 call stdlib_zlacgv( n-p+1, v(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 1, nr
+                    ctemp = cmplx(xsc*abs( v(q,q) ),zero,KIND=dp)
+                    do p = 1, n
+                       if ( ( p > q ) .and. ( abs(v(p,q)) <= temp1 ).or. ( p < q ) )v(p,q) = &
+                                 ctemp
+           ! $                v(p,q) = temp1 * ( v(p,q) / abs(v(p,q)) )
+                       if ( p < q ) v(p,q) = - v(p,q)
+                    end do
+                 end do
+              else
+                 call stdlib_zlaset( 'U', nr-1, nr-1, czero, czero, v(1,2), ldv )
+              end if
+              call stdlib_zgeqrf( n, nr, v, ldv, cwork(n+1), cwork(2*n+1),lwork-2*n, ierr )
+                        
+              call stdlib_zlacpy( 'L', n, nr, v, ldv, cwork(2*n+1), n )
+              do p = 1, nr
+                 call stdlib_zcopy( nr-p+1, v(p,p), ldv, u(p,p), 1 )
+                 call stdlib_zlacgv( nr-p+1, u(p,p), 1 )
+              end do
+              if ( l2pert ) then
+                 xsc = sqrt(small/epsln)
+                 do q = 2, nr
+                    do p = 1, q - 1
+                       ctemp = cmplx(xsc * min(abs(u(p,p)),abs(u(q,q))),zero,KIND=dp)
+                        ! u(p,q) = - temp1 * ( u(q,p) / abs(u(q,p)) )
+                       u(p,q) = - ctemp
+                    end do
+                 end do
+              else
+                 call stdlib_zlaset('U', nr-1, nr-1, czero, czero, u(1,2), ldu )
+              end if
+              call stdlib_zgesvj( 'L', 'U', 'V', nr, nr, u, ldu, sva,n, v, ldv, cwork(2*n+n*nr+1),&
+                         lwork-2*n-n*nr,rwork, lrwork, info )
+              scalem  = rwork(1)
+              numrank = nint(rwork(2),KIND=ilp)
+              if ( nr < n ) then
+                 call stdlib_zlaset( 'A',n-nr,nr,czero,czero,v(nr+1,1),ldv )
+                 call stdlib_zlaset( 'A',nr,n-nr,czero,czero,v(1,nr+1),ldv )
+                 call stdlib_zlaset( 'A',n-nr,n-nr,czero,cone,v(nr+1,nr+1),ldv )
+              end if
+              call stdlib_zunmqr( 'L','N',n,n,nr,cwork(2*n+1),n,cwork(n+1),v,ldv,cwork(2*n+n*nr+&
+                        nr+1),lwork-2*n-n*nr-nr,ierr )
+                 ! permute the rows of v using the (column) permutation from the
+                 ! first qrf. also, scale the columns to make them unit in
+                 ! euclidean norm. this applies to all cases.
+                 temp1 = sqrt(real(n,KIND=dp)) * epsln
+                 do q = 1, n
+                    do p = 1, n
+                       cwork(2*n+n*nr+nr+iwork(p)) = v(p,q)
+                    end do
+                    do p = 1, n
+                       v(p,q) = cwork(2*n+n*nr+nr+p)
+                    end do
+                    xsc = one / stdlib_dznrm2( n, v(1,q), 1 )
+                    if ( (xsc < (one-temp1)) .or. (xsc > (one+temp1)) )call stdlib_zdscal( n, xsc,&
+                               v(1,q), 1 )
+                 end do
+                 ! at this moment, v contains the right singular vectors of a.
+                 ! next, assemble the left singular vector matrix u (m x n).
+              if ( nr < m ) then
+                 call stdlib_zlaset( 'A',  m-nr, nr, czero, czero, u(nr+1,1), ldu )
+                 if ( nr < n1 ) then
+                    call stdlib_zlaset('A',nr,  n1-nr, czero, czero,  u(1,nr+1),ldu)
+                    call stdlib_zlaset('A',m-nr,n1-nr, czero, cone,u(nr+1,nr+1),ldu)
+                 end if
+              end if
+              call stdlib_zunmqr( 'L', 'N', m, n1, n, a, lda, cwork, u,ldu, cwork(n+1), lwork-n, &
+                        ierr )
+                 if ( rowpiv )call stdlib_zlaswp( n1, u, ldu, 1, m-1, iwork(iwoff+1), -1 )
+              end if
+              if ( transp ) then
+                 ! .. swap u and v because the procedure worked on a^*
+                 do p = 1, n
+                    call stdlib_zswap( n, u(1,p), 1, v(1,p), 1 )
+                 end do
+              end if
+           end if
+           ! end of the full svd
+           ! undo scaling, if necessary (and possible)
+           if ( uscal2 <= (big/sva(1))*uscal1 ) then
+              call stdlib_dlascl( 'G', 0, 0, uscal1, uscal2, nr, 1, sva, n, ierr )
+              uscal1 = one
+              uscal2 = one
+           end if
+           if ( nr < n ) then
+              do p = nr+1, n
+                 sva(p) = zero
+              end do
+           end if
+           rwork(1) = uscal2 * scalem
+           rwork(2) = uscal1
+           if ( errest ) rwork(3) = sconda
+           if ( lsvec .and. rsvec ) then
+              rwork(4) = condr1
+              rwork(5) = condr2
+           end if
+           if ( l2tran ) then
+              rwork(6) = entra
+              rwork(7) = entrat
+           end if
+           iwork(1) = nr
+           iwork(2) = numrank
+           iwork(3) = warning
+           if ( transp ) then
+               iwork(4) =  1
+           else
+               iwork(4) = -1
+           end if
+           return
+     end subroutine stdlib_zgejsv
+
+     !> ZGESVJ: computes the singular value decomposition (SVD) of a complex
+     !> M-by-N matrix A, where M >= N. The SVD of A is written as
+     !> [++]   [xx]   [x0]   [xx]
+     !> A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+     !> [++]   [xx]
+     !> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+     !> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+     !> of SIGMA are the singular values of A. The columns of U and V are the
+     !> left and the right singular vectors of A, respectively.
+
+     pure subroutine stdlib_zgesvj( joba, jobu, jobv, m, n, a, lda, sva, mv, v,ldv, cwork, lwork, &
+               rwork, lrwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, lrwork, m, mv, n
+           character, intent(in) :: joba, jobu, jobv
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), v(ldv,*), cwork(lwork)
+           real(dp), intent(inout) :: rwork(lrwork)
+           real(dp), intent(out) :: sva(n)
+        ! =====================================================================
+           ! Local Parameters 
+           integer(ilp), parameter :: nsweep = 30
+           
+           
+           
+           ! Local Scalars 
+           complex(dp) :: aapq, ompq
+           real(dp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, ctol, epsln, &
+           mxaapq, mxsinj, rootbig, rooteps, rootsfmin, roottol, skl, sfmin, small, sn, t, temp1, &
+                     theta, thsign, tol
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, n2, n34, n4, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, goscale, lower, lquery, lsvec, noscale, rotok, rsvec, uctol, &
+                     upper
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,min,conjg,real,sign,sqrt
+           ! from lapack
+           ! from lapack
+           ! Executable Statements 
+           ! test the input arguments
+           lsvec = stdlib_lsame( jobu, 'U' ) .or. stdlib_lsame( jobu, 'F' )
+           uctol = stdlib_lsame( jobu, 'C' )
+           rsvec = stdlib_lsame( jobv, 'V' ) .or. stdlib_lsame( jobv, 'J' )
+           applv = stdlib_lsame( jobv, 'A' )
+           upper = stdlib_lsame( joba, 'U' )
+           lower = stdlib_lsame( joba, 'L' )
+           lquery = ( lwork == -1 ) .or. ( lrwork == -1 )
+           if( .not.( upper .or. lower .or. stdlib_lsame( joba, 'G' ) ) ) then
+              info = -1
+           else if( .not.( lsvec .or. uctol .or. stdlib_lsame( jobu, 'N' ) ) ) then
+              info = -2
+           else if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -3
+           else if( m<0 ) then
+              info = -4
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -5
+           else if( lda<m ) then
+              info = -7
+           else if( mv<0 ) then
+              info = -9
+           else if( ( rsvec .and. ( ldv<n ) ) .or.( applv .and. ( ldv<mv ) ) ) then
+              info = -11
+           else if( uctol .and. ( rwork( 1 )<=one ) ) then
+              info = -12
+           else if( ( lwork<( m+n ) ) .and. ( .not.lquery ) ) then
+              info = -13
+           else if( ( lrwork<max( n, 6 ) ) .and. ( .not.lquery ) ) then
+              info = -15
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGESVJ', -info )
+              return
+           else if ( lquery ) then
+              cwork(1) = m + n
+              rwork(1) = max( n, 6 )
+              return
+           end if
+       ! #:) quick return for void matrix
+           if( ( m==0 ) .or. ( n==0 ) )return
+           ! set numerical parameters
+           ! the stopping criterion for jacobi rotations is
+           ! max_{i<>j}|a(:,i)^* * a(:,j)| / (||a(:,i)||*||a(:,j)||) < ctol*eps
+           ! where eps is the round-off and ctol is defined as follows:
+           if( uctol ) then
+              ! ... user controlled
+              ctol = rwork( 1 )
+           else
+              ! ... default
+              if( lsvec .or. rsvec .or. applv ) then
+                 ctol = sqrt( real( m,KIND=dp) )
+              else
+                 ctol = real( m,KIND=dp)
+              end if
+           end if
+           ! ... and the machine dependent parameters are
+      ! [!]  (make sure that stdlib_slamch() works properly on the target machine.)
+           epsln = stdlib_dlamch( 'EPSILON' )
+           rooteps = sqrt( epsln )
+           sfmin = stdlib_dlamch( 'SAFEMINIMUM' )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / epsln
+           big = stdlib_dlamch( 'OVERFLOW' )
+           ! big         = one    / sfmin
+           rootbig = one / rootsfmin
+            ! large = big / sqrt( real( m*n,KIND=dp) )
+           bigtheta = one / rooteps
+           tol = ctol*epsln
+           roottol = sqrt( tol )
+           if( real( m,KIND=dp)*epsln>=one ) then
+              info = -4
+              call stdlib_xerbla( 'ZGESVJ', -info )
+              return
+           end if
+           ! initialize the right singular vector matrix.
+           if( rsvec ) then
+              mvl = n
+              call stdlib_zlaset( 'A', mvl, n, czero, cone, v, ldv )
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           ! initialize sva( 1:n ) = ( ||a e_i||_2, i = 1:n )
+      ! (!)  if necessary, scale a to protect the largest singular value
+           ! from overflow. it is possible that saving the largest singular
+           ! value destroys the information about the small ones.
+           ! this initial scaling is almost minimal in the sense that the
+           ! goal is to make sure that no column norm overflows, and that
+           ! sqrt(n)*max_i sva(i) does not overflow. if infinite entries
+           ! in a are detected, the procedure returns with info=-6.
+           skl = one / sqrt( real( m,KIND=dp)*real( n,KIND=dp) )
+           noscale = .true.
+           goscale = .true.
+           if( lower ) then
+              ! the input matrix is m-by-n lower triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_zlassq( m-p+1, a( p, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else if( upper ) then
+              ! the input matrix is m-by-n upper triangular (trapezoidal)
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_zlassq( p, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           else
+              ! the input matrix is m-by-n general dense
+              do p = 1, n
+                 aapp = zero
+                 aaqq = one
+                 call stdlib_zlassq( m, a( 1, p ), 1, aapp, aaqq )
+                 if( aapp>big ) then
+                    info = -6
+                    call stdlib_xerbla( 'ZGESVJ', -info )
+                    return
+                 end if
+                 aaqq = sqrt( aaqq )
+                 if( ( aapp<( big / aaqq ) ) .and. noscale ) then
+                    sva( p ) = aapp*aaqq
+                 else
+                    noscale = .false.
+                    sva( p ) = aapp*( aaqq*skl )
+                    if( goscale ) then
+                       goscale = .false.
+                       do q = 1, p - 1
+                          sva( q ) = sva( q )*skl
+                       end do
+                    end if
+                 end if
+              end do
+           end if
+           if( noscale )skl = one
+           ! move the smaller part of the spectrum from the underflow threshold
+      ! (!)  start by determining the position of the nonzero entries of the
+           ! array sva() relative to ( sfmin, big ).
+           aapp = zero
+           aaqq = big
+           do p = 1, n
+              if( sva( p )/=zero )aaqq = min( aaqq, sva( p ) )
+              aapp = max( aapp, sva( p ) )
+           end do
+       ! #:) quick return for zero matrix
+           if( aapp==zero ) then
+              if( lsvec )call stdlib_zlaset( 'G', m, n, czero, cone, a, lda )
+              rwork( 1 ) = one
+              rwork( 2 ) = zero
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+       ! #:) quick return for one-column matrix
+           if( n==1 ) then
+              if( lsvec )call stdlib_zlascl( 'G', 0, 0, sva( 1 ), skl, m, 1,a( 1, 1 ), lda, ierr )
+                        
+              rwork( 1 ) = one / skl
+              if( sva( 1 )>=sfmin ) then
+                 rwork( 2 ) = one
+              else
+                 rwork( 2 ) = zero
+              end if
+              rwork( 3 ) = zero
+              rwork( 4 ) = zero
+              rwork( 5 ) = zero
+              rwork( 6 ) = zero
+              return
+           end if
+           ! protect small singular values from underflow, and try to
+           ! avoid underflows/overflows in computing jacobi rotations.
+           sn = sqrt( sfmin / epsln )
+           temp1 = sqrt( big / real( n,KIND=dp) )
+           if( ( aapp<=sn ) .or. ( aaqq>=temp1 ) .or.( ( sn<=aaqq ) .and. ( aapp<=temp1 ) ) ) &
+                     then
+              temp1 = min( big, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp<=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / (aapp*sqrt( real(n,KIND=dp)) ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq>=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = max( sn / aaqq, temp1 / aapp )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else if( ( aaqq<=sn ) .and. ( aapp>=temp1 ) ) then
+              temp1 = min( sn / aaqq, big / ( sqrt( real( n,KIND=dp) )*aapp ) )
+               ! aaqq  = aaqq*temp1
+               ! aapp  = aapp*temp1
+           else
+              temp1 = one
+           end if
+           ! scale, if necessary
+           if( temp1/=one ) then
+              call stdlib_dlascl( 'G', 0, 0, one, temp1, n, 1, sva, n, ierr )
+           end if
+           skl = temp1*skl
+           if( skl/=one ) then
+              call stdlib_zlascl( joba, 0, 0, one, skl, m, n, a, lda, ierr )
+              skl = one / skl
+           end if
+           ! row-cyclic jacobi svd algorithm with column pivoting
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           do q = 1, n
+              cwork( q ) = cone
+           end do
+           swband = 3
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_zgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_zgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           if( ( lower .or. upper ) .and. ( n>max( 64, 4*kbl ) ) ) then
+      ! [tp] the number of partition levels and the actual partition are
+           ! tuning parameters.
+              n4 = n / 4
+              n2 = n / 2
+              n34 = 3*n4
+              if( applv ) then
+                 q = 0
+              else
+                 q = 1
+              end if
+              if( lower ) then
+           ! this works very well on lower triangular matrices, in particular
+           ! in the framework of the preconditioned jacobi svd (xgejsv).
+           ! the idea is simple:
+           ! [+ 0 0 0]   note that jacobi transformations of [0 0]
+           ! [+ + 0 0]                                       [0 0]
+           ! [+ + x 0]   actually work on [x 0]              [x 0]
+           ! [+ + x x]                    [x x].             [x x]
+                 call stdlib_zgsvj0( jobv, m-n34, n-n34, a( n34+1, n34+1 ), lda,cwork( n34+1 ), &
+                 sva( n34+1 ), mvl,v( n34*q+1, n34+1 ), ldv, epsln, sfmin, tol,2, cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_zgsvj0( jobv, m-n2, n34-n2, a( n2+1, n2+1 ), lda,cwork( n2+1 ), sva( &
+                 n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 2,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_zgsvj1( jobv, m-n2, n-n2, n4, a( n2+1, n2+1 ), lda,cwork( n2+1 ), &
+                 sva( n2+1 ), mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), &
+                           lwork-n, ierr )
+                 call stdlib_zgsvj0( jobv, m-n4, n2-n4, a( n4+1, n4+1 ), lda,cwork( n4+1 ), sva( &
+                 n4+1 ), mvl,v( n4*q+1, n4+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, &
+                           ierr )
+                 call stdlib_zgsvj0( jobv, m, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 1, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_zgsvj1( jobv, m, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+              else if( upper ) then
+                 call stdlib_zgsvj0( jobv, n4, n4, a, lda, cwork, sva, mvl, v, ldv,epsln, sfmin, &
+                           tol, 2, cwork( n+1 ), lwork-n,ierr )
+                 call stdlib_zgsvj0( jobv, n2, n4, a( 1, n4+1 ), lda, cwork( n4+1 ),sva( n4+1 ), &
+                 mvl, v( n4*q+1, n4+1 ), ldv,epsln, sfmin, tol, 1, cwork( n+1 ), lwork-n,ierr )
+                           
+                 call stdlib_zgsvj1( jobv, n2, n2, n4, a, lda, cwork, sva, mvl, v,ldv, epsln, &
+                           sfmin, tol, 1, cwork( n+1 ),lwork-n, ierr )
+                 call stdlib_zgsvj0( jobv, n2+n4, n4, a( 1, n2+1 ), lda,cwork( n2+1 ), sva( n2+1 )&
+                 , mvl,v( n2*q+1, n2+1 ), ldv, epsln, sfmin, tol, 1,cwork( n+1 ), lwork-n, ierr )
+                           
+              end if
+           end if
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_zswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_zswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = cwork(p)
+                          cwork(p) = cwork(q)
+                          cwork(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+      ! [!]     caveat:
+              ! unfortunately, some blas implementations compute stdlib_dznrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_cdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_dznrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_scnrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_dznrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_dznrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_zlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_zcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_zdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_zcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_zdotc( m, a(1, p ), 1,cwork(n+1), 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_zcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp, one, m,1, cwork(n+1), &
+                                                lda,ierr )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_zaxpy( m, -aapq, cwork(n+1), 1,a( 1, q ), 1 )
+                                                
+                                      call stdlib_zlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dznrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_zlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_zlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+                                   ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+                                ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_zcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_zdotc( m, cwork(n+1), 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_zcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq,one, m, 1,cwork(n+1), &
+                                                lda, ierr )
+                                      aapq = stdlib_zdotc( m, a( 1, p ), 1,cwork(n+1),  1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      cwork(p) = -cwork(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_zcopy( m, a( 1, p ), 1,cwork(n+1), 1 )
+                                                   
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -aapq, cwork(n+1),1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_zlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_zcopy( m, a( 1, q ), 1,cwork(n+1), 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, cwork(n+1)&
+                                                   ,lda,ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -conjg(aapq),cwork(n+1), 1, a( 1, &
+                                                   p ), 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_zlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_zlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_zlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=dp) )*tol ) .and. ( real( n,&
+                        KIND=dp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the singular values and find how many are above
+           ! the underflow threshold.
+           n2 = 0
+           n4 = 0
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 call stdlib_zswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_zswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+              if( sva( p )/=zero ) then
+                 n4 = n4 + 1
+                 if( sva( p )*skl>sfmin )n2 = n2 + 1
+              end if
+           end do
+           if( sva( n )/=zero ) then
+              n4 = n4 + 1
+              if( sva( n )*skl>sfmin )n2 = n2 + 1
+           end if
+           ! normalize the left singular vectors.
+           if( lsvec .or. uctol ) then
+              do p = 1, n4
+                  ! call stdlib_zdscal( m, one / sva( p ), a( 1, p ), 1 )
+                 call stdlib_zlascl( 'G',0,0, sva(p), one, m, 1, a(1,p), m, ierr )
+              end do
+           end if
+           ! scale the product of jacobi rotations.
+           if( rsvec ) then
+                 do p = 1, n
+                    temp1 = one / stdlib_dznrm2( mvl, v( 1, p ), 1 )
+                    call stdlib_zdscal( mvl, temp1, v( 1, p ), 1 )
+                 end do
+           end if
+           ! undo scaling, if necessary (and possible).
+           if( ( ( skl>one ) .and. ( sva( 1 )<( big / skl ) ) ).or. ( ( skl<one ) .and. ( sva( &
+                     max( n2, 1 ) ) >( sfmin / skl ) ) ) ) then
+              do p = 1, n
+                 sva( p ) = skl*sva( p )
+              end do
+              skl = one
+           end if
+           rwork( 1 ) = skl
+           ! the singular values of a are skl*sva(1:n). if skl/=one
+           ! then some of the singular values may overflow or underflow and
+           ! the spectrum is given in this factored representation.
+           rwork( 2 ) = real( n4,KIND=dp)
+           ! n4 is the number of computed nonzero singular values of a.
+           rwork( 3 ) = real( n2,KIND=dp)
+           ! n2 is the number of singular values of a greater than sfmin.
+           ! if n2<n, sva(n2:n) contains zeros and/or denormalized numbers
+           ! that may carry some information.
+           rwork( 4 ) = real( i,KIND=dp)
+           ! i is the index of the last sweep before declaring convergence.
+           rwork( 5 ) = mxaapq
+           ! mxaapq is the largest absolute value of scaled pivots in the
+           ! last sweep
+           rwork( 6 ) = mxsinj
+           ! mxsinj is the largest absolute value of the sines of jacobi angles
+           ! in the last sweep
+           return
+     end subroutine stdlib_zgesvj
+
+     !> ZGGES3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, the generalized complex Schur
+     !> form (S, T), and optionally left and/or right Schur vectors (VSL
+     !> and VSR). This gives the generalized Schur factorization
+     !> (A,B) = ( (VSL)*S*(VSR)**H, (VSL)*T*(VSR)**H )
+     !> where (VSR)**H is the conjugate-transpose of VSR.
+     !> Optionally, it also orders the eigenvalues so that a selected cluster
+     !> of eigenvalues appears in the leading diagonal blocks of the upper
+     !> triangular matrix S and the upper triangular matrix T. The leading
+     !> columns of VSL and VSR then form an unitary basis for the
+     !> corresponding left and right eigenspaces (deflating subspaces).
+     !> (If only the generalized eigenvalues are needed, use the driver
+     !> ZGGEV instead, which is faster.)
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar w
+     !> or a ratio alpha/beta = w, such that  A - w*B is singular.  It is
+     !> usually represented as the pair (alpha,beta), as there is a
+     !> reasonable interpretation for beta=0, and even for both being zero.
+     !> A pair of matrices (S,T) is in generalized complex Schur form if S
+     !> and T are upper triangular and, in addition, the diagonal elements
+     !> of T are non-negative real numbers.
+
+     subroutine stdlib_zgges3( jobvsl, jobvsr, sort, selctg, n, a, lda, b,ldb, sdim, alpha, beta, &
+               vsl, ldvsl, vsr, ldvsr,work, lwork, rwork, bwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvsl, jobvsr, sort
+           integer(ilp), intent(out) :: info, sdim
+           integer(ilp), intent(in) :: lda, ldb, ldvsl, ldvsr, lwork, n
+           ! Array Arguments 
+           logical(lk), intent(out) :: bwork(*)
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vsl(ldvsl,*), vsr(ldvsr,*), work(*)
+                     
+           ! Function Arguments 
+           procedure(stdlib_selctg_z) :: selctg
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: cursl, ilascl, ilbscl, ilvsl, ilvsr, lastsl, lquery, wantst
+           integer(ilp) :: i, icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, iright, irows, irwrk, &
+                     itau, iwrk, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, pvsl, pvsr, smlnum
+           ! Local Arrays 
+           integer(ilp) :: idum(1)
+           real(dp) :: dif(2)
+           ! Intrinsic Functions 
+           intrinsic :: max,sqrt
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvsl, 'N' ) ) then
+              ijobvl = 1
+              ilvsl = .false.
+           else if( stdlib_lsame( jobvsl, 'V' ) ) then
+              ijobvl = 2
+              ilvsl = .true.
+           else
+              ijobvl = -1
+              ilvsl = .false.
+           end if
+           if( stdlib_lsame( jobvsr, 'N' ) ) then
+              ijobvr = 1
+              ilvsr = .false.
+           else if( stdlib_lsame( jobvsr, 'V' ) ) then
+              ijobvr = 2
+              ilvsr = .true.
+           else
+              ijobvr = -1
+              ilvsr = .false.
+           end if
+           wantst = stdlib_lsame( sort, 'S' )
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( ( .not.wantst ) .and. ( .not.stdlib_lsame( sort, 'N' ) ) ) then
+              info = -3
+           else if( n<0 ) then
+              info = -5
+           else if( lda<max( 1, n ) ) then
+              info = -7
+           else if( ldb<max( 1, n ) ) then
+              info = -9
+           else if( ldvsl<1 .or. ( ilvsl .and. ldvsl<n ) ) then
+              info = -14
+           else if( ldvsr<1 .or. ( ilvsr .and. ldvsr<n ) ) then
+              info = -16
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -18
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_zgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1,  n + int( work( 1 ),KIND=ilp) )
+              call stdlib_zunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              if( ilvsl ) then
+                 call stdlib_zungqr( n, n, n, vsl, ldvsl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              end if
+              call stdlib_zgghd3( jobvsl, jobvsr, n, 1, n, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr, &
+                        work, -1, ierr )
+              lwkopt = max( lwkopt, n + int( work( 1 ),KIND=ilp) )
+              call stdlib_zlaqz0( 'S', jobvsl, jobvsr, n, 1, n, a, lda, b, ldb,alpha, beta, vsl, &
+                        ldvsl, vsr, ldvsr, work, -1,rwork, 0, ierr )
+              lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              if( wantst ) then
+                 call stdlib_ztgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb,alpha, beta, vsl, &
+                           ldvsl, vsr, ldvsr, sdim,pvsl, pvsr, dif, work, -1, idum, 1, ierr )
+                 lwkopt = max( lwkopt, int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGES3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 ) then
+              sdim = 0
+              return
+           end if
+           ! get machine constants
+           eps = stdlib_dlamch( 'P' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrix to make it more nearly triangular
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_zggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           icols = n + 1 - ilo
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vsl
+           if( ilvsl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vsl, ldvsl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vsl( ilo+1, ilo )&
+                           , ldvsl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vsl( ilo, ilo ), ldvsl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vsr
+           if( ilvsr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vsr, ldvsr )
+           ! reduce to generalized hessenberg form
+           call stdlib_zgghd3( jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb, vsl,ldvsl, vsr, ldvsr,&
+                      work( iwrk ), lwork+1-iwrk, ierr )
+           sdim = 0
+           ! perform qz algorithm, computing schur vectors if desired
+           iwrk = itau
+           call stdlib_zlaqz0( 'S', jobvsl, jobvsr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vsl, &
+                     ldvsl, vsr, ldvsr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 30
+           end if
+           ! sort eigenvalues alpha/beta if desired
+           if( wantst ) then
+              ! undo scaling on eigenvalues before selecting
+              if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, 1, alpha, n, ierr )
+                        
+              if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, 1, beta, n, ierr )
+                        
+              ! select eigenvalues
+              do i = 1, n
+                 bwork( i ) = selctg( alpha( i ), beta( i ) )
+              end do
+              call stdlib_ztgsen( 0, ilvsl, ilvsr, bwork, n, a, lda, b, ldb, alpha,beta, vsl, &
+              ldvsl, vsr, ldvsr, sdim, pvsl, pvsr,dif, work( iwrk ), lwork-iwrk+1, idum, 1, ierr )
+                        
+              if( ierr==1 )info = n + 3
+           end if
+           ! apply back-permutation to vsl and vsr
+           if( ilvsl )call stdlib_zggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsl, ldvsl, ierr )
+           if( ilvsr )call stdlib_zggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     n, vsr, ldvsr, ierr )
+           ! undo scaling
+           if( ilascl ) then
+              call stdlib_zlascl( 'U', 0, 0, anrmto, anrm, n, n, a, lda, ierr )
+              call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           end if
+           if( ilbscl ) then
+              call stdlib_zlascl( 'U', 0, 0, bnrmto, bnrm, n, n, b, ldb, ierr )
+              call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           end if
+           if( wantst ) then
+              ! check if reordering is correct
+              lastsl = .true.
+              sdim = 0
+              do i = 1, n
+                 cursl = selctg( alpha( i ), beta( i ) )
+                 if( cursl )sdim = sdim + 1
+                 if( cursl .and. .not.lastsl )info = n + 2
+                 lastsl = cursl
+              end do
+           end if
+           30 continue
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zgges3
+
+     !> ZGGEV3: computes for a pair of N-by-N complex nonsymmetric matrices
+     !> (A,B), the generalized eigenvalues, and optionally, the left and/or
+     !> right generalized eigenvectors.
+     !> A generalized eigenvalue for a pair of matrices (A,B) is a scalar
+     !> lambda or a ratio alpha/beta = lambda, such that A - lambda*B is
+     !> singular. It is usually represented as the pair (alpha,beta), as
+     !> there is a reasonable interpretation for beta=0, and even for both
+     !> being zero.
+     !> The right generalized eigenvector v(j) corresponding to the
+     !> generalized eigenvalue lambda(j) of (A,B) satisfies
+     !> A * v(j) = lambda(j) * B * v(j).
+     !> The left generalized eigenvector u(j) corresponding to the
+     !> generalized eigenvalues lambda(j) of (A,B) satisfies
+     !> u(j)**H * A = lambda(j) * u(j)**H * B
+     !> where u(j)**H is the conjugate-transpose of u(j).
+
+     subroutine stdlib_zggev3( jobvl, jobvr, n, a, lda, b, ldb, alpha, beta,vl, ldvl, vr, ldvr, &
+               work, lwork, rwork, info )
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: jobvl, jobvr
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, ldvl, ldvr, lwork, n
+           ! Array Arguments 
+           real(dp), intent(out) :: rwork(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: alpha(*), beta(*), vl(ldvl,*), vr(ldvr,*), work(*)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           logical(lk) :: ilascl, ilbscl, ilv, ilvl, ilvr, lquery
+           character :: chtemp
+           integer(ilp) :: icols, ierr, ihi, ijobvl, ijobvr, ileft, ilo, in, iright, irows, irwrk,&
+                      itau, iwrk, jc, jr, lwkopt
+           real(dp) :: anrm, anrmto, bignum, bnrm, bnrmto, eps, smlnum, temp
+           complex(dp) :: x
+           ! Local Arrays 
+           logical(lk) :: ldumma(1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,aimag,max,sqrt
+           ! Statement Functions 
+           real(dp) :: abs1
+           ! Statement Function Definitions 
+           abs1( x ) = abs( real( x,KIND=dp) ) + abs( aimag( x ) )
+           ! Executable Statements 
+           ! decode the input arguments
+           if( stdlib_lsame( jobvl, 'N' ) ) then
+              ijobvl = 1
+              ilvl = .false.
+           else if( stdlib_lsame( jobvl, 'V' ) ) then
+              ijobvl = 2
+              ilvl = .true.
+           else
+              ijobvl = -1
+              ilvl = .false.
+           end if
+           if( stdlib_lsame( jobvr, 'N' ) ) then
+              ijobvr = 1
+              ilvr = .false.
+           else if( stdlib_lsame( jobvr, 'V' ) ) then
+              ijobvr = 2
+              ilvr = .true.
+           else
+              ijobvr = -1
+              ilvr = .false.
+           end if
+           ilv = ilvl .or. ilvr
+           ! test the input arguments
+           info = 0
+           lquery = ( lwork==-1 )
+           if( ijobvl<=0 ) then
+              info = -1
+           else if( ijobvr<=0 ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -7
+           else if( ldvl<1 .or. ( ilvl .and. ldvl<n ) ) then
+              info = -11
+           else if( ldvr<1 .or. ( ilvr .and. ldvr<n ) ) then
+              info = -13
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -15
+           end if
+           ! compute workspace
+           if( info==0 ) then
+              call stdlib_zgeqrf( n, n, b, ldb, work, work, -1, ierr )
+              lwkopt = max( 1,  n+int( work( 1 ),KIND=ilp) )
+              call stdlib_zunmqr( 'L', 'C', n, n, n, b, ldb, work, a, lda, work,-1, ierr )
+              lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              if( ilvl ) then
+                 call stdlib_zungqr( n, n, n, vl, ldvl, work, work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              if( ilv ) then
+                 call stdlib_zgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_zlaqz0( 'S', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              else
+                 call stdlib_zgghd3( jobvl, jobvr, n, 1, n, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                           work, -1, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+                 call stdlib_zlaqz0( 'E', jobvl, jobvr, n, 1, n, a, lda, b, ldb,alpha, beta, vl, &
+                           ldvl, vr, ldvr, work, -1,rwork, 0, ierr )
+                 lwkopt = max( lwkopt, n+int( work( 1 ),KIND=ilp) )
+              end if
+              work( 1 ) = cmplx( lwkopt,KIND=dp)
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGGEV3 ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return if possible
+           if( n==0 )return
+           ! get machine constants
+           eps = stdlib_dlamch( 'E' )*stdlib_dlamch( 'B' )
+           smlnum = stdlib_dlamch( 'S' )
+           bignum = one / smlnum
+           call stdlib_dlabad( smlnum, bignum )
+           smlnum = sqrt( smlnum ) / eps
+           bignum = one / smlnum
+           ! scale a if max element outside range [smlnum,bignum]
+           anrm = stdlib_zlange( 'M', n, n, a, lda, rwork )
+           ilascl = .false.
+           if( anrm>zero .and. anrm<smlnum ) then
+              anrmto = smlnum
+              ilascl = .true.
+           else if( anrm>bignum ) then
+              anrmto = bignum
+              ilascl = .true.
+           end if
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrm, anrmto, n, n, a, lda, ierr )
+           ! scale b if max element outside range [smlnum,bignum]
+           bnrm = stdlib_zlange( 'M', n, n, b, ldb, rwork )
+           ilbscl = .false.
+           if( bnrm>zero .and. bnrm<smlnum ) then
+              bnrmto = smlnum
+              ilbscl = .true.
+           else if( bnrm>bignum ) then
+              bnrmto = bignum
+              ilbscl = .true.
+           end if
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrm, bnrmto, n, n, b, ldb, ierr )
+           ! permute the matrices a, b to isolate eigenvalues if possible
+           ileft = 1
+           iright = n + 1
+           irwrk = iright + n
+           call stdlib_zggbal( 'P', n, a, lda, b, ldb, ilo, ihi, rwork( ileft ),rwork( iright ), &
+                     rwork( irwrk ), ierr )
+           ! reduce b to triangular form (qr decomposition of b)
+           irows = ihi + 1 - ilo
+           if( ilv ) then
+              icols = n + 1 - ilo
+           else
+              icols = irows
+           end if
+           itau = 1
+           iwrk = itau + irows
+           call stdlib_zgeqrf( irows, icols, b( ilo, ilo ), ldb, work( itau ),work( iwrk ), lwork+&
+                     1-iwrk, ierr )
+           ! apply the orthogonal transformation to matrix a
+           call stdlib_zunmqr( 'L', 'C', irows, icols, irows, b( ilo, ilo ), ldb,work( itau ), a( &
+                     ilo, ilo ), lda, work( iwrk ),lwork+1-iwrk, ierr )
+           ! initialize vl
+           if( ilvl ) then
+              call stdlib_zlaset( 'FULL', n, n, czero, cone, vl, ldvl )
+              if( irows>1 ) then
+                 call stdlib_zlacpy( 'L', irows-1, irows-1, b( ilo+1, ilo ), ldb,vl( ilo+1, ilo ),&
+                            ldvl )
+              end if
+              call stdlib_zungqr( irows, irows, irows, vl( ilo, ilo ), ldvl,work( itau ), work( &
+                        iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! initialize vr
+           if( ilvr )call stdlib_zlaset( 'FULL', n, n, czero, cone, vr, ldvr )
+           ! reduce to generalized hessenberg form
+           if( ilv ) then
+              ! eigenvectors requested -- work on whole matrix.
+              call stdlib_zgghd3( jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb, vl,ldvl, vr, ldvr, &
+                        work( iwrk ), lwork+1-iwrk, ierr )
+           else
+              call stdlib_zgghd3( 'N', 'N', irows, 1, irows, a( ilo, ilo ), lda,b( ilo, ilo ), &
+                        ldb, vl, ldvl, vr, ldvr,work( iwrk ), lwork+1-iwrk, ierr )
+           end if
+           ! perform qz algorithm (compute eigenvalues, and optionally, the
+           ! schur form and schur vectors)
+           iwrk = itau
+           if( ilv ) then
+              chtemp = 'S'
+           else
+              chtemp = 'E'
+           end if
+           call stdlib_zlaqz0( chtemp, jobvl, jobvr, n, ilo, ihi, a, lda, b, ldb,alpha, beta, vl, &
+                     ldvl, vr, ldvr, work( iwrk ),lwork+1-iwrk, rwork( irwrk ), 0, ierr )
+           if( ierr/=0 ) then
+              if( ierr>0 .and. ierr<=n ) then
+                 info = ierr
+              else if( ierr>n .and. ierr<=2*n ) then
+                 info = ierr - n
+              else
+                 info = n + 1
+              end if
+              go to 70
+           end if
+           ! compute eigenvectors
+           if( ilv ) then
+              if( ilvl ) then
+                 if( ilvr ) then
+                    chtemp = 'B'
+                 else
+                    chtemp = 'L'
+                 end if
+              else
+                 chtemp = 'R'
+              end if
+              call stdlib_ztgevc( chtemp, 'B', ldumma, n, a, lda, b, ldb, vl, ldvl,vr, ldvr, n, &
+                        in, work( iwrk ), rwork( irwrk ),ierr )
+              if( ierr/=0 ) then
+                 info = n + 2
+                 go to 70
+              end if
+              ! undo balancing on vl and vr and normalization
+              if( ilvl ) then
+                 call stdlib_zggbak( 'P', 'L', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vl,&
+                            ldvl, ierr )
+                 loop_30: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vl( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_30
+                    temp = one / temp
+                    do jr = 1, n
+                       vl( jr, jc ) = vl( jr, jc )*temp
+                    end do
+                 end do loop_30
+              end if
+              if( ilvr ) then
+                 call stdlib_zggbak( 'P', 'R', n, ilo, ihi, rwork( ileft ),rwork( iright ), n, vr,&
+                            ldvr, ierr )
+                 loop_60: do jc = 1, n
+                    temp = zero
+                    do jr = 1, n
+                       temp = max( temp, abs1( vr( jr, jc ) ) )
+                    end do
+                    if( temp<smlnum )cycle loop_60
+                    temp = one / temp
+                    do jr = 1, n
+                       vr( jr, jc ) = vr( jr, jc )*temp
+                    end do
+                 end do loop_60
+              end if
+           end if
+           ! undo scaling if necessary
+           70 continue
+           if( ilascl )call stdlib_zlascl( 'G', 0, 0, anrmto, anrm, n, 1, alpha, n, ierr )
+           if( ilbscl )call stdlib_zlascl( 'G', 0, 0, bnrmto, bnrm, n, 1, beta, n, ierr )
+           work( 1 ) = cmplx( lwkopt,KIND=dp)
+           return
+     end subroutine stdlib_zggev3
+
+     !> ZGSVJ0: is called from ZGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as ZGESVJ does, but
+     !> it does not check convergence (stopping criterion). Few tuning
+     !> parameters (marked by [TP]) are available for the implementer.
+
+     pure subroutine stdlib_zgsvj0( jobv, m, n, a, lda, d, sva, mv, v, ldv, eps,sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, nsweep
+           real(dp), intent(in) :: eps, sfmin, tol
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(dp), intent(out) :: work(lwork)
+           real(dp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           
+           ! Local Scalars 
+           complex(dp) :: aapq, ompq
+           real(dp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, ierr, igl, ijblsk, ir1, iswrot, jbc, jgl, kbl, &
+                     lkahead, mvl, nbl, notrot, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,max,conjg,real,min,sign,sqrt
+           ! from lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( lda<m ) then
+              info = -5
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -8
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) ) ) then
+              info = -10
+           else if( tol<=eps ) then
+              info = -13
+           else if( nsweep<0 ) then
+              info = -14
+           else if( lwork<m ) then
+              info = -16
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGSVJ0', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! .. row-cyclic jacobi svd algorithm with column pivoting ..
+           emptsw = ( n*( n-1 ) ) / 2
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           swband = 0
+      ! [tp] swband is a tuning parameter [tp]. it is meaningful and effective
+           ! if stdlib_zgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_zgejsv. for sweeps i=1:swband the procedure
+           ! works on pivots inside a band-like region around the diagonal.
+           ! the boundaries are determined dynamically, based on the number of
+           ! pivots above a threshold.
+           kbl = min( 8, n )
+      ! [tp] kbl is a tuning parameter that defines the tile size in the
+           ! tiling of the p-q loops of pivot pairs. in general, an optimal
+           ! value of kbl depends on the matrix dimensions and on the
+           ! parameters of the computer's memory.
+           nbl = n / kbl
+           if( ( nbl*kbl )/=n )nbl = nbl + 1
+           blskip = kbl**2
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           lkahead = 1
+      ! [tp] lkahead is a tuning parameter.
+           ! quasi block transformations, using the lower (upper) triangular
+           ! structure of the input matrix. the quasi-block-cycling usually
+           ! invokes cubic convergence. big part of this cycle is done inside
+           ! canonical subspaces of dimensions less than m.
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nbl
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_1002: do ir1 = 0, min( lkahead, nbl-ibr )
+                    igl = igl + ir1*kbl
+                    loop_2001: do p = igl, min( igl+kbl-1, n-1 )
+           ! .. de rijk's pivoting
+                       q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+                       if( p/=q ) then
+                          call stdlib_zswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                          if( rsvec )call stdlib_zswap( mvl, v( 1, p ), 1,v( 1, q ), 1 )
+                          temp1 = sva( p )
+                          sva( p ) = sva( q )
+                          sva( q ) = temp1
+                          aapq = d(p)
+                          d(p) = d(q)
+                          d(q) = aapq
+                       end if
+                       if( ir1==0 ) then
+              ! column norms are periodically updated by explicit
+              ! norm computation.
+              ! caveat:
+              ! unfortunately, some blas implementations compute sncrm2(m,a(1,p),1)
+              ! as sqrt(s=stdlib_zdotc(m,a(1,p),1,a(1,p),1)), which may cause the result to
+              ! overflow for ||a(:,p)||_2 > sqrt(overflow_threshold), and to
+              ! underflow for ||a(:,p)||_2 < sqrt(underflow_threshold).
+              ! hence, stdlib_dznrm2 cannot be trusted, not even in the case when
+              ! the true norm is far from the under(over)flow boundaries.
+              ! if properly implemented stdlib_dznrm2 is available, the if-then-else-end if
+              ! below should be replaced with "aapp = stdlib_dznrm2( m, a(1,p), 1 )".
+                          if( ( sva( p )<rootbig ) .and.( sva( p )>rootsfmin ) ) then
+                             sva( p ) = stdlib_dznrm2( m, a( 1, p ), 1 )
+                          else
+                             temp1 = zero
+                             aapp = one
+                             call stdlib_zlassq( m, a( 1, p ), 1, temp1, aapp )
+                             sva( p ) = temp1*sqrt( aapp )
+                          end if
+                          aapp = sva( p )
+                       else
+                          aapp = sva( p )
+                       end if
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2002: do q = p + 1, min( igl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+                                if( aaqq>=one ) then
+                                   rotok = ( small*aapp )<=aaqq
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   rotok = aapp<=( aaqq / small )
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aapp ) / aaqq
+                                   else
+                                      call stdlib_zcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg( cwork(p) ) * cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                 ! Rotate
+      ! [rtd]      rotated = rotated + one
+                                   if( ir1==0 ) then
+                                      notrot = 0
+                                      pskipped = 0
+                                      iswrot = iswrot + 1
+                                   end if
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/aapq1
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if ( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if ( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                      else
+                    ! .. have to use modified gram-schmidt like transformation
+                                      call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp, one, m,1, work, lda,&
+                                                ierr )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq, one, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      call stdlib_zaxpy( m, -aapq, work, 1,a( 1, q ), 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, one, aaqq, m,1, a( 1, q ), &
+                                                lda, ierr )
+                                      sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                      mxsinj = max( mxsinj, sfmin )
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! recompute sva(q), sva(p).
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dznrm2( m, a( 1, q ), 1 )
+                                      else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_zlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_zlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                                else
+              ! a(:,p) and a(:,q) already numerically orthogonal
+                                   if( ir1==0 )notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                end if
+                             else
+              ! a(:,q) is zero column
+                                if( ir1==0 )notrot = notrot + 1
+                                pskipped = pskipped + 1
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                if( ir1==0 )aapp = -aapp
+                                notrot = 0
+                                go to 2103
+                             end if
+                          end do loop_2002
+           ! end q-loop
+           2103 continue
+           ! bailed out of q-loop
+                          sva( p ) = aapp
+                       else
+                          sva( p ) = aapp
+                          if( ( ir1==0 ) .and. ( aapp==zero ) )notrot = notrot + min( igl+kbl-1, &
+                                    n ) - p
+                       end if
+                    end do loop_2001
+           ! end of the p-loop
+           ! end of doing the block ( ibr, ibr )
+                 end do loop_1002
+           ! end of ir1-loop
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                 loop_2010: do jbc = ibr + 1, nbl
+                    jgl = ( jbc-1 )*kbl + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_zcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_zlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_zcopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_zlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_zlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_zlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_zlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=dp) )*tol ) .and. ( real( n,&
+                        KIND=dp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_zswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_zswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_zgsvj0
+
+     !> ZGSVJ1: is called from ZGESVJ as a pre-processor and that is its main
+     !> purpose. It applies Jacobi rotations in the same way as ZGESVJ does, but
+     !> it targets only particular pivots and it does not check convergence
+     !> (stopping criterion). Few tuning parameters (marked by [TP]) are
+     !> available for the implementer.
+     !> Further Details
+     !> ~~~~~~~~~~~~~~~
+     !> ZGSVJ1 applies few sweeps of Jacobi rotations in the column space of
+     !> the input M-by-N matrix A. The pivot pairs are taken from the (1,2)
+     !> off-diagonal block in the corresponding N-by-N Gram matrix A^T * A. The
+     !> block-entries (tiles) of the (1,2) off-diagonal block are marked by the
+     !> [x]'s in the following scheme:
+     !> | *  *  * [x] [x] [x]|
+     !> | *  *  * [x] [x] [x]|    Row-cycling in the nblr-by-nblc [x] blocks.
+     !> | *  *  * [x] [x] [x]|    Row-cyclic pivoting inside each [x] block.
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> |[x] [x] [x] *  *  * |
+     !> In terms of the columns of A, the first N1 columns are rotated 'against'
+     !> the remaining N-N1 columns, trying to increase the angle between the
+     !> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
+     !> tiled using quadratic tiles of side KBL. Here, KBL is a tuning parameter.
+     !> The number of sweeps is given in NSWEEP and the orthogonality threshold
+     !> is given in TOL.
+
+     pure subroutine stdlib_zgsvj1( jobv, m, n, n1, a, lda, d, sva, mv, v, ldv,eps, sfmin, tol, &
+               nsweep, work, lwork, info )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           real(dp), intent(in) :: eps, sfmin, tol
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldv, lwork, m, mv, n, n1, nsweep
+           character, intent(in) :: jobv
+           ! Array Arguments 
+           complex(dp), intent(inout) :: a(lda,*), d(n), v(ldv,*)
+           complex(dp), intent(out) :: work(lwork)
+           real(dp), intent(inout) :: sva(n)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           complex(dp) :: aapq, ompq
+           real(dp) :: aapp, aapp0, aapq1, aaqq, apoaq, aqoap, big, bigtheta, cs, mxaapq, mxsinj, &
+                     rootbig, rooteps, rootsfmin, roottol, small, sn, t, temp1, theta, thsign
+           integer(ilp) :: blskip, emptsw, i, ibr, igl, ierr, ijblsk, iswrot, jbc, jgl, kbl, mvl, &
+                     notrot, nblc, nblr, p, pskipped, q, rowskip, swband
+           logical(lk) :: applv, rotok, rsvec
+           ! Intrinsic Functions 
+           intrinsic :: abs,conjg,max,real,min,sign,sqrt
+           ! From Lapack
+           ! Executable Statements 
+           ! test the input parameters.
+           applv = stdlib_lsame( jobv, 'A' )
+           rsvec = stdlib_lsame( jobv, 'V' )
+           if( .not.( rsvec .or. applv .or. stdlib_lsame( jobv, 'N' ) ) ) then
+              info = -1
+           else if( m<0 ) then
+              info = -2
+           else if( ( n<0 ) .or. ( n>m ) ) then
+              info = -3
+           else if( n1<0 ) then
+              info = -4
+           else if( lda<m ) then
+              info = -6
+           else if( ( rsvec.or.applv ) .and. ( mv<0 ) ) then
+              info = -9
+           else if( ( rsvec.and.( ldv<n ) ).or.( applv.and.( ldv<mv ) )  ) then
+              info = -11
+           else if( tol<=eps ) then
+              info = -14
+           else if( nsweep<0 ) then
+              info = -15
+           else if( lwork<m ) then
+              info = -17
+           else
+              info = 0
+           end if
+           ! #:(
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZGSVJ1', -info )
+              return
+           end if
+           if( rsvec ) then
+              mvl = n
+           else if( applv ) then
+              mvl = mv
+           end if
+           rsvec = rsvec .or. applv
+           rooteps = sqrt( eps )
+           rootsfmin = sqrt( sfmin )
+           small = sfmin / eps
+           big = one / sfmin
+           rootbig = one / rootsfmin
+           ! large = big / sqrt( real( m*n,KIND=dp) )
+           bigtheta = one / rooteps
+           roottol = sqrt( tol )
+           ! Initialize The Right Singular Vector Matrix 
+           ! rsvec = stdlib_lsame( jobv, 'y' )
+           emptsw = n1*( n-n1 )
+           notrot = 0
+           ! .. row-cyclic pivot strategy with de rijk's pivoting ..
+           kbl = min( 8, n )
+           nblr = n1 / kbl
+           if( ( nblr*kbl )/=n1 )nblr = nblr + 1
+           ! .. the tiling is nblr-by-nblc [tiles]
+           nblc = ( n-n1 ) / kbl
+           if( ( nblc*kbl )/=( n-n1 ) )nblc = nblc + 1
+           blskip = ( kbl**2 ) + 1
+      ! [tp] blkskip is a tuning parameter that depends on swband and kbl.
+           rowskip = min( 5, kbl )
+      ! [tp] rowskip is a tuning parameter.
+           swband = 0
+      ! [tp] swband is a tuning parameter. it is meaningful and effective
+           ! if stdlib_zgesvj is used as a computational routine in the preconditioned
+           ! jacobi svd algorithm stdlib_zgejsv.
+           ! | *   *   * [x] [x] [x]|
+           ! | *   *   * [x] [x] [x]|    row-cycling in the nblr-by-nblc [x] blocks.
+           ! | *   *   * [x] [x] [x]|    row-cyclic pivoting inside each [x] block.
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           ! |[x] [x] [x] *   *   * |
+           loop_1993: do i = 1, nsweep
+           ! .. go go go ...
+              mxaapq = zero
+              mxsinj = zero
+              iswrot = 0
+              notrot = 0
+              pskipped = 0
+           ! each sweep is unrolled using kbl-by-kbl tiles over the pivot pairs
+           ! 1 <= p < q <= n. this is the first step toward a blocked implementation
+           ! of the rotations. new implementation, based on block transformations,
+           ! is under development.
+              loop_2000: do ibr = 1, nblr
+                 igl = ( ibr-1 )*kbl + 1
+       ! ... go to the off diagonal blocks
+                 igl = ( ibr-1 )*kbl + 1
+                  ! do 2010 jbc = ibr + 1, nbl
+                 loop_2010: do jbc = 1, nblc
+                    jgl = ( jbc-1 )*kbl + n1 + 1
+              ! doing the block at ( ibr, jbc )
+                    ijblsk = 0
+                    loop_2100: do p = igl, min( igl+kbl-1, n1 )
+                       aapp = sva( p )
+                       if( aapp>zero ) then
+                          pskipped = 0
+                          loop_2200: do q = jgl, min( jgl+kbl-1, n )
+                             aaqq = sva( q )
+                             if( aaqq>zero ) then
+                                aapp0 = aapp
+           ! M X 2 Jacobi Svd 
+              ! safe gram matrix computation
+                                if( aaqq>=one ) then
+                                   if( aapp>=aaqq ) then
+                                      rotok = ( small*aapp )<=aaqq
+                                   else
+                                      rotok = ( small*aaqq )<=aapp
+                                   end if
+                                   if( aapp<( big / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / &
+                                                aaqq ) / aapp
+                                   else
+                                      call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aapp,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, work, 1,a( 1, q ), 1 ) / &
+                                                aaqq
+                                   end if
+                                else
+                                   if( aapp>=aaqq ) then
+                                      rotok = aapp<=( aaqq / small )
+                                   else
+                                      rotok = aaqq<=( aapp / small )
+                                   end if
+                                   if( aapp>( small / aaqq ) ) then
+                                      aapq = ( stdlib_zdotc( m, a( 1, p ), 1,a( 1, q ), 1 ) / max(&
+                                                aaqq,aapp) )/ min(aaqq,aapp)
+                                   else
+                                      call stdlib_zcopy( m, a( 1, q ), 1,work, 1 )
+                                      call stdlib_zlascl( 'G', 0, 0, aaqq,one, m, 1,work, lda, &
+                                                ierr )
+                                      aapq = stdlib_zdotc( m, a( 1, p ), 1,work, 1 ) / &
+                                                aapp
+                                   end if
+                                end if
+                                 ! aapq = aapq * conjg(cwork(p))*cwork(q)
+                                aapq1  = -abs(aapq)
+                                mxaapq = max( mxaapq, -aapq1 )
+              ! to rotate or not to rotate, that is the question ...
+                                if( abs( aapq1 )>tol ) then
+                                   ompq = aapq / abs(aapq)
+                                   notrot = 0
+      ! [rtd]      rotated  = rotated + 1
+                                   pskipped = 0
+                                   iswrot = iswrot + 1
+                                   if( rotok ) then
+                                      aqoap = aaqq / aapp
+                                      apoaq = aapp / aaqq
+                                      theta = -half*abs( aqoap-apoaq )/ aapq1
+                                      if( aaqq>aapp0 )theta = -theta
+                                      if( abs( theta )>bigtheta ) then
+                                         t  = half / theta
+                                         cs = one
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *t )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*t )
+                                         end if
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         mxsinj = max( mxsinj, abs( t ) )
+                                      else
+                       ! Choose Correct Signum For Theta And Rotate
+                                         thsign = -sign( one, aapq1 )
+                                         if( aaqq>aapp0 )thsign = -thsign
+                                         t = one / ( theta+thsign*sqrt( one+theta*theta ) )
+                                                   
+                                         cs = sqrt( one / ( one+t*t ) )
+                                         sn = t*cs
+                                         mxsinj = max( mxsinj, abs( sn ) )
+                                         sva( q ) = aaqq*sqrt( max( zero,one+t*apoaq*aapq1 ) )
+                                                   
+                                         aapp = aapp*sqrt( max( zero,one-t*aqoap*aapq1 ) )
+                                         call stdlib_zrot( m, a(1,p), 1, a(1,q), 1,cs, conjg(ompq)&
+                                                   *sn )
+                                         if( rsvec ) then
+                                             call stdlib_zrot( mvl, v(1,p), 1,v(1,q), 1, cs, &
+                                                       conjg(ompq)*sn )
+                                         end if
+                                      end if
+                                      d(p) = -d(q) * ompq
+                                   else
+                    ! .. have to use modified gram-schmidt like transformation
+                                    if( aapp>aaqq ) then
+                                         call stdlib_zcopy( m, a( 1, p ), 1,work, 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -aapq, work,1, a( 1, q ), 1 )
+                                                   
+                                         call stdlib_zlascl( 'G', 0, 0, one, aaqq,m, 1, a( 1, q ),&
+                                                    lda,ierr )
+                                         sva( q ) = aaqq*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    else
+                                        call stdlib_zcopy( m, a( 1, q ), 1,work, 1 )
+                                         call stdlib_zlascl( 'G', 0, 0, aaqq, one,m, 1, work,lda,&
+                                                   ierr )
+                                         call stdlib_zlascl( 'G', 0, 0, aapp, one,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         call stdlib_zaxpy( m, -conjg(aapq),work, 1, a( 1, p ), 1 &
+                                                   )
+                                         call stdlib_zlascl( 'G', 0, 0, one, aapp,m, 1, a( 1, p ),&
+                                                    lda,ierr )
+                                         sva( p ) = aapp*sqrt( max( zero,one-aapq1*aapq1 ) )
+                                                   
+                                         mxsinj = max( mxsinj, sfmin )
+                                    end if
+                                   end if
+                 ! end if rotok then ... else
+                 ! in the case of cancellation in updating sva(q), sva(p)
+                 ! .. recompute sva(q), sva(p)
+                                   if( ( sva( q ) / aaqq )**2<=rooteps )then
+                                      if( ( aaqq<rootbig ) .and.( aaqq>rootsfmin ) ) then
+                                         sva( q ) = stdlib_dznrm2( m, a( 1, q ), 1)
+                                       else
+                                         t = zero
+                                         aaqq = one
+                                         call stdlib_zlassq( m, a( 1, q ), 1, t,aaqq )
+                                         sva( q ) = t*sqrt( aaqq )
+                                      end if
+                                   end if
+                                   if( ( aapp / aapp0 )**2<=rooteps ) then
+                                      if( ( aapp<rootbig ) .and.( aapp>rootsfmin ) ) then
+                                         aapp = stdlib_dznrm2( m, a( 1, p ), 1 )
+                                      else
+                                         t = zero
+                                         aapp = one
+                                         call stdlib_zlassq( m, a( 1, p ), 1, t,aapp )
+                                         aapp = t*sqrt( aapp )
+                                      end if
+                                      sva( p ) = aapp
+                                   end if
+                    ! end of ok rotation
+                                else
+                                   notrot = notrot + 1
+      ! [rtd]      skipped  = skipped  + 1
+                                   pskipped = pskipped + 1
+                                   ijblsk = ijblsk + 1
+                                end if
+                             else
+                                notrot = notrot + 1
+                                pskipped = pskipped + 1
+                                ijblsk = ijblsk + 1
+                             end if
+                             if( ( i<=swband ) .and. ( ijblsk>=blskip ) )then
+                                sva( p ) = aapp
+                                notrot = 0
+                                go to 2011
+                             end if
+                             if( ( i<=swband ) .and.( pskipped>rowskip ) ) then
+                                aapp = -aapp
+                                notrot = 0
+                                go to 2203
+                             end if
+                          end do loop_2200
+              ! end of the q-loop
+              2203 continue
+                          sva( p ) = aapp
+                       else
+                          if( aapp==zero )notrot = notrot +min( jgl+kbl-1, n ) - jgl + 1
+                          if( aapp<zero )notrot = 0
+                       end if
+                    end do loop_2100
+           ! end of the p-loop
+                 end do loop_2010
+           ! end of the jbc-loop
+           2011 continue
+      ! 2011 bailed out of the jbc-loop
+                 do p = igl, min( igl+kbl-1, n )
+                    sva( p ) = abs( sva( p ) )
+                 end do
+      ! **
+              end do loop_2000
+      ! 2000 :: end of the ibr-loop
+           ! .. update sva(n)
+              if( ( sva( n )<rootbig ) .and. ( sva( n )>rootsfmin ) )then
+                 sva( n ) = stdlib_dznrm2( m, a( 1, n ), 1 )
+              else
+                 t = zero
+                 aapp = one
+                 call stdlib_zlassq( m, a( 1, n ), 1, t, aapp )
+                 sva( n ) = t*sqrt( aapp )
+              end if
+           ! additional steering devices
+              if( ( i<swband ) .and. ( ( mxaapq<=roottol ) .or.( iswrot<=n ) ) )swband = i
+              if( ( i>swband+1 ) .and. ( mxaapq<sqrt( real( n,KIND=dp) )*tol ) .and. ( real( n,&
+                        KIND=dp)*mxaapq*mxsinj<tol ) ) then
+                 go to 1994
+              end if
+              if( notrot>=emptsw )go to 1994
+           end do loop_1993
+           ! end i=1:nsweep loop
+       ! #:( reaching this point means that the procedure has not converged.
+           info = nsweep - 1
+           go to 1995
+           1994 continue
+       ! #:) reaching this point means numerical convergence after the i-th
+           ! sweep.
+           info = 0
+       ! #:) info = 0 confirms successful iterations.
+       1995 continue
+           ! sort the vector sva() of column norms.
+           do p = 1, n - 1
+              q = stdlib_idamax( n-p+1, sva( p ), 1 ) + p - 1
+              if( p/=q ) then
+                 temp1 = sva( p )
+                 sva( p ) = sva( q )
+                 sva( q ) = temp1
+                 aapq = d( p )
+                 d( p ) = d( q )
+                 d( q ) = aapq
+                 call stdlib_zswap( m, a( 1, p ), 1, a( 1, q ), 1 )
+                 if( rsvec )call stdlib_zswap( mvl, v( 1, p ), 1, v( 1, q ), 1 )
+              end if
+           end do
+           return
+     end subroutine stdlib_zgsvj1
+
+     !> ZHESV_AA: computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**H * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**H,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is Hermitian and tridiagonal. The factored form
+     !> of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zhesv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_hetrf, lwkopt_hetrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_zhetrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_hetrf = int( work(1),KIND=ilp)
+              call stdlib_zhetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_hetrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_hetrf, lwkopt_hetrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHESV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**h*t*u or a = l*t*l**h.
+           call stdlib_zhetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zhetrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhesv_aa
+
+     !> ZHETRF_AA: computes the factorization of a complex hermitian matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**H*T*U  or  A = L*T*L**H
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a hermitian tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zhetrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'ZHETRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZHETRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              a( 1, 1 ) = real( a( 1, 1 ),KIND=dp)
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**h*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_zcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_zlahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_zlahef_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_zswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j, j+1 ) )
+                    a( j, j+1 ) = cone
+                    call stdlib_zcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_zscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_zgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',1, mj, jb+1,-cone,&
+                           a( j1-k2, j3 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j3, j3 ), lda )
+                                     
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_zgemm
+                       call stdlib_zgemm( 'CONJUGATE TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-&
+                       cone, a( j1-k2, j2 ), lda,work( (j3-j1+1)+k1*n ), n,cone, a( j2, j3 ), lda &
+                                 )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_zcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**h using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_zcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_zlahef;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_zlahef_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_zswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                ! if the first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = conjg( a( j+1, j ) )
+                    a( j+1, j ) = cone
+                    call stdlib_zcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_zscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=0 and k2=1 for the first panel,
+                     ! and k1=1 and k2=0 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_zgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',mj, 1, jb+1,-&
+                          cone, work( (j3-j1+1)+k1*n ), n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), &
+                                    lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block column with stdlib_zgemm
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'CONJUGATE TRANSPOSE',n-j3+1, nj, jb+1,-&
+                       cone, work( (j3-j1+1)+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda &
+                                 )
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = conjg( alpha )
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_zcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zhetrf_aa
+
+     !> ZHSEQR: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*T*(QZ)**H.
+
+     pure subroutine stdlib_zhseqr( job, compz, n, ilo, ihi, h, ldh, w, z, ldz,work, lwork, info )
+               
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ilo, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           character, intent(in) :: compz, job
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: w(*), work(*)
+        ! =====================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: nl = 49
+           real(dp), parameter :: rzero = 0.0_dp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_zlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== nl allocates some local workspace to help small matrices
+           ! .    through a rare stdlib_zlahqr failure.  nl > ntiny = 15 is
+           ! .    required and nl <= nmin = stdlib_ilaenv(ispec=12,...) is recom-
+           ! .    mended.  (the default value of nmin is 75.)  using nl = 49
+           ! .    allows up to six simultaneous shifts and a 16-by-16
+           ! .    deflation window.  ====
+           
+           
+           
+           ! Local Arrays 
+           complex(dp) :: hl(nl,nl), workl(nl)
+           ! Local Scalars 
+           integer(ilp) :: kbot, nmin
+           logical(lk) :: initz, lquery, wantt, wantz
+           ! Intrinsic Functions 
+           intrinsic :: real,cmplx,max,min
+           ! Executable Statements 
+           ! ==== decode and check the input parameters. ====
+           wantt = stdlib_lsame( job, 'S' )
+           initz = stdlib_lsame( compz, 'I' )
+           wantz = initz .or. stdlib_lsame( compz, 'V' )
+           work( 1 ) = cmplx( real( max( 1, n ),KIND=dp), rzero,KIND=dp)
+           lquery = lwork==-1
+           info = 0
+           if( .not.stdlib_lsame( job, 'E' ) .and. .not.wantt ) then
+              info = -1
+           else if( .not.stdlib_lsame( compz, 'N' ) .and. .not.wantz ) then
+              info = -2
+           else if( n<0 ) then
+              info = -3
+           else if( ilo<1 .or. ilo>max( 1, n ) ) then
+              info = -4
+           else if( ihi<min( ilo, n ) .or. ihi>n ) then
+              info = -5
+           else if( ldh<max( 1, n ) ) then
+              info = -7
+           else if( ldz<1 .or. ( wantz .and. ldz<max( 1, n ) ) ) then
+              info = -10
+           else if( lwork<max( 1, n ) .and. .not.lquery ) then
+              info = -12
+           end if
+           if( info/=0 ) then
+              ! ==== quick return in case of invalid argument. ====
+              call stdlib_xerbla( 'ZHSEQR', -info )
+              return
+           else if( n==0 ) then
+              ! ==== quick return in case n = 0; nothing to do. ====
+              return
+           else if( lquery ) then
+              ! ==== quick return in case of a workspace query ====
+              call stdlib_zlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi, z,ldz, work, &
+                        lwork, info )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( work( 1 ),KIND=dp), real( max( 1,n ),KIND=dp) ), &
+                        rzero,KIND=dp)
+              return
+           else
+              ! ==== copy eigenvalues isolated by stdlib_zgebal ====
+              if( ilo>1 )call stdlib_zcopy( ilo-1, h, ldh+1, w, 1 )
+              if( ihi<n )call stdlib_zcopy( n-ihi, h( ihi+1, ihi+1 ), ldh+1, w( ihi+1 ), 1 )
+                        
+              ! ==== initialize z, if requested ====
+              if( initz )call stdlib_zlaset( 'A', n, n, czero, cone, z, ldz )
+              ! ==== quick return if possible ====
+              if( ilo==ihi ) then
+                 w( ilo ) = h( ilo, ilo )
+                 return
+              end if
+              ! ==== stdlib_zlahqr/stdlib_zlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZHSEQR', job( : 1 ) // compz( : 1 ), n,ilo, ihi, lwork )
+                        
+              nmin = max( ntiny, nmin )
+              ! ==== stdlib_zlaqr0 for big matrices; stdlib_zlahqr for small ones ====
+              if( n>nmin ) then
+                 call stdlib_zlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, work, &
+                           lwork, info )
+              else
+                 ! ==== small matrix ====
+                 call stdlib_zlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, ilo, ihi,z, ldz, info )
+                           
+                 if( info>0 ) then
+                    ! ==== a rare stdlib_zlahqr failure!  stdlib_zlaqr0 sometimes succeeds
+                    ! .    when stdlib_zlahqr fails. ====
+                    kbot = info
+                    if( n>=nl ) then
+                       ! ==== larger matrices have enough subdiagonal scratch
+                       ! .    space to call stdlib_zlaqr0 directly. ====
+                       call stdlib_zlaqr0( wantt, wantz, n, ilo, kbot, h, ldh, w,ilo, ihi, z, ldz,&
+                                  work, lwork, info )
+                    else
+                       ! ==== tiny matrices don't have enough subdiagonal
+                       ! .    scratch space to benefit from stdlib_zlaqr0.  hence,
+                       ! .    tiny matrices must be copied into a larger
+                       ! .    array before calling stdlib_zlaqr0. ====
+                       call stdlib_zlacpy( 'A', n, n, h, ldh, hl, nl )
+                       hl( n+1, n ) = czero
+                       call stdlib_zlaset( 'A', nl, nl-n, czero, czero, hl( 1, n+1 ),nl )
+                       call stdlib_zlaqr0( wantt, wantz, nl, ilo, kbot, hl, nl, w,ilo, ihi, z, &
+                                 ldz, workl, nl, info )
+                       if( wantt .or. info/=0 )call stdlib_zlacpy( 'A', n, n, hl, nl, h, ldh )
+                                 
+                    end if
+                 end if
+              end if
+              ! ==== clear out the trash, if necessary. ====
+              if( ( wantt .or. info/=0 ) .and. n>2 )call stdlib_zlaset( 'L', n-2, n-2, czero, &
+                        czero, h( 3, 1 ), ldh )
+              ! ==== ensure reported workspace size is backward-compatible with
+              ! .    previous lapack versions. ====
+              work( 1 ) = cmplx( max( real( max( 1, n ),KIND=dp),real( work( 1 ),KIND=dp) ), &
+                        rzero,KIND=dp)
+           end if
+     end subroutine stdlib_zhseqr
+
+     !> DLAHEF_AA factorizes a panel of a complex hermitian matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_zlahef_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(dp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: real,conjg,max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_zhetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j, j:n) - h(j:n, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:n, j) has been initialized to be a(j, j:n)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_zlacgv( j-k1, a( 1, j ), 1 )
+                 call stdlib_zgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+                 call stdlib_zlacgv( j-k1, a( 1, j ), 1 )
+              end if
+              ! copy h(i:n, i) into work
+              call stdlib_zcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:n) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:n) stores u(j-1, j:n)
+                 alpha = -conjg( a( k-1, j ) )
+                 call stdlib_zaxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = real( work( 1 ),KIND=dp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l(j, (j+1):n)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):n) stores u(j, (j+1):n)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_zaxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_izamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:n) with a(i1+1:n, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_zswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    call stdlib_zlacgv( i2-i1, a( j1+i1-1, i1+1 ), lda )
+                    call stdlib_zlacgv( i2-i1-1, a( j1+i1, i2 ), 1 )
+                    ! swap a(i1, i2+1:n) with a(i2, i2+1:n)
+                    if( i2<m )call stdlib_zswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_zswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_zswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j:n, j),
+                    call stdlib_zcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_zcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_zscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_zlaset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_zhetrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:n, j) := a(j:n, j) - h(j:n, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:n, j) has been initialized to be a(j:n, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_zlacgv( j-k1, a( j, 1 ), lda )
+                 call stdlib_zgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+                 call stdlib_zlacgv( j-k1, a( j, 1 ), lda )
+              end if
+              ! copy h(j:n, j) into work
+              call stdlib_zcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:n, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -conjg( a( j, k-1 ) )
+                 call stdlib_zaxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = real( work( 1 ),KIND=dp)
+              if( j<m ) then
+                 ! compute work(2:n) = t(j, j) l((j+1):n, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):n, j-1) = l((j+1):n, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_zaxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:n)|)
+                 i2 = stdlib_izamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply hermitian pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:n, i1) with a(i2, i1+1:n)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_zswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    call stdlib_zlacgv( i2-i1, a( i1+1, j1+i1-1 ), 1 )
+                    call stdlib_zlacgv( i2-i1-1, a( i2, j1+i1 ), lda )
+                    ! swap a(i2+1:n, i1) with a(i2+1:n, i2)
+                    if( i2<m )call stdlib_zswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_zswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_zswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:n, j+1) into h(j+1:n, j),
+                    call stdlib_zcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:n ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:n, j) = l(j+2:n, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_zcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_zscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_zlaset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_zlahef_aa
+
+     !> ZLAQR0: computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_zlaqr0( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(dp), parameter :: wilk1 = 0.75_dp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_zlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(dp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(dp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(dp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,max,min,mod,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_zlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_zlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_zlaqr3 ====
+              call stdlib_zlaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_zlaqr5, stdlib_zlaqr3) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+                 return
+              end if
+              ! ==== stdlib_zlahqr/stdlib_zlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'ZLAQR0', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_zlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_zlaqr3
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_zlaqr3 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_zlaqr4 or
+                       ! .    stdlib_zlahqr on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_zlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          if( ns>nmin ) then
+                             call stdlib_zlaqr4( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, work, lwork, inf )
+                          else
+                             call stdlib_zlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( &
+                                       ks ), 1, 1,zdum, 1, inf )
+                          end if
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_zlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+     end subroutine stdlib_zlaqr0
+
+     !> Aggressive early deflation:
+     !> ZLAQR3: accepts as input an upper Hessenberg matrix
+     !> H and performs an unitary similarity transformation
+     !> designed to detect and deflate fully converged eigenvalues from
+     !> a trailing principal submatrix.  On output H has been over-
+     !> written by a new Hessenberg matrix that is a perturbation of
+     !> an unitary similarity transformation of H.  It is to be
+     !> hoped that the final version of H has many zero subdiagonal
+     !> entries.
+
+     pure subroutine stdlib_zlaqr3( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+               ns, nd, sh, v, ldv, nh, t, ldt,nv, wv, ldwv, work, lwork )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihiz, iloz, kbot, ktop, ldh, ldt, ldv, ldwv, ldz, lwork, n,&
+                      nh, nv, nw
+           integer(ilp), intent(out) :: nd, ns
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: sh(*), t(ldt,*), v(ldv,*), work(*), wv(ldwv,*)
+        ! ================================================================
+           ! Parameters 
+           real(dp), parameter :: rzero = 0.0_dp
+           real(dp), parameter :: rone = 1.0_dp
+           
+           
+           ! Local Scalars 
+           complex(dp) :: beta, cdum, s, tau
+           real(dp) :: foo, safmax, safmin, smlnum, ulp
+           integer(ilp) :: i, ifst, ilst, info, infqr, j, jw, kcol, kln, knt, krow, kwtop, ltop, &
+                     lwk1, lwk2, lwk3, lwkopt, nmin
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,conjg,aimag,int,max,min
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           ! ==== estimate optimal workspace. ====
+           jw = min( nw, kbot-ktop+1 )
+           if( jw<=2 ) then
+              lwkopt = 1
+           else
+              ! ==== workspace query call to stdlib_zgehrd ====
+              call stdlib_zgehrd( jw, 1, jw-1, t, ldt, work, work, -1, info )
+              lwk1 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_zunmhr ====
+              call stdlib_zunmhr( 'R', 'N', jw, jw, 1, jw-1, t, ldt, work, v, ldv,work, -1, info )
+                        
+              lwk2 = int( work( 1 ),KIND=ilp)
+              ! ==== workspace query call to stdlib_zlaqr4 ====
+              call stdlib_zlaqr4( .true., .true., jw, 1, jw, t, ldt, sh, 1, jw, v,ldv, work, -1, &
+                        infqr )
+              lwk3 = int( work( 1 ),KIND=ilp)
+              ! ==== optimal workspace ====
+              lwkopt = max( jw+max( lwk1, lwk2 ), lwk3 )
+           end if
+           ! ==== quick return in case of workspace query. ====
+           if( lwork==-1 ) then
+              work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+              return
+           end if
+           ! ==== nothing to do ...
+           ! ... for an empty active block ... ====
+           ns = 0
+           nd = 0
+           work( 1 ) = cone
+           if( ktop>kbot )return
+           ! ... nor for an empty deflation window. ====
+           if( nw<1 )return
+           ! ==== machine constants ====
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = rone / safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp) / ulp )
+           ! ==== setup deflation window ====
+           jw = min( nw, kbot-ktop+1 )
+           kwtop = kbot - jw + 1
+           if( kwtop==ktop ) then
+              s = czero
+           else
+              s = h( kwtop, kwtop-1 )
+           end if
+           if( kbot==kwtop ) then
+              ! ==== 1-by-1 deflation window: not much to do ====
+              sh( kwtop ) = h( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if( cabs1( s )<=max( smlnum, ulp*cabs1( h( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if( kwtop>ktop )h( kwtop, kwtop-1 ) = czero
+              end if
+              work( 1 ) = cone
+              return
+           end if
+           ! ==== convert to spike-triangular form.  (in case of a
+           ! .    rare qr failure, this routine continues to do
+           ! .    aggressive early deflation using that part of
+           ! .    the deflation window that converged using infqr
+           ! .    here and there to keep track.) ====
+           call stdlib_zlacpy( 'U', jw, jw, h( kwtop, kwtop ), ldh, t, ldt )
+           call stdlib_zcopy( jw-1, h( kwtop+1, kwtop ), ldh+1, t( 2, 1 ), ldt+1 )
+           call stdlib_zlaset( 'A', jw, jw, czero, cone, v, ldv )
+           nmin = stdlib_ilaenv( 12, 'ZLAQR3', 'SV', jw, 1, jw, lwork )
+           if( jw>nmin ) then
+              call stdlib_zlaqr4( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        work, lwork, infqr )
+           else
+              call stdlib_zlahqr( .true., .true., jw, 1, jw, t, ldt, sh( kwtop ), 1,jw, v, ldv, &
+                        infqr )
+           end if
+           ! ==== deflation detection loop ====
+           ns = jw
+           ilst = infqr + 1
+           do knt = infqr + 1, jw
+              ! ==== small spike tip deflation test ====
+              foo = cabs1( t( ns, ns ) )
+              if( foo==rzero )foo = cabs1( s )
+              if( cabs1( s )*cabs1( v( 1, ns ) )<=max( smlnum, ulp*foo ) )then
+                 ! ==== cone more converged eigenvalue ====
+                 ns = ns - 1
+              else
+                 ! ==== cone undeflatable eigenvalue.  move it up out of the
+                 ! .    way.   (stdlib_ztrexc can not fail in this case.) ====
+                 ifst = ns
+                 call stdlib_ztrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                 ilst = ilst + 1
+              end if
+           end do
+              ! ==== return to hessenberg form ====
+           if( ns==0 )s = czero
+           if( ns<jw ) then
+              ! ==== sorting the diagonal of t improves accuracy for
+              ! .    graded matrices.  ====
+              do i = infqr + 1, ns
+                 ifst = i
+                 do j = i + 1, ns
+                    if( cabs1( t( j, j ) )>cabs1( t( ifst, ifst ) ) )ifst = j
+                 end do
+                 ilst = i
+                 if( ifst/=ilst )call stdlib_ztrexc( 'V', jw, t, ldt, v, ldv, ifst, ilst, info )
+                           
+              end do
+           end if
+           ! ==== restore shift/eigenvalue array from t ====
+           do i = infqr + 1, jw
+              sh( kwtop+i-1 ) = t( i, i )
+           end do
+           if( ns<jw .or. s==czero ) then
+              if( ns>1 .and. s/=czero ) then
+                 ! ==== reflect spike back into lower triangle ====
+                 call stdlib_zcopy( ns, v, ldv, work, 1 )
+                 do i = 1, ns
+                    work( i ) = conjg( work( i ) )
+                 end do
+                 beta = work( 1 )
+                 call stdlib_zlarfg( ns, beta, work( 2 ), 1, tau )
+                 work( 1 ) = cone
+                 call stdlib_zlaset( 'L', jw-2, jw-2, czero, czero, t( 3, 1 ), ldt )
+                 call stdlib_zlarf( 'L', ns, jw, work, 1, conjg( tau ), t, ldt,work( jw+1 ) )
+                           
+                 call stdlib_zlarf( 'R', ns, ns, work, 1, tau, t, ldt,work( jw+1 ) )
+                 call stdlib_zlarf( 'R', jw, ns, work, 1, tau, v, ldv,work( jw+1 ) )
+                 call stdlib_zgehrd( jw, 1, ns, t, ldt, work, work( jw+1 ),lwork-jw, info )
+                           
+              end if
+              ! ==== copy updated reduced window into place ====
+              if( kwtop>1 )h( kwtop, kwtop-1 ) = s*conjg( v( 1, 1 ) )
+              call stdlib_zlacpy( 'U', jw, jw, t, ldt, h( kwtop, kwtop ), ldh )
+              call stdlib_zcopy( jw-1, t( 2, 1 ), ldt+1, h( kwtop+1, kwtop ),ldh+1 )
+              ! ==== accumulate orthogonal matrix in order update
+              ! .    h and z, if requested.  ====
+              if( ns>1 .and. s/=czero )call stdlib_zunmhr( 'R', 'N', jw, ns, 1, ns, t, ldt, work, &
+                        v, ldv,work( jw+1 ), lwork-jw, info )
+              ! ==== update vertical slab in h ====
+              if( wantt ) then
+                 ltop = 1
+              else
+                 ltop = ktop
+              end if
+              do krow = ltop, kwtop - 1, nv
+                 kln = min( nv, kwtop-krow )
+                 call stdlib_zgemm( 'N', 'N', kln, jw, jw, cone, h( krow, kwtop ),ldh, v, ldv, &
+                           czero, wv, ldwv )
+                 call stdlib_zlacpy( 'A', kln, jw, wv, ldwv, h( krow, kwtop ), ldh )
+              end do
+              ! ==== update horizontal slab in h ====
+              if( wantt ) then
+                 do kcol = kbot + 1, n, nh
+                    kln = min( nh, n-kcol+1 )
+                    call stdlib_zgemm( 'C', 'N', jw, kln, jw, cone, v, ldv,h( kwtop, kcol ), ldh, &
+                              czero, t, ldt )
+                    call stdlib_zlacpy( 'A', jw, kln, t, ldt, h( kwtop, kcol ),ldh )
+                 end do
+              end if
+              ! ==== update vertical slab in z ====
+              if( wantz ) then
+                 do krow = iloz, ihiz, nv
+                    kln = min( nv, ihiz-krow+1 )
+                    call stdlib_zgemm( 'N', 'N', kln, jw, jw, cone, z( krow, kwtop ),ldz, v, ldv, &
+                              czero, wv, ldwv )
+                    call stdlib_zlacpy( 'A', kln, jw, wv, ldwv, z( krow, kwtop ),ldz )
+                 end do
+              end if
+           end if
+           ! ==== return the number of deflations ... ====
+           nd = jw - ns
+           ! ==== ... and the number of shifts. (subtracting
+           ! .    infqr from the spike length takes care
+           ! .    of the case of a rare qr failure while
+           ! .    calculating eigenvalues of the deflation
+           ! .    window.)  ====
+           ns = ns - infqr
+            ! ==== return optimal workspace. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+     end subroutine stdlib_zlaqr3
+
+     !> ZLAQR4: implements one level of recursion for ZLAQR0.
+     !> It is a complete implementation of the small bulge multi-shift
+     !> QR algorithm.  It may be called by ZLAQR0 and, for large enough
+     !> deflation window size, it may be called by ZLAQR3.  This
+     !> subroutine is identical to ZLAQR0 except that it calls ZLAQR2
+     !> instead of ZLAQR3.
+     !> ZLAQR4 computes the eigenvalues of a Hessenberg matrix H
+     !> and, optionally, the matrices T and Z from the Schur decomposition
+     !> H = Z T Z**H, where T is an upper triangular matrix (the
+     !> Schur form), and Z is the unitary matrix of Schur vectors.
+     !> Optionally Z may be postmultiplied into an input unitary
+     !> matrix Q so that this routine can give the Schur factorization
+     !> of a matrix A which has been reduced to the Hessenberg form H
+     !> by the unitary matrix Q:  A = Q*H*Q**H = (QZ)*H*(QZ)**H.
+
+     pure subroutine stdlib_zlaqr4( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, z, ldz, work,&
+                lwork, info )
+        ! -- lapack auxiliary routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           integer(ilp), intent(in) :: ihi, ihiz, ilo, iloz, ldh, ldz, lwork, n
+           integer(ilp), intent(out) :: info
+           logical(lk), intent(in) :: wantt, wantz
+           ! Array Arguments 
+           complex(dp), intent(inout) :: h(ldh,*), z(ldz,*)
+           complex(dp), intent(out) :: w(*), work(*)
+        ! ================================================================
+           ! Parameters 
+           integer(ilp), parameter :: ntiny = 15
+           integer(ilp), parameter :: kexnw = 5
+           integer(ilp), parameter :: kexsh = 6
+           real(dp), parameter :: wilk1 = 0.75_dp
+           ! ==== matrices of order ntiny or smaller must be processed by
+           ! .    stdlib_zlahqr because of insufficient subdiagonal scratch space.
+           ! .    (this is a hard limit.) ====
+           
+           ! ==== exceptional deflation windows:  try to cure rare
+           ! .    slow convergence by varying the size of the
+           ! .    deflation window after kexnw iterations. ====
+           
+           ! ==== exceptional shifts: try to cure rare slow convergence
+           ! .    with ad-hoc exceptional shifts every kexsh iterations.
+           ! .    ====
+           
+           ! ==== the constant wilk1 is used to form the exceptional
+           ! .    shifts. ====
+           
+           
+           
+           ! Local Scalars 
+           complex(dp) :: aa, bb, cc, cdum, dd, det, rtdisc, swap, tr2
+           real(dp) :: s
+           integer(ilp) :: i, inf, it, itmax, k, kacc22, kbot, kdu, ks, kt, ktop, ku, kv, kwh, &
+           kwtop, kwv, ld, ls, lwkopt, ndec, ndfl, nh, nho, nibble, nmin, ns, nsmax, nsr, nve, nw,&
+                      nwmax, nwr, nwupbd
+           logical(lk) :: sorted
+           character :: jbcmpz*2
+           ! Local Arrays 
+           complex(dp) :: zdum(1,1)
+           ! Intrinsic Functions 
+           intrinsic :: abs,real,cmplx,aimag,int,max,min,mod,sqrt
+           ! Statement Functions 
+           real(dp) :: cabs1
+           ! Statement Function Definitions 
+           cabs1( cdum ) = abs( real( cdum,KIND=dp) ) + abs( aimag( cdum ) )
+           ! Executable Statements 
+           info = 0
+           ! ==== quick return for n = 0: nothing to do. ====
+           if( n==0 ) then
+              work( 1 ) = cone
+              return
+           end if
+           if( n<=ntiny ) then
+              ! ==== tiny matrices must use stdlib_zlahqr. ====
+              lwkopt = 1
+              if( lwork/=-1 )call stdlib_zlahqr( wantt, wantz, n, ilo, ihi, h, ldh, w, iloz,ihiz, &
+                        z, ldz, info )
+           else
+              ! ==== use small bulge multi-shift qr with aggressive early
+              ! .    deflation on larger-than-tiny matrices. ====
+              ! ==== hope for the best. ====
+              info = 0
+              ! ==== set up job flags for stdlib_ilaenv. ====
+              if( wantt ) then
+                 jbcmpz( 1: 1 ) = 'S'
+              else
+                 jbcmpz( 1: 1 ) = 'E'
+              end if
+              if( wantz ) then
+                 jbcmpz( 2: 2 ) = 'V'
+              else
+                 jbcmpz( 2: 2 ) = 'N'
+              end if
+              ! ==== nwr = recommended deflation window size.  at this
+              ! .    point,  n > ntiny = 15, so there is enough
+              ! .    subdiagonal workspace for nwr>=2 as required.
+              ! .    (in fact, there is enough subdiagonal space for
+              ! .    nwr>=4.) ====
+              nwr = stdlib_ilaenv( 13, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nwr = max( 2, nwr )
+              nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+              ! ==== nsr = recommended number of simultaneous shifts.
+              ! .    at this point n > ntiny = 15, so there is at
+              ! .    enough subdiagonal workspace for nsr to be even
+              ! .    and greater than or equal to two as required. ====
+              nsr = stdlib_ilaenv( 15, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nsr = min( nsr, ( n-3 ) / 6, ihi-ilo )
+              nsr = max( 2, nsr-mod( nsr, 2 ) )
+              ! ==== estimate optimal workspace ====
+              ! ==== workspace query call to stdlib_zlaqr2 ====
+              call stdlib_zlaqr2( wantt, wantz, n, ilo, ihi, nwr+1, h, ldh, iloz,ihiz, z, ldz, ls,&
+                         ld, w, h, ldh, n, h, ldh, n, h,ldh, work, -1 )
+              ! ==== optimal workspace = max(stdlib_zlaqr5, stdlib_zlaqr2) ====
+              lwkopt = max( 3*nsr / 2, int( work( 1 ),KIND=ilp) )
+              ! ==== quick return in case of workspace query. ====
+              if( lwork==-1 ) then
+                 work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+                 return
+              end if
+              ! ==== stdlib_zlahqr/stdlib_zlaqr0 crossover point ====
+              nmin = stdlib_ilaenv( 12, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nmin = max( ntiny, nmin )
+              ! ==== nibble crossover point ====
+              nibble = stdlib_ilaenv( 14, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              nibble = max( 0, nibble )
+              ! ==== accumulate reflections during ttswp?  use block
+              ! .    2-by-2 structure during matrix-matrix multiply? ====
+              kacc22 = stdlib_ilaenv( 16, 'ZLAQR4', jbcmpz, n, ilo, ihi, lwork )
+              kacc22 = max( 0, kacc22 )
+              kacc22 = min( 2, kacc22 )
+              ! ==== nwmax = the largest possible deflation window for
+              ! .    which there is sufficient workspace. ====
+              nwmax = min( ( n-1 ) / 3, lwork / 2 )
+              nw = nwmax
+              ! ==== nsmax = the largest number of simultaneous shifts
+              ! .    for which there is sufficient workspace. ====
+              nsmax = min( ( n-3 ) / 6, 2*lwork / 3 )
+              nsmax = nsmax - mod( nsmax, 2 )
+              ! ==== ndfl: an iteration count restarted at deflation. ====
+              ndfl = 1
+              ! ==== itmax = iteration limit ====
+              itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )
+              ! ==== last row and column in the active block ====
+              kbot = ihi
+              ! ==== main loop ====
+              loop_70: do it = 1, itmax
+                 ! ==== done when kbot falls below ilo ====
+                 if( kbot<ilo )go to 80
+                 ! ==== locate active block ====
+                 do k = kbot, ilo + 1, -1
+                    if( h( k, k-1 )==czero )go to 20
+                 end do
+                 k = ilo
+                 20 continue
+                 ktop = k
+                 ! ==== select deflation window size:
+                 ! .    typical case:
+                 ! .      if possible and advisable, nibble the entire
+                 ! .      active block.  if not, use size min(nwr,nwmax)
+                 ! .      or min(nwr+1,nwmax) depending upon which has
+                 ! .      the smaller corresponding subdiagonal entry
+                 ! .      (a heuristic).
+                 ! .    exceptional case:
+                 ! .      if there have been no deflations in kexnw or
+                 ! .      more iterations, then vary the deflation window
+                 ! .      size.   at first, because, larger windows are,
+                 ! .      in general, more powerful than smaller ones,
+                 ! .      rapidly increase the window to the maximum possible.
+                 ! .      then, gradually reduce the window size. ====
+                 nh = kbot - ktop + 1
+                 nwupbd = min( nh, nwmax )
+                 if( ndfl<kexnw ) then
+                    nw = min( nwupbd, nwr )
+                 else
+                    nw = min( nwupbd, 2*nw )
+                 end if
+                 if( nw<nwmax ) then
+                    if( nw>=nh-1 ) then
+                       nw = nh
+                    else
+                       kwtop = kbot - nw + 1
+                       if( cabs1( h( kwtop, kwtop-1 ) )>cabs1( h( kwtop-1, kwtop-2 ) ) )nw = nw + &
+                                 1
+                    end if
+                 end if
+                 if( ndfl<kexnw ) then
+                    ndec = -1
+                 else if( ndec>=0 .or. nw>=nwupbd ) then
+                    ndec = ndec + 1
+                    if( nw-ndec<2 )ndec = 0
+                    nw = nw - ndec
+                 end if
+                 ! ==== aggressive early deflation:
+                 ! .    split workspace under the subdiagonal into
+                 ! .      - an nw-by-nw work array v in the lower
+                 ! .        left-hand-corner,
+                 ! .      - an nw-by-at-least-nw-but-more-is-better
+                 ! .        (nw-by-nho) horizontal work array along
+                 ! .        the bottom edge,
+                 ! .      - an at-least-nw-but-more-is-better (nhv-by-nw)
+                 ! .        vertical work array along the left-hand-edge.
+                 ! .        ====
+                 kv = n - nw + 1
+                 kt = nw + 1
+                 nho = ( n-nw-1 ) - kt + 1
+                 kwv = nw + 2
+                 nve = ( n-nw ) - kwv + 1
+                 ! ==== aggressive early deflation ====
+                 call stdlib_zlaqr2( wantt, wantz, n, ktop, kbot, nw, h, ldh, iloz,ihiz, z, ldz, &
+                 ls, ld, w, h( kv, 1 ), ldh, nho,h( kv, kt ), ldh, nve, h( kwv, 1 ), ldh, work,&
+                           lwork )
+                 ! ==== adjust kbot accounting for new deflations. ====
+                 kbot = kbot - ld
+                 ! ==== ks points to the shifts. ====
+                 ks = kbot - ls + 1
+                 ! ==== skip an expensive qr sweep if there is a (partly
+                 ! .    heuristic) reason to expect that many eigenvalues
+                 ! .    will deflate without it.  here, the qr sweep is
+                 ! .    skipped if many eigenvalues have just been deflated
+                 ! .    or if the remaining active block is small.
+                 if( ( ld==0 ) .or. ( ( 100*ld<=nw*nibble ) .and. ( kbot-ktop+1>min( nmin, nwmax )&
+                            ) ) ) then
+                    ! ==== ns = nominal number of simultaneous shifts.
+                    ! .    this may be lowered (slightly) if stdlib_zlaqr2
+                    ! .    did not provide that many shifts. ====
+                    ns = min( nsmax, nsr, max( 2, kbot-ktop ) )
+                    ns = ns - mod( ns, 2 )
+                    ! ==== if there have been no deflations
+                    ! .    in a multiple of kexsh iterations,
+                    ! .    then try exceptional shifts.
+                    ! .    otherwise use shifts provided by
+                    ! .    stdlib_zlaqr2 above or from the eigenvalues
+                    ! .    of a trailing principal submatrix. ====
+                    if( mod( ndfl, kexsh )==0 ) then
+                       ks = kbot - ns + 1
+                       do i = kbot, ks + 1, -2
+                          w( i ) = h( i, i ) + wilk1*cabs1( h( i, i-1 ) )
+                          w( i-1 ) = w( i )
+                       end do
+                    else
+                       ! ==== got ns/2 or fewer shifts? use stdlib_zlahqr
+                       ! .    on a trailing principal submatrix to
+                       ! .    get more. (since ns<=nsmax<=(n-3)/6,
+                       ! .    there is enough space below the subdiagonal
+                       ! .    to fit an ns-by-ns scratch array.) ====
+                       if( kbot-ks+1<=ns / 2 ) then
+                          ks = kbot - ns + 1
+                          kt = n - ns + 1
+                          call stdlib_zlacpy( 'A', ns, ns, h( ks, ks ), ldh,h( kt, 1 ), ldh )
+                                    
+                          call stdlib_zlahqr( .false., .false., ns, 1, ns,h( kt, 1 ), ldh, w( ks )&
+                                    , 1, 1, zdum,1, inf )
+                          ks = ks + inf
+                          ! ==== in case of a rare qr failure use
+                          ! .    eigenvalues of the trailing 2-by-2
+                          ! .    principal submatrix.  scale to avoid
+                          ! .    overflows, underflows and subnormals.
+                          ! .    (the scale factor s can not be czero,
+                          ! .    because h(kbot,kbot-1) is nonzero.) ====
+                          if( ks>=kbot ) then
+                             s = cabs1( h( kbot-1, kbot-1 ) ) +cabs1( h( kbot, kbot-1 ) ) +cabs1( &
+                                       h( kbot-1, kbot ) ) +cabs1( h( kbot, kbot ) )
+                             aa = h( kbot-1, kbot-1 ) / s
+                             cc = h( kbot, kbot-1 ) / s
+                             bb = h( kbot-1, kbot ) / s
+                             dd = h( kbot, kbot ) / s
+                             tr2 = ( aa+dd ) / two
+                             det = ( aa-tr2 )*( dd-tr2 ) - bb*cc
+                             rtdisc = sqrt( -det )
+                             w( kbot-1 ) = ( tr2+rtdisc )*s
+                             w( kbot ) = ( tr2-rtdisc )*s
+                             ks = kbot - 1
+                          end if
+                       end if
+                       if( kbot-ks+1>ns ) then
+                          ! ==== sort the shifts (helps a little) ====
+                          sorted = .false.
+                          do k = kbot, ks + 1, -1
+                             if( sorted )go to 60
+                             sorted = .true.
+                             do i = ks, k - 1
+                                if( cabs1( w( i ) )<cabs1( w( i+1 ) ) )then
+                                   sorted = .false.
+                                   swap = w( i )
+                                   w( i ) = w( i+1 )
+                                   w( i+1 ) = swap
+                                end if
+                             end do
+                          end do
+                          60 continue
+                       end if
+                    end if
+                    ! ==== if there are only two shifts, then use
+                    ! .    only cone.  ====
+                    if( kbot-ks+1==2 ) then
+                       if( cabs1( w( kbot )-h( kbot, kbot ) )<cabs1( w( kbot-1 )-h( kbot, kbot ) )&
+                                  ) then
+                          w( kbot-1 ) = w( kbot )
+                       else
+                          w( kbot ) = w( kbot-1 )
+                       end if
+                    end if
+                    ! ==== use up to ns of the the smallest magnitude
+                    ! .    shifts.  if there aren't ns shifts available,
+                    ! .    then use them all, possibly dropping cone to
+                    ! .    make the number of shifts even. ====
+                    ns = min( ns, kbot-ks+1 )
+                    ns = ns - mod( ns, 2 )
+                    ks = kbot - ns + 1
+                    ! ==== small-bulge multi-shift qr sweep:
+                    ! .    split workspace under the subdiagonal into
+                    ! .    - a kdu-by-kdu work array u in the lower
+                    ! .      left-hand-corner,
+                    ! .    - a kdu-by-at-least-kdu-but-more-is-better
+                    ! .      (kdu-by-nho) horizontal work array wh along
+                    ! .      the bottom edge,
+                    ! .    - and an at-least-kdu-but-more-is-better-by-kdu
+                    ! .      (nve-by-kdu) vertical work wv arrow along
+                    ! .      the left-hand-edge. ====
+                    kdu = 2*ns
+                    ku = n - kdu + 1
+                    kwh = kdu + 1
+                    nho = ( n-kdu+1-4 ) - ( kdu+1 ) + 1
+                    kwv = kdu + 4
+                    nve = n - kdu - kwv + 1
+                    ! ==== small-bulge multi-shift qr sweep ====
+                    call stdlib_zlaqr5( wantt, wantz, kacc22, n, ktop, kbot, ns,w( ks ), h, ldh, &
+                    iloz, ihiz, z, ldz, work,3, h( ku, 1 ), ldh, nve, h( kwv, 1 ), ldh,nho, h( ku,&
+                               kwh ), ldh )
+                 end if
+                 ! ==== note progress (or the lack of it). ====
+                 if( ld>0 ) then
+                    ndfl = 1
+                 else
+                    ndfl = ndfl + 1
+                 end if
+                 ! ==== end of main loop ====
+              end do loop_70
+              ! ==== iteration limit exceeded.  set info to show where
+              ! .    the problem occurred and exit. ====
+              info = kbot
+              80 continue
+           end if
+           ! ==== return the optimal value of lwork. ====
+           work( 1 ) = cmplx( lwkopt, 0,KIND=dp)
+     end subroutine stdlib_zlaqr4
+
+     !> ZLAQZ0: computes the eigenvalues of a real matrix pair (H,T),
+     !> where H is an upper Hessenberg matrix and T is upper triangular,
+     !> using the double-shift QZ method.
+     !> Matrix pairs of this type are produced by the reduction to
+     !> generalized upper Hessenberg form of a real matrix pair (A,B):
+     !> A = Q1*H*Z1**H,  B = Q1*T*Z1**H,
+     !> as computed by ZGGHRD.
+     !> If JOB='S', then the Hessenberg-triangular pair (H,T) is
+     !> also reduced to generalized Schur form,
+     !> H = Q*S*Z**H,  T = Q*P*Z**H,
+     !> where Q and Z are unitary matrices, P and S are an upper triangular
+     !> matrices.
+     !> Optionally, the unitary matrix Q from the generalized Schur
+     !> factorization may be postmultiplied into an input matrix Q1, and the
+     !> unitary matrix Z may be postmultiplied into an input matrix Z1.
+     !> If Q1 and Z1 are the unitary matrices from ZGGHRD that reduced
+     !> the matrix pair (A,B) to generalized upper Hessenberg form, then the
+     !> output matrices Q1*Q and Z1*Z are the unitary factors from the
+     !> generalized Schur factorization of (A,B):
+     !> A = (Q1*Q)*S*(Z1*Z)**H,  B = (Q1*Q)*P*(Z1*Z)**H.
+     !> To avoid overflow, eigenvalues of the matrix pair (H,T) (equivalently,
+     !> of (A,B)) are computed as a pair of values (alpha,beta), where alpha is
+     !> complex and beta real.
+     !> If beta is nonzero, lambda = alpha / beta is an eigenvalue of the
+     !> generalized nonsymmetric eigenvalue problem (GNEP)
+     !> A*x = lambda*B*x
+     !> and if alpha is nonzero, mu = beta / alpha is an eigenvalue of the
+     !> alternate form of the GNEP
+     !> mu*A*y = B*y.
+     !> Eigenvalues can be read directly from the generalized Schur
+     !> form:
+     !> alpha = S(i,i), beta = P(i,i).
+     !> Ref: C.B. Moler
+     !> Eigenvalue Problems", SIAM J. Numer. Anal., 10(1973),
+     !> pp. 241--256.
+     !> Ref: B. Kagstrom, D. Kressner, "Multishift Variants of the QZ
+     !> Algorithm with Aggressive Early Deflation", SIAM J. Numer.
+     !> Anal., 29(2006), pp. 199--227.
+     !> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
+     !> multipole rational QZ method with agressive early deflation"
+
+     recursive subroutine stdlib_zlaqz0( wants, wantq, wantz, n, ilo, ihi, a,lda, b, ldb, alpha, &
+               beta, q, ldq, z,ldz, work, lwork, rwork, rec,info )
+           ! arguments
+           character, intent( in ) :: wants, wantq, wantz
+           integer(ilp), intent( in ) :: n, ilo, ihi, lda, ldb, ldq, ldz, lwork,rec
+           integer(ilp), intent( out ) :: info
+           complex(dp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), &
+                     alpha( * ), beta( * ), work( * )
+           real(dp), intent( out ) :: rwork( * )
+           
+           
+           ! local scalars
+           real(dp) :: smlnum, ulp, safmin, safmax, c1, tempr
+           complex(dp) :: eshift, s1, temp
+           integer(ilp) :: istart, istop, iiter, maxit, istart2, k, ld, nshifts, nblock, nw, nmin,&
+            nibble, n_undeflated, n_deflated, ns, sweep_info, shiftpos, lworkreq, k2, istartm, &
+            istopm, iwants, iwantq, iwantz, norm_info, aed_info, nwr, nbr, nsr, itemp1, itemp2, &
+                      rcost
+           logical(lk) :: ilschur, ilq, ilz
+           character :: jbcmpz*3
+           if( stdlib_lsame( wants, 'E' ) ) then
+              ilschur = .false.
+              iwants = 1
+           else if( stdlib_lsame( wants, 'S' ) ) then
+              ilschur = .true.
+              iwants = 2
+           else
+              iwants = 0
+           end if
+           if( stdlib_lsame( wantq, 'N' ) ) then
+              ilq = .false.
+              iwantq = 1
+           else if( stdlib_lsame( wantq, 'V' ) ) then
+              ilq = .true.
+              iwantq = 2
+           else if( stdlib_lsame( wantq, 'I' ) ) then
+              ilq = .true.
+              iwantq = 3
+           else
+              iwantq = 0
+           end if
+           if( stdlib_lsame( wantz, 'N' ) ) then
+              ilz = .false.
+              iwantz = 1
+           else if( stdlib_lsame( wantz, 'V' ) ) then
+              ilz = .true.
+              iwantz = 2
+           else if( stdlib_lsame( wantz, 'I' ) ) then
+              ilz = .true.
+              iwantz = 3
+           else
+              iwantz = 0
+           end if
+           ! check argument values
+           info = 0
+           if( iwants==0 ) then
+              info = -1
+           else if( iwantq==0 ) then
+              info = -2
+           else if( iwantz==0 ) then
+              info = -3
+           else if( n<0 ) then
+              info = -4
+           else if( ilo<1 ) then
+              info = -5
+           else if( ihi>n .or. ihi<ilo-1 ) then
+              info = -6
+           else if( lda<n ) then
+              info = -8
+           else if( ldb<n ) then
+              info = -10
+           else if( ldq<1 .or. ( ilq .and. ldq<n ) ) then
+              info = -15
+           else if( ldz<1 .or. ( ilz .and. ldz<n ) ) then
+              info = -17
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ0', -info )
+              return
+           end if
+           ! quick return if possible
+           if( n<=0 ) then
+              work( 1 ) = real( 1,KIND=dp)
+              return
+           end if
+           ! get the parameters
+           jbcmpz( 1:1 ) = wants
+           jbcmpz( 2:2 ) = wantq
+           jbcmpz( 3:3 ) = wantz
+           nmin = stdlib_ilaenv( 12, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = stdlib_ilaenv( 13, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nwr = max( 2, nwr )
+           nwr = min( ihi-ilo+1, ( n-1 ) / 3, nwr )
+           nibble = stdlib_ilaenv( 14, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = stdlib_ilaenv( 15, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           nsr = min( nsr, ( n+6 ) / 9, ihi-ilo )
+           nsr = max( 2, nsr-mod( nsr, 2 ) )
+           rcost = stdlib_ilaenv( 17, 'ZLAQZ0', jbcmpz, n, ilo, ihi, lwork )
+           itemp1 = int( nsr/sqrt( 1+2*nsr/( real( rcost,KIND=dp)/100*n ) ),KIND=ilp)
+           itemp1 = ( ( itemp1-1 )/4 )*4+4
+           nbr = nsr+itemp1
+           if( n < nmin .or. rec >= 2 ) then
+              call stdlib_zhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q,&
+                         ldq, z, ldz, work, lwork, rwork,info )
+              return
+           end if
+           ! find out required workspace
+           ! workspace query to stdlib_zlaqz2
+           nw = max( nwr, nmin )
+           call stdlib_zlaqz2( ilschur, ilq, ilz, n, ilo, ihi, nw, a, lda, b, ldb,q, ldq, z, ldz, &
+           n_undeflated, n_deflated, alpha,beta, work, nw, work, nw, work, -1, rwork, rec,&
+                     aed_info )
+           itemp1 = int( work( 1 ),KIND=ilp)
+           ! workspace query to stdlib_zlaqz3
+           call stdlib_zlaqz3( ilschur, ilq, ilz, n, ilo, ihi, nsr, nbr, alpha,beta, a, lda, b, &
+                     ldb, q, ldq, z, ldz, work, nbr,work, nbr, work, -1, sweep_info )
+           itemp2 = int( work( 1 ),KIND=ilp)
+           lworkreq = max( itemp1+2*nw**2, itemp2+2*nbr**2 )
+           if ( lwork ==-1 ) then
+              work( 1 ) = real( lworkreq,KIND=dp)
+              return
+           else if ( lwork < lworkreq ) then
+              info = -19
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ0', info )
+              return
+           end if
+           ! initialize q and z
+           if( iwantq==3 ) call stdlib_zlaset( 'FULL', n, n, czero, cone, q,ldq )
+           if( iwantz==3 ) call stdlib_zlaset( 'FULL', n, n, czero, cone, z,ldz )
+           ! get machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp)/ulp )
+           istart = ilo
+           istop = ihi
+           maxit = 30*( ihi-ilo+1 )
+           ld = 0
+           do iiter = 1, maxit
+              if( iiter >= maxit ) then
+                 info = istop+1
+                 goto 80
+              end if
+              if ( istart+1 >= istop ) then
+                 istop = istart
+                 exit
+              end if
+              ! check deflations at the end
+              if ( abs( a( istop, istop-1 ) ) <= max( smlnum,ulp*( abs( a( istop, istop ) )+abs( &
+                        a( istop-1,istop-1 ) ) ) ) ) then
+                 a( istop, istop-1 ) = czero
+                 istop = istop-1
+                 ld = 0
+                 eshift = czero
+              end if
+              ! check deflations at the start
+              if ( abs( a( istart+1, istart ) ) <= max( smlnum,ulp*( abs( a( istart, istart ) )+&
+                        abs( a( istart+1,istart+1 ) ) ) ) ) then
+                 a( istart+1, istart ) = czero
+                 istart = istart+1
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( istart+1 >= istop ) then
+                 exit
+              end if
+              ! check interior deflations
+              istart2 = istart
+              do k = istop, istart+1, -1
+                 if ( abs( a( k, k-1 ) ) <= max( smlnum, ulp*( abs( a( k,k ) )+abs( a( k-1, k-1 ) &
+                           ) ) ) ) then
+                    a( k, k-1 ) = czero
+                    istart2 = k
+                    exit
+                 end if
+              end do
+              ! get range to apply rotations to
+              if ( ilschur ) then
+                 istartm = 1
+                 istopm = n
+              else
+                 istartm = istart2
+                 istopm = istop
+              end if
+              ! check infinite eigenvalues, this is done without blocking so might
+              ! slow down the method when many infinite eigenvalues are present
+              k = istop
+              do while ( k>=istart2 )
+                 tempr = zero
+                 if( k < istop ) then
+                    tempr = tempr+abs( b( k, k+1 ) )
+                 end if
+                 if( k > istart2 ) then
+                    tempr = tempr+abs( b( k-1, k ) )
+                 end if
+                 if( abs( b( k, k ) ) < max( smlnum, ulp*tempr ) ) then
+                    ! a diagonal element of b is negligable, move it
+                    ! to the top and deflate it
+                    do k2 = k, istart2+1, -1
+                       call stdlib_zlartg( b( k2-1, k2 ), b( k2-1, k2-1 ), c1, s1,temp )
+                       b( k2-1, k2 ) = temp
+                       b( k2-1, k2-1 ) = czero
+                       call stdlib_zrot( k2-2-istartm+1, b( istartm, k2 ), 1,b( istartm, k2-1 ), &
+                                 1, c1, s1 )
+                       call stdlib_zrot( min( k2+1, istop )-istartm+1, a( istartm,k2 ), 1, a( &
+                                 istartm, k2-1 ), 1, c1, s1 )
+                       if ( ilz ) then
+                          call stdlib_zrot( n, z( 1, k2 ), 1, z( 1, k2-1 ), 1, c1,s1 )
+                       end if
+                       if( k2<istop ) then
+                          call stdlib_zlartg( a( k2, k2-1 ), a( k2+1, k2-1 ), c1,s1, temp )
+                                    
+                          a( k2, k2-1 ) = temp
+                          a( k2+1, k2-1 ) = czero
+                          call stdlib_zrot( istopm-k2+1, a( k2, k2 ), lda, a( k2+1,k2 ), lda, c1, &
+                                    s1 )
+                          call stdlib_zrot( istopm-k2+1, b( k2, k2 ), ldb, b( k2+1,k2 ), ldb, c1, &
+                                    s1 )
+                          if( ilq ) then
+                             call stdlib_zrot( n, q( 1, k2 ), 1, q( 1, k2+1 ), 1,c1, conjg( s1 ) )
+                                       
+                          end if
+                       end if
+                    end do
+                    if( istart2<istop )then
+                       call stdlib_zlartg( a( istart2, istart2 ), a( istart2+1,istart2 ), c1, s1, &
+                                 temp )
+                       a( istart2, istart2 ) = temp
+                       a( istart2+1, istart2 ) = czero
+                       call stdlib_zrot( istopm-( istart2+1 )+1, a( istart2,istart2+1 ), lda, a( &
+                                 istart2+1,istart2+1 ), lda, c1, s1 )
+                       call stdlib_zrot( istopm-( istart2+1 )+1, b( istart2,istart2+1 ), ldb, b( &
+                                 istart2+1,istart2+1 ), ldb, c1, s1 )
+                       if( ilq ) then
+                          call stdlib_zrot( n, q( 1, istart2 ), 1, q( 1,istart2+1 ), 1, c1, conjg(&
+                                     s1 ) )
+                       end if
+                    end if
+                    istart2 = istart2+1
+                 end if
+                 k = k-1
+              end do
+              ! istart2 now points to the top of the bottom right
+              ! unreduced hessenberg block
+              if ( istart2 >= istop ) then
+                 istop = istart2-1
+                 ld = 0
+                 eshift = czero
+                 cycle
+              end if
+              nw = nwr
+              nshifts = nsr
+              nblock = nbr
+              if ( istop-istart2+1 < nmin ) then
+                 ! setting nw to the size of the subblock will make aed deflate
+                 ! all the eigenvalues. this is slightly more efficient than just
+                 ! using qz_small because the off diagonal part gets updated via blas.
+                 if ( istop-istart+1 < nmin ) then
+                    nw = istop-istart+1
+                    istart2 = istart
+                 else
+                    nw = istop-istart2+1
+                 end if
+              end if
+              ! time for aed
+              call stdlib_zlaqz2( ilschur, ilq, ilz, n, istart2, istop, nw, a, lda,b, ldb, q, ldq,&
+               z, ldz, n_undeflated, n_deflated,alpha, beta, work, nw, work( nw**2+1 ), nw,work( &
+                         2*nw**2+1 ), lwork-2*nw**2, rwork, rec,aed_info )
+              if ( n_deflated > 0 ) then
+                 istop = istop-n_deflated
+                 ld = 0
+                 eshift = czero
+              end if
+              if ( 100*n_deflated > nibble*( n_deflated+n_undeflated ) .or.istop-istart2+1 < nmin &
+                        ) then
+                 ! aed has uncovered many eigenvalues. skip a qz sweep and run
+                 ! aed again.
+                 cycle
+              end if
+              ld = ld+1
+              ns = min( nshifts, istop-istart2 )
+              ns = min( ns, n_undeflated )
+              shiftpos = istop-n_deflated-n_undeflated+1
+              if ( mod( ld, 6 ) == 0 ) then
+                 ! exceptional shift.  chosen for no particularly good reason.
+                 if( ( real( maxit,KIND=dp)*safmin )*abs( a( istop,istop-1 ) )<abs( a( istop-1, &
+                           istop-1 ) ) ) then
+                    eshift = a( istop, istop-1 )/b( istop-1, istop-1 )
+                 else
+                    eshift = eshift+cone/( safmin*real( maxit,KIND=dp) )
+                 end if
+                 alpha( shiftpos ) = cone
+                 beta( shiftpos ) = eshift
+                 ns = 1
+              end if
+              ! time for a qz sweep
+              call stdlib_zlaqz3( ilschur, ilq, ilz, n, istart2, istop, ns, nblock,alpha( &
+              shiftpos ), beta( shiftpos ), a, lda, b,ldb, q, ldq, z, ldz, work, nblock, work( &
+                        nblock**2+1 ), nblock, work( 2*nblock**2+1 ),lwork-2*nblock**2, sweep_info )
+           end do
+           ! call stdlib_zhgeqz to normalize the eigenvalue blocks and set the eigenvalues
+           ! if all the eigenvalues have been found, stdlib_zhgeqz will not do any iterations
+           ! and only normalize the blocks. in case of a rare convergence failure,
+           ! the single shift might perform better.
+        80 call stdlib_zhgeqz( wants, wantq, wantz, n, ilo, ihi, a, lda, b, ldb,alpha, beta, q, &
+                  ldq, z, ldz, work, lwork, rwork,norm_info )
+           info = norm_info
+     end subroutine stdlib_zlaqz0
+
+     !> ZLAQZ2: performs AED
+
+     recursive subroutine stdlib_zlaqz2( ilschur, ilq, ilz, n, ilo, ihi, nw,a, lda, b, ldb, q, &
+               ldq, z, ldz, ns,nd, alpha, beta, qc, ldqc, zc, ldzc,work, lwork, rwork, rec, info )
+           ! arguments
+           logical(lk), intent( in ) :: ilschur, ilq, ilz
+           integer(ilp), intent( in ) :: n, ilo, ihi, nw, lda, ldb, ldq, ldz,ldqc, ldzc, lwork, &
+                     rec
+           complex(dp), intent( inout ) :: a( lda, * ), b( ldb, * ), q( ldq,* ), z( ldz, * ), &
+                     alpha( * ), beta( * )
+           integer(ilp), intent( out ) :: ns, nd, info
+           complex(dp), intent(inout) :: qc(ldqc,*), zc(ldzc,*)
+           complex(dp), intent(out) :: work(*)
+           real(dp), intent(out) :: rwork(*)
+           
+           
+           ! local scalars
+           integer(ilp) :: jw, kwtop, kwbot, istopm, istartm, k, k2, ztgexc_info, ifst, ilst, &
+                     lworkreq, qz_small_info
+           real(dp) ::smlnum, ulp, safmin, safmax, c1, tempr
+           complex(dp) :: s, s1, temp
+           info = 0
+           ! set up deflation window
+           jw = min( nw, ihi-ilo+1 )
+           kwtop = ihi-jw+1
+           if ( kwtop == ilo ) then
+              s = czero
+           else
+              s = a( kwtop, kwtop-1 )
+           end if
+           ! determine required workspace
+           ifst = 1
+           ilst = jw
+           call stdlib_zlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+                      ldb, alpha, beta, qc, ldqc, zc,ldzc, work, -1, rwork, rec+1, qz_small_info )
+           lworkreq = int( work( 1 ),KIND=ilp)+2*jw**2
+           lworkreq = max( lworkreq, n*nw, 2*nw**2+n )
+           if ( lwork ==-1 ) then
+              ! workspace query, quick return
+              work( 1 ) = lworkreq
+              return
+           else if ( lwork < lworkreq ) then
+              info = -26
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZLAQZ2', -info )
+              return
+           end if
+           ! get machine constants
+           safmin = stdlib_dlamch( 'SAFE MINIMUM' )
+           safmax = one/safmin
+           call stdlib_dlabad( safmin, safmax )
+           ulp = stdlib_dlamch( 'PRECISION' )
+           smlnum = safmin*( real( n,KIND=dp)/ulp )
+           if ( ihi == kwtop ) then
+              ! 1 by 1 deflation window, just try a regular deflation
+              alpha( kwtop ) = a( kwtop, kwtop )
+              beta( kwtop ) = b( kwtop, kwtop )
+              ns = 1
+              nd = 0
+              if ( abs( s ) <= max( smlnum, ulp*abs( a( kwtop,kwtop ) ) ) ) then
+                 ns = 0
+                 nd = 1
+                 if ( kwtop > ilo ) then
+                    a( kwtop, kwtop-1 ) = czero
+                 end if
+              end if
+           end if
+           ! store window in case of convergence failure
+           call stdlib_zlacpy( 'ALL', jw, jw, a( kwtop, kwtop ), lda, work, jw )
+           call stdlib_zlacpy( 'ALL', jw, jw, b( kwtop, kwtop ), ldb, work( jw**2+1 ), jw )
+                     
+           ! transform window to real schur form
+           call stdlib_zlaset( 'FULL', jw, jw, czero, cone, qc, ldqc )
+           call stdlib_zlaset( 'FULL', jw, jw, czero, cone, zc, ldzc )
+           call stdlib_zlaqz0( 'S', 'V', 'V', jw, 1, jw, a( kwtop, kwtop ), lda,b( kwtop, kwtop ),&
+            ldb, alpha, beta, qc, ldqc, zc,ldzc, work( 2*jw**2+1 ), lwork-2*jw**2, rwork,rec+1, &
+                      qz_small_info )
+           if( qz_small_info /= 0 ) then
+              ! convergence failure, restore the window and exit
+              nd = 0
+              ns = jw-qz_small_info
+              call stdlib_zlacpy( 'ALL', jw, jw, work, jw, a( kwtop, kwtop ), lda )
+              call stdlib_zlacpy( 'ALL', jw, jw, work( jw**2+1 ), jw, b( kwtop,kwtop ), ldb )
+                        
+              return
+           end if
+           ! deflation detection loop
+           if ( kwtop == ilo .or. s == czero ) then
+              kwbot = kwtop-1
+           else
+              kwbot = ihi
+              k = 1
+              k2 = 1
+              do while ( k <= jw )
+                    ! try to deflate eigenvalue
+                    tempr = abs( a( kwbot, kwbot ) )
+                    if( tempr == zero ) then
+                       tempr = abs( s )
+                    end if
+                    if ( ( abs( s*qc( 1, kwbot-kwtop+1 ) ) ) <= max( ulp*tempr, smlnum ) ) &
+                              then
+                       ! deflatable
+                       kwbot = kwbot-1
+                    else
+                       ! not deflatable, move out of the way
+                       ifst = kwbot-kwtop+1
+                       ilst = k2
+                       call stdlib_ztgexc( .true., .true., jw, a( kwtop, kwtop ),lda, b( kwtop, &
+                                 kwtop ), ldb, qc, ldqc,zc, ldzc, ifst, ilst, ztgexc_info )
+                       k2 = k2+1
+                    end if
+                    k = k+1
+              end do
+           end if
+           ! store eigenvalues
+           nd = ihi-kwbot
+           ns = jw-nd
+           k = kwtop
+           do while ( k <= ihi )
+              alpha( k ) = a( k, k )
+              beta( k ) = b( k, k )
+              k = k+1
+           end do
+           if ( kwtop /= ilo .and. s /= czero ) then
+              ! reflect spike back, this will create optimally packed bulges
+              a( kwtop:kwbot, kwtop-1 ) = a( kwtop, kwtop-1 ) *conjg( qc( 1,1:jw-nd ) )
+              do k = kwbot-1, kwtop, -1
+                 call stdlib_zlartg( a( k, kwtop-1 ), a( k+1, kwtop-1 ), c1, s1,temp )
+                 a( k, kwtop-1 ) = temp
+                 a( k+1, kwtop-1 ) = czero
+                 k2 = max( kwtop, k-1 )
+                 call stdlib_zrot( ihi-k2+1, a( k, k2 ), lda, a( k+1, k2 ), lda, c1,s1 )
+                 call stdlib_zrot( ihi-( k-1 )+1, b( k, k-1 ), ldb, b( k+1, k-1 ),ldb, c1, s1 )
+                           
+                 call stdlib_zrot( jw, qc( 1, k-kwtop+1 ), 1, qc( 1, k+1-kwtop+1 ),1, c1, conjg( &
+                           s1 ) )
+              end do
+              ! chase bulges down
+              istartm = kwtop
+              istopm = ihi
+              k = kwbot-1
+              do while ( k >= kwtop )
+                 ! move bulge down and remove it
+                 do k2 = k, kwbot-1
+                    call stdlib_zlaqz1( .true., .true., k2, kwtop, kwtop+jw-1,kwbot, a, lda, b, &
+                              ldb, jw, kwtop, qc, ldqc,jw, kwtop, zc, ldzc )
+                 end do
+                 k = k-1
+              end do
+           end if
+           ! apply qc and zc to rest of the matrix
+           if ( ilschur ) then
+              istartm = 1
+              istopm = n
+           else
+              istartm = ilo
+              istopm = ihi
+           end if
+           if ( istopm-ihi > 0 ) then
+              call stdlib_zgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,a( kwtop, ihi+1 ), &
+                        lda, czero, work, jw )
+              call stdlib_zlacpy( 'ALL', jw, istopm-ihi, work, jw, a( kwtop,ihi+1 ), lda )
+              call stdlib_zgemm( 'C', 'N', jw, istopm-ihi, jw, cone, qc, ldqc,b( kwtop, ihi+1 ), &
+                        ldb, czero, work, jw )
+              call stdlib_zlacpy( 'ALL', jw, istopm-ihi, work, jw, b( kwtop,ihi+1 ), ldb )
+           end if
+           if ( ilq ) then
+              call stdlib_zgemm( 'N', 'N', n, jw, jw, cone, q( 1, kwtop ), ldq, qc,ldqc, czero, &
+                        work, n )
+              call stdlib_zlacpy( 'ALL', n, jw, work, n, q( 1, kwtop ), ldq )
+           end if
+           if ( kwtop-1-istartm+1 > 0 ) then
+              call stdlib_zgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, a( istartm,kwtop ), lda, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_zlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,a( istartm, kwtop )&
+                       , lda )
+              call stdlib_zgemm( 'N', 'N', kwtop-istartm, jw, jw, cone, b( istartm,kwtop ), ldb, &
+                        zc, ldzc, czero, work,kwtop-istartm )
+             call stdlib_zlacpy( 'ALL', kwtop-istartm, jw, work, kwtop-istartm,b( istartm, kwtop )&
+                       , ldb )
+           end if
+           if ( ilz ) then
+              call stdlib_zgemm( 'N', 'N', n, jw, jw, cone, z( 1, kwtop ), ldz, zc,ldzc, czero, &
+                        work, n )
+              call stdlib_zlacpy( 'ALL', n, jw, work, n, z( 1, kwtop ), ldz )
+           end if
+     end subroutine stdlib_zlaqz2
+
+     !> DLATRF_AA factorizes a panel of a complex symmetric matrix A using
+     !> the Aasen's algorithm. The panel consists of a set of NB rows of A
+     !> when UPLO is U, or a set of NB columns when UPLO is L.
+     !> In order to factorize the panel, the Aasen's algorithm requires the
+     !> last row, or column, of the previous panel. The first row, or column,
+     !> of A is set to be the first row, or column, of an identity matrix,
+     !> which is used to factorize the first panel.
+     !> The resulting J-th row of U, or J-th column of L, is stored in the
+     !> (J-1)-th row, or column, of A (without the unit diagonals), while
+     !> the diagonal and subdiagonal of A are overwritten by those of T.
+
+     pure subroutine stdlib_zlasyf_aa( uplo, j1, m, nb, a, lda, ipiv,h, ldh, work )
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: m, nb, j1, lda, ldh
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), h(ldh,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           integer(ilp) :: j, k, k1, i1, i2, mj
+           complex(dp) :: piv, alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           j = 1
+           ! k1 is the first column of the panel to be factorized
+           ! i.e.,  k1 is 2 for the first block column, and 1 for the rest of the blocks
+           k1 = (2-j1)+1
+           if( stdlib_lsame( uplo, 'U' ) ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              10 continue
+              if ( j>min(m, nb) )go to 20
+              ! k is the column to be factorized
+               ! when being called from stdlib_zsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j, j:m) - h(j:m, 1:(j-1)) * l(j1:(j-1), j),
+               ! where h(j:m, j) has been initialized to be a(j, j:m)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_zgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( 1, j ), 1,&
+                           cone, h( j, j ), 1 )
+              end if
+              ! copy h(i:m, i) into work
+              call stdlib_zcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j-1, j:m) * t(j-1,j),
+                  ! where a(j-1, j) stores t(j-1, j) and a(j-2, j:m) stores u(j-1, j:m)
+                 alpha = -a( k-1, j )
+                 call stdlib_zaxpy( mj, alpha, a( k-2, j ), lda, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( k, j ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l(j, (j+1):m)
+                  ! where a(j, j) stores t(j, j) and a(j-1, (j+1):m) stores u(j, (j+1):m)
+                 if( k>1 ) then
+                    alpha = -a( k, j )
+                    call stdlib_zaxpy( m-j, alpha, a( k-1, j+1 ), lda,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_izamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1, i1+1:m) with a(i1+1:m, i2)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_zswap( i2-i1-1, a( j1+i1-1, i1+1 ), lda,a( j1+i1, i2 ), 1 )
+                              
+                    ! swap a(i1, i2+1:m) with a(i2, i2+1:m)
+                    if( i2<m )call stdlib_zswap( m-i2, a( j1+i1-1, i2+1 ), lda,a( j1+i2-1, i2+1 ),&
+                               lda )
+                    ! swap a(i1, i1) with a(i2,i2)
+                    piv = a( i1+j1-1, i1 )
+                    a( j1+i1-1, i1 ) = a( j1+i2-1, i2 )
+                    a( j1+i2-1, i2 ) = piv
+                    ! swap h(i1, 1:j1) with h(i2, 1:j1)
+                    call stdlib_zswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_zswap( i1-k1+1, a( 1, i1 ), 1,a( 1, i2 ), 1 )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j, j+1) = t(j, j+1)
+                 a( k, j+1 ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j:m, j),
+                    call stdlib_zcopy( m-j, a( k+1, j+1 ), lda,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( k, j+1 )/=czero ) then
+                       alpha = cone / a( k, j+1 )
+                       call stdlib_zcopy( m-j-1, work( 3 ), 1, a( k, j+2 ), lda )
+                       call stdlib_zscal( m-j-1, alpha, a( k, j+2 ), lda )
+                    else
+                       call stdlib_zlaset( 'FULL', 1, m-j-1, czero, czero,a( k, j+2 ), lda)
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 10
+              20 continue
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              30 continue
+              if( j>min( m, nb ) )go to 40
+              ! k is the column to be factorized
+               ! when being called from stdlib_zsytrf_aa,
+               ! > for the first block column, j1 is 1, hence j1+j-1 is j,
+               ! > for the rest of the columns, j1 is 2, and j1+j-1 is j+1,
+              k = j1+j-1
+              if( j==m ) then
+                  ! only need to compute t(j, j)
+                  mj = 1
+              else
+                  mj = m-j+1
+              end if
+              ! h(j:m, j) := a(j:m, j) - h(j:m, 1:(j-1)) * l(j, j1:(j-1))^t,
+               ! where h(j:m, j) has been initialized to be a(j:m, j)
+              if( k>2 ) then
+              ! k is the column to be factorized
+               ! > for the first block column, k is j, skipping the first two
+                 ! columns
+               ! > for the rest of the columns, k is j+1, skipping only the
+                 ! first column
+                 call stdlib_zgemv( 'NO TRANSPOSE', mj, j-k1,-cone, h( j, k1 ), ldh,a( j, 1 ), &
+                           lda,cone, h( j, j ), 1 )
+              end if
+              ! copy h(j:m, j) into work
+              call stdlib_zcopy( mj, h( j, j ), 1, work( 1 ), 1 )
+              if( j>k1 ) then
+                 ! compute work := work - l(j:m, j-1) * t(j-1,j),
+                  ! where a(j-1, j) = t(j-1, j) and a(j, j-2) = l(j, j-1)
+                 alpha = -a( j, k-1 )
+                 call stdlib_zaxpy( mj, alpha, a( j, k-2 ), 1, work( 1 ), 1 )
+              end if
+              ! set a(j, j) = t(j, j)
+              a( j, k ) = work( 1 )
+              if( j<m ) then
+                 ! compute work(2:m) = t(j, j) l((j+1):m, j)
+                  ! where a(j, j) = t(j, j) and a((j+1):m, j-1) = l((j+1):m, j)
+                 if( k>1 ) then
+                    alpha = -a( j, k )
+                    call stdlib_zaxpy( m-j, alpha, a( j+1, k-1 ), 1,work( 2 ), 1 )
+                 endif
+                 ! find max(|work(2:m)|)
+                 i2 = stdlib_izamax( m-j, work( 2 ), 1 ) + 1
+                 piv = work( i2 )
+                 ! apply symmetric pivot
+                 if( (i2/=2) .and. (piv/=0) ) then
+                    ! swap work(i1) and work(i2)
+                    i1 = 2
+                    work( i2 ) = work( i1 )
+                    work( i1 ) = piv
+                    ! swap a(i1+1:m, i1) with a(i2, i1+1:m)
+                    i1 = i1+j-1
+                    i2 = i2+j-1
+                    call stdlib_zswap( i2-i1-1, a( i1+1, j1+i1-1 ), 1,a( i2, j1+i1 ), lda )
+                              
+                    ! swap a(i2+1:m, i1) with a(i2+1:m, i2)
+                    if( i2<m )call stdlib_zswap( m-i2, a( i2+1, j1+i1-1 ), 1,a( i2+1, j1+i2-1 ), &
+                              1 )
+                    ! swap a(i1, i1) with a(i2, i2)
+                    piv = a( i1, j1+i1-1 )
+                    a( i1, j1+i1-1 ) = a( i2, j1+i2-1 )
+                    a( i2, j1+i2-1 ) = piv
+                    ! swap h(i1, i1:j1) with h(i2, i2:j1)
+                    call stdlib_zswap( i1-1, h( i1, 1 ), ldh, h( i2, 1 ), ldh )
+                    ipiv( i1 ) = i2
+                    if( i1>(k1-1) ) then
+                       ! swap l(1:i1-1, i1) with l(1:i1-1, i2),
+                        ! skipping the first column
+                       call stdlib_zswap( i1-k1+1, a( i1, 1 ), lda,a( i2, 1 ), lda )
+                    end if
+                 else
+                    ipiv( j+1 ) = j+1
+                 endif
+                 ! set a(j+1, j) = t(j+1, j)
+                 a( j+1, k ) = work( 2 )
+                 if( j<nb ) then
+                    ! copy a(j+1:m, j+1) into h(j+1:m, j),
+                    call stdlib_zcopy( m-j, a( j+1, k+1 ), 1,h( j+1, j+1 ), 1 )
+                 end if
+                 ! compute l(j+2, j+1) = work( 3:m ) / t(j, j+1),
+                  ! where a(j, j+1) = t(j, j+1) and a(j+2:m, j) = l(j+2:m, j+1)
+                 if( j<(m-1) ) then
+                    if( a( j+1, k )/=czero ) then
+                       alpha = cone / a( j+1, k )
+                       call stdlib_zcopy( m-j-1, work( 3 ), 1, a( j+2, k ), 1 )
+                       call stdlib_zscal( m-j-1, alpha, a( j+2, k ), 1 )
+                    else
+                       call stdlib_zlaset( 'FULL', m-j-1, 1, czero, czero,a( j+2, k ), lda )
+                                 
+                    end if
+                 end if
+              end if
+              j = j + 1
+              go to 30
+              40 continue
+           end if
+           return
+     end subroutine stdlib_zlasyf_aa
+
+     !> ZSYSV computes the solution to a complex system of linear equations
+     !> A * X = B,
+     !> where A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+     !> matrices.
+     !> Aasen's algorithm is used to factor A as
+     !> A = U**T * T * U,  if UPLO = 'U', or
+     !> A = L * T * L**T,  if UPLO = 'L',
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is symmetric tridiagonal. The factored
+     !> form of A is then used to solve the system of equations A * X = B.
+
+     pure subroutine stdlib_zsysv_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+               
+        ! -- lapack driver routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(out) :: info
+           integer(ilp), intent(in) :: lda, ldb, lwork, n, nrhs
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*), b(ldb,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           ! Local Scalars 
+           logical(lk) :: lquery
+           integer(ilp) :: lwkopt, lwkopt_sytrf, lwkopt_sytrs
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! test the input parameters.
+           info = 0
+           lquery = ( lwork==-1 )
+           if( .not.stdlib_lsame( uplo, 'U' ) .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( nrhs<0 ) then
+              info = -3
+           else if( lda<max( 1, n ) ) then
+              info = -5
+           else if( ldb<max( 1, n ) ) then
+              info = -8
+           else if( lwork<max(2*n, 3*n-2) .and. .not.lquery ) then
+              info = -10
+           end if
+           if( info==0 ) then
+              call stdlib_zsytrf_aa( uplo, n, a, lda, ipiv, work, -1, info )
+              lwkopt_sytrf = int( work(1),KIND=ilp)
+              call stdlib_zsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,-1, info )
+              lwkopt_sytrs = int( work(1),KIND=ilp)
+              lwkopt = max( lwkopt_sytrf, lwkopt_sytrs )
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYSV_AA ', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! compute the factorization a = u**t*t*u or a = l*t*l**t.
+           call stdlib_zsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info )
+           if( info==0 ) then
+              ! solve the system a*x = b, overwriting b with x.
+              call stdlib_zsytrs_aa( uplo, n, nrhs, a, lda, ipiv, b, ldb, work,lwork, info )
+                        
+           end if
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsysv_aa
+
+     !> ZSYTRF_AA: computes the factorization of a complex symmetric matrix A
+     !> using the Aasen's algorithm.  The form of the factorization is
+     !> A = U**T*T*U  or  A = L*T*L**T
+     !> where U (or L) is a product of permutation and unit upper (lower)
+     !> triangular matrices, and T is a complex symmetric tridiagonal matrix.
+     !> This is the blocked version of the algorithm, calling Level 3 BLAS.
+
+     pure subroutine stdlib_zsytrf_aa( uplo, n, a, lda, ipiv, work, lwork, info)
+        ! -- lapack computational routine --
+        ! -- lapack is a software package provided by univ. of tennessee,    --
+        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
+           ! Scalar Arguments 
+           character, intent(in) :: uplo
+           integer(ilp), intent(in) :: n, lda, lwork
+           integer(ilp), intent(out) :: info
+           ! Array Arguments 
+           integer(ilp), intent(out) :: ipiv(*)
+           complex(dp), intent(inout) :: a(lda,*)
+           complex(dp), intent(out) :: work(*)
+        ! =====================================================================
+           
+           ! Local Scalars 
+           logical(lk) :: lquery, upper
+           integer(ilp) :: j, lwkopt
+           integer(ilp) :: nb, mj, nj, k1, k2, j1, j2, j3, jb
+           complex(dp) :: alpha
+           ! Intrinsic Functions 
+           intrinsic :: max
+           ! Executable Statements 
+           ! determine the block size
+           nb = stdlib_ilaenv( 1, 'ZSYTRF_AA', uplo, n, -1, -1, -1 )
+           ! test the input parameters.
+           info = 0
+           upper = stdlib_lsame( uplo, 'U' )
+           lquery = ( lwork==-1 )
+           if( .not.upper .and. .not.stdlib_lsame( uplo, 'L' ) ) then
+              info = -1
+           else if( n<0 ) then
+              info = -2
+           else if( lda<max( 1, n ) ) then
+              info = -4
+           else if( lwork<max( 1, 2*n ) .and. .not.lquery ) then
+              info = -7
+           end if
+           if( info==0 ) then
+              lwkopt = (nb+1)*n
+              work( 1 ) = lwkopt
+           end if
+           if( info/=0 ) then
+              call stdlib_xerbla( 'ZSYTRF_AA', -info )
+              return
+           else if( lquery ) then
+              return
+           end if
+           ! quick return
+           if ( n==0 ) then
+               return
+           endif
+           ipiv( 1 ) = 1
+           if ( n==1 ) then
+              return
+           end if
+           ! adjust block size based on the workspace size
+           if( lwork<((1+nb)*n) ) then
+              nb = ( lwork-n ) / n
+           end if
+           if( upper ) then
+              ! .....................................................
+              ! factorize a as u**t*d*u using the upper triangle of a
+              ! .....................................................
+              ! copy first row a(1, 1:n) into h(1:n) (stored in work(1:n))
+              call stdlib_zcopy( n, a( 1, 1 ), lda, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_zlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              10 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j + 1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_zlasyf_aa( uplo, 2-k1, n-j, jb,a( max(1, j), j+1 ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_zswap( j1-k1-2, a( 1, j2 ), 1,a( 1, ipiv(j2) ), 1 )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+               ! the row a(j1-1, j2-1:n) stores u(j1, j2+1:n) and
+               ! work stores the current block of the auxiriarly matrix h
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j, j+1 )
+                    a( j, j+1 ) = cone
+                    call stdlib_zcopy( n-j, a( j-1, j+1 ), lda,work( (j+1-j1+1)+jb*n ), 1 )
+                              
+                    call stdlib_zscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_zgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_zgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j1-k2, j3 ), 1,cone, a( j3, j3 ), lda )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block of j2-th block row with stdlib_zgemm
+                       call stdlib_zgemm( 'TRANSPOSE', 'TRANSPOSE',nj, n-j3+1, jb+1,-cone, a( j1-&
+                                 k2, j2 ), lda,work( j3-j1+1+k1*n ), n,cone, a( j2, j3 ), lda )
+                    end do
+                    ! recover t( j, j+1 )
+                    a( j, j+1 ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_zcopy( n-j, a( j+1, j+1 ), lda, work( 1 ), 1 )
+              end if
+              go to 10
+           else
+              ! .....................................................
+              ! factorize a as l*d*l**t using the lower triangle of a
+              ! .....................................................
+              ! copy first column a(1:n, 1) into h(1:n, 1)
+               ! (stored in work(1:n))
+              call stdlib_zcopy( n, a( 1, 1 ), 1, work( 1 ), 1 )
+              ! j is the main loop index, increasing from 1 to n in steps of
+              ! jb, where jb is the number of columns factorized by stdlib_zlasyf;
+              ! jb is either nb, or n-j+1 for the last block
+              j = 0
+              11 continue
+              if( j>=n )go to 20
+              ! each step of the main loop
+               ! j is the last column of the previous panel
+               ! j1 is the first column of the current panel
+               ! k1 identifies if the previous column of the panel has been
+                ! explicitly stored, e.g., k1=1 for the first panel, and
+                ! k1=0 for the rest
+              j1 = j+1
+              jb = min( n-j1+1, nb )
+              k1 = max(1, j)-j
+              ! panel factorization
+              call stdlib_zlasyf_aa( uplo, 2-k1, n-j, jb,a( j+1, max(1, j) ), lda,ipiv( j+1 ), &
+                        work, n, work( n*nb+1 ) )
+              ! adjust ipiv and apply it back (j-th step picks (j+1)-th pivot)
+              do j2 = j+2, min(n, j+jb+1)
+                 ipiv( j2 ) = ipiv( j2 ) + j
+                 if( (j2/=ipiv(j2)) .and. ((j1-k1)>2) ) then
+                    call stdlib_zswap( j1-k1-2, a( j2, 1 ), lda,a( ipiv(j2), 1 ), lda )
+                 end if
+              end do
+              j = j + jb
+              ! trailing submatrix update, where
+                ! a(j2+1, j1-1) stores l(j2+1, j1) and
+                ! work(j2+1, 1) stores h(j2+1, 1)
+              if( j<n ) then
+                 ! if first panel and jb=1 (nb=1), then nothing to do
+                 if( j1>1 .or. jb>1 ) then
+                    ! merge rank-1 update with blas-3 update
+                    alpha = a( j+1, j )
+                    a( j+1, j ) = cone
+                    call stdlib_zcopy( n-j, a( j+1, j-1 ), 1,work( (j+1-j1+1)+jb*n ), 1 )
+                    call stdlib_zscal( n-j, alpha, work( (j+1-j1+1)+jb*n ), 1 )
+                    ! k1 identifies if the previous column of the panel has been
+                     ! explicitly stored, e.g., k1=1 and k2= 0 for the first panel,
+                     ! while k1=0 and k2=1 for the rest
+                    if( j1>1 ) then
+                       ! not first panel
+                       k2 = 1
+                    else
+                       ! first panel
+                       k2 = 0
+                       ! first update skips the first column
+                       jb = jb - 1
+                    end if
+                    do j2 = j+1, n, nb
+                       nj = min( nb, n-j2+1 )
+                       ! update (j2, j2) diagonal block with stdlib_zgemv
+                       j3 = j2
+                       do mj = nj-1, 1, -1
+                          call stdlib_zgemv( 'NO TRANSPOSE', mj, jb+1,-cone, work( j3-j1+1+k1*n ),&
+                                     n,a( j3, j1-k2 ), lda,cone, a( j3, j3 ), 1 )
+                          j3 = j3 + 1
+                       end do
+                       ! update off-diagonal block in j2-th block column with stdlib_zgemm
+                       call stdlib_zgemm( 'NO TRANSPOSE', 'TRANSPOSE',n-j3+1, nj, jb+1,-cone, &
+                       work( j3-j1+1+k1*n ), n,a( j2, j1-k2 ), lda,cone, a( j3, j2 ), lda )
+                                 
+                    end do
+                    ! recover t( j+1, j )
+                    a( j+1, j ) = alpha
+                 end if
+                 ! work(j+1, 1) stores h(j+1, 1)
+                 call stdlib_zcopy( n-j, a( j+1, j+1 ), 1, work( 1 ), 1 )
+              end if
+              go to 11
+           end if
+           20 continue
+           work( 1 ) = lwkopt
+           return
+     end subroutine stdlib_zsytrf_aa
+
+
+
+end module stdlib_linalg_lapack_z
diff --git a/test/linalg/CMakeLists.txt b/test/linalg/CMakeLists.txt
index 4a315f545..3d590a9d2 100644
--- a/test/linalg/CMakeLists.txt
+++ b/test/linalg/CMakeLists.txt
@@ -1,9 +1,11 @@
 set(
   fppFiles
   "test_linalg.fypp"
+  "test_blas_lapack.fypp"
   "test_linalg_matrix_property_checks.fypp"
 )
 fypp_f90("${fyppFlags}" "${fppFiles}" outFiles)
 
 ADDTEST(linalg)
 ADDTEST(linalg_matrix_property_checks)
+ADDTEST(blas_lapack)
diff --git a/test/linalg/test_blas_lapack.fypp b/test/linalg/test_blas_lapack.fypp
new file mode 100644
index 000000000..e36ac2717
--- /dev/null
+++ b/test/linalg/test_blas_lapack.fypp
@@ -0,0 +1,147 @@
+#:include "common.fypp"
+#:set RC_KINDS_TYPES = REAL_KINDS_TYPES + CMPLX_KINDS_TYPES
+
+module test_blas_lapack
+    use testdrive, only : new_unittest, unittest_type, error_type, check, skip_test
+    use stdlib_kinds, only: sp, dp, xdp, qp, int8, int16, int32, int64
+    use stdlib_linalg, only: eye
+    use stdlib_linalg_blas
+    use stdlib_linalg_lapack
+
+    implicit none
+
+    real(sp), parameter :: sptol = 1000 * epsilon(1._sp)
+    real(dp), parameter :: dptol = 1000 * epsilon(1._dp)
+#:if WITH_QP
+    real(qp), parameter :: qptol = 1000 * epsilon(1._qp)
+#:endif
+
+
+
+contains
+
+    !> Collect all exported unit tests
+    subroutine collect_blas_lapack(testsuite)
+        !> Collection of tests
+        type(unittest_type), allocatable, intent(out) :: testsuite(:)
+
+        testsuite = [ &
+  #:for k1, t1 in REAL_KINDS_TYPES
+            new_unittest("test_gemv${t1[0]}$${k1}$", test_gemv${t1[0]}$${k1}$), &
+            new_unittest("test_getri${t1[0]}$${k1}$", test_getri${t1[0]}$${k1}$), &
+  #:endfor
+            new_unittest("test_idamax", test_idamax) &
+            ]
+
+    end subroutine collect_blas_lapack
+
+
+  #:for k1, t1 in REAL_KINDS_TYPES
+    subroutine test_gemv${t1[0]}$${k1}$(error)
+        !> Error handling
+        type(error_type), allocatable, intent(out) :: error
+
+#:if k1=="xdp"
+        call skip_test(error, "Extended precision is not enabled")
+#:else
+        ${t1}$ :: A(3,3),x(3),y(3),ylap(3),yintr(3),alpha,beta
+        call random_number(alpha)
+        call random_number(beta)
+        call random_number(A)
+        call random_number(x)
+        call random_number(y)
+        ylap = y
+        call gemv('No transpose',size(A,1),size(A,2),alpha,A,size(A,1),x,1,beta,ylap,1)
+        yintr = alpha*matmul(A,x)+beta*y
+
+        call check(error, sum(abs(ylap - yintr)) < sptol, &
+            "blas vs. intrinsics axpy: sum() < sptol failed")
+        if (allocated(error)) return
+#:endif
+    end subroutine test_gemv${t1[0]}$${k1}$
+
+    ! Find matrix inverse from LU decomposition
+    subroutine test_getri${t1[0]}$${k1}$(error)
+        !> Error handling
+        type(error_type), allocatable, intent(out) :: error
+
+#:if k1=="xdp"
+        call skip_test(error, "Extended precision is not enabled")
+#:else
+
+        integer(ilp), parameter :: n = 3
+        ${t1}$ :: A(n,n)
+        ${t1}$,allocatable :: work(:)
+        integer(ilp) :: ipiv(n),info,lwork,nb
+
+
+        A = eye(n)
+
+        ! Factorize matrix (overwrite result)
+        call getrf(size(A,1),size(A,2),A,size(A,1),ipiv,info)
+        call check(error, info==0, "lapack getrf returned info/=0")
+        if (allocated(error)) return
+
+        ! Get optimal worksize (returned in work(1)) (apply 2% safety parameter)
+        nb = stdlib_ilaenv(1,'${t1[0]}$getri',' ',n,-1,-1,-1)
+        lwork = nint(1.02*n*nb,kind=ilp)
+        allocate (work(lwork))
+
+        ! Invert matrix
+        call getri(n,a,n,ipiv,work,lwork,info)
+
+        call check(error, info==0, "lapack getri returned info/=0")
+        if (allocated(error)) return
+
+        call check(error, sum(abs(A - eye(3))) < sptol, &
+            "lapack eye inversion: tolerance check failed")
+        if (allocated(error)) return
+#:endif
+    end subroutine test_getri${t1[0]}$${k1}$
+  #:endfor
+
+    ! Return
+    subroutine test_idamax(error)
+        !> Error handling
+        type(error_type), allocatable, intent(out) :: error
+
+        integer(ilp), parameter :: n = 5
+        integer(ilp) :: imax
+        real(dp) :: x(n)
+
+        x = [1,2,3,4,5]
+
+        imax = stdlib_idamax(n,x,1)
+
+        call check(error, imax==5, "blas idamax returned wrong location")
+
+    end subroutine test_idamax
+
+end module test_blas_lapack
+
+
+program tester
+    use, intrinsic :: iso_fortran_env, only : error_unit
+    use testdrive, only : run_testsuite, new_testsuite, testsuite_type
+    use test_blas_lapack, only : collect_blas_lapack
+    implicit none
+    integer :: stat, is
+    type(testsuite_type), allocatable :: testsuites(:)
+    character(len=*), parameter :: fmt = '("#", *(1x, a))'
+
+    stat = 0
+    testsuites = [ &
+        new_testsuite("blas_lapack", collect_blas_lapack) &
+        ]
+
+    do is = 1, size(testsuites)
+        write(error_unit, fmt) "Testing:", testsuites(is)%name
+        call run_testsuite(testsuites(is)%collect, error_unit, stat)
+    end do
+
+    if (stat > 0) then
+        write(error_unit, '(i0, 1x, a)') stat, "test(s) failed!"
+        error stop
+    end if
+end program
+
diff --git a/test/stats/array3.dat b/test/stats/array3.dat
deleted file mode 100644
index 13b583f89..000000000
--- a/test/stats/array3.dat
+++ /dev/null
@@ -1,16 +0,0 @@
-1.000000000000000021e-08 9.199998759392489944e+01
-1.024113254885563425e-08 9.199998731474968849e+01
-1.048233721895820948e-08 9.199998703587728244e+01
-1.072361403187881949e-08 9.199998675729767683e+01
-1.096496300919481796e-08 9.199998647900135040e+01
-1.120638417249036630e-08 9.199998620097916557e+01
-1.144787754335570897e-08 9.199998592322251056e+01
-1.168944314338753750e-08 9.199998564572304360e+01
-1.193108099418952317e-08 9.199998536847290609e+01
-1.217279111737088596e-08 9.199998509146449521e+01
-1.241457353454836993e-08 9.199998481469057765e+01
-1.265642826734443823e-08 9.199998453814424693e+01
-1.289835533738818635e-08 9.199998426181879552e+01
-1.314035476631514857e-08 9.199998398570787117e+01
-1.338242657576766519e-08 9.199998370980536322e+01
-1.362457078739434161e-08 9.199998343410533153e+01